1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67// TODO: This option should be removed once we switch to always using PTRADD in
68// the SelectionDAG.
70 "amdgpu-use-sdag-ptradd", cl::Hidden,
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
72 "SelectionDAG ISel"),
73 cl::init(false));
74
75 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
76 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
77 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
78}
79
80 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
81 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
82 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
83}
84
85static unsigned findFirstFreeSGPR(CCState &CCInfo) {
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
88 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
89 return AMDGPU::SGPR0 + Reg;
90 }
91 }
92 llvm_unreachable("Cannot allocate sgpr");
93}
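// Illustrative usage sketch (added commentary, not part of the upstream file):
// a caller that needs an extra system SGPR would typically take the first free
// one and mark it allocated so later queries skip it. The surrounding context
// (CCInfo) is assumed here, not taken from this file.
#if 0
  unsigned Reg = findFirstFreeSGPR(CCInfo);
  CCInfo.AllocateReg(Reg); // Reserve it in the calling-convention state.
#endif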
94
96 const GCNSubtarget &STI)
97 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
98 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
99 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
100
101 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
102 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
103
104 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
105
106 const SIRegisterInfo *TRI = STI.getRegisterInfo();
107 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
108
109 addRegisterClass(MVT::f64, V64RegClass);
110 addRegisterClass(MVT::v2f32, V64RegClass);
111 addRegisterClass(MVT::Untyped, V64RegClass);
112
113 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
114 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
115
116 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
118
119 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
120 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
121
122 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
123 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
124
125 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
129 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
130
131 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
132 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
138 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
139
140 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
141 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
142
143 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
144 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
156 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
157
158 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
159 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
160
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
163 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
165 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
166 } else {
167 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
170 }
171
172 // Unless there are also VOP3P operations, no operations on these types are really legal.
173 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
187 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
188 }
189
190 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
191 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
192
193 computeRegisterProperties(Subtarget->getRegisterInfo());
194
195 // The boolean content concept here is too inflexible. Compares only ever
196 // really produce a 1-bit result. Any copy/extend from these will turn into a
197 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
198 // it's what most targets use.
201
202 // We need to custom lower vector stores from local memory
203 setOperationAction(ISD::LOAD,
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
208 Custom);
209
210 setOperationAction(ISD::STORE,
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
217 if (isTypeLegal(MVT::bf16)) {
218 for (unsigned Opc :
220 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
226 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
227 ISD::SETCC}) {
228 // FIXME: The promoted-to type shouldn't need to be explicit
229 setOperationAction(Opc, MVT::bf16, Promote);
230 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
231 }
232
234
236 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
237
238 setOperationAction(ISD::FABS, MVT::bf16, Legal);
239 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
241
242 // We only need to custom lower because we can't specify an action for bf16
243 // sources.
246 }
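// Illustrative sketch (added commentary, not upstream code): for a bf16
// operation promoted to f32 above, the legalizer conceptually rewrites
//   fadd bfloat %a, %b
// into
//   %a32 = fpext bfloat %a to float
//   %b32 = fpext bfloat %b to float
//   %r32 = fadd float %a32, %b32
//   %r   = fptrunc float %r32 to bfloat
// so only the f32 form needs native support.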
247
248 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
250 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
251 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
252 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
253 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
254 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
259 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
267 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
271 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
272
273 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
274
278 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
279
280 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
281
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
284
286 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
287 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
288
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
293 Expand);
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
298 Expand);
299
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
303 Custom);
304
305 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
306 setOperationAction(ISD::BR_CC,
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
308
310
312
314 Expand);
315
316#if 0
318#endif
319
320 // We only support LOAD/STORE and vector manipulation ops for vectors
321 // with > 4 elements.
322 for (MVT VT :
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
331 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
332 switch (Op) {
333 case ISD::LOAD:
334 case ISD::STORE:
336 case ISD::BITCAST:
337 case ISD::UNDEF:
341 case ISD::IS_FPCLASS:
342 break;
347 break;
348 default:
350 break;
351 }
352 }
353 }
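// Illustrative note (added commentary): for these wide vector types, any
// operation not kept legal by the switch above is expanded by the legalizer.
// For example, an `add <8 x i32>` with no native wide-vector form is
// conceptually unrolled into eight scalar i32 adds, one per element.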
354
355 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
356
357 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
358 // is expanded to avoid having two separate loops in case the index is a VGPR.
359
360 // Most operations are naturally 32-bit vector operations. We only support
361 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
364 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
374 }
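// Illustrative sketch (added commentary): the promotions above mean a v2i64
// element access is performed on the v4i32 view of the same registers. On this
// little-endian target, element 1 of the v2i64 corresponds to elements 2 and 3
// of the v4i32 view, roughly:
//   %v32 = bitcast <2 x i64> %v to <4 x i32>
//   %lo  = extractelement <4 x i32> %v32, i32 2
//   %hi  = extractelement <4 x i32> %v32, i32 3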
375
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
378 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
388 }
389
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
392 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
402 }
403
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
406 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
416 }
417
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
420 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
430 }
431
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
435 Custom);
436
437 if (Subtarget->hasPkMovB32()) {
438 // TODO: 16-bit element vectors should be legal with even aligned elements.
439 // TODO: Can be legal with wider source types than the result with
440 // subregister extracts.
441 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
442 }
443
445 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
446 // instead lower to cndmask in SITargetLowering::LowerSELECT().
448 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
449 // alignbit.
450 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
451
452 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
453 Custom);
454
455 // Avoid stack access for these.
456 // TODO: Generalize to more vector types.
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
460 Custom);
461
462 // Deal with vec3 vector operations when widened to vec4.
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
465
466 // Deal with vec5/6/7 vector operations when widened to vec8.
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
472 Custom);
473
474 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
475 // and output demarshalling
476 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
477
478 // We can't return success/failure, only the old value,
479 // let LLVM add the comparison
480 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
481 Expand);
482
483 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
484
485 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
486
487 // FIXME: This should be narrowed to i32, but that only happens if i64 is
488 // illegal.
489 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
490 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
491
492 // This is s_memtime on SI and s_memrealtime on VI.
493 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
494
495 if (Subtarget->hasSMemRealTime() ||
496 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
497 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
498 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
499
500 if (Subtarget->has16BitInsts()) {
501 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
502 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
503 } else {
504 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
505 }
506
507 if (Subtarget->hasMadMacF32Insts())
509
510 if (!Subtarget->hasBFI())
511 // fcopysign can be done in a single instruction with BFI.
512 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
513
514 if (!Subtarget->hasBCNT(32))
516
517 if (!Subtarget->hasBCNT(64))
519
520 if (Subtarget->hasFFBH())
522
523 if (Subtarget->hasFFBL())
525
526 // We only really have 32-bit BFE instructions (and 16-bit on VI).
527 //
528 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
529 // effort to match them now. We want this to be false for i64 cases when the
530 // extraction isn't restricted to the upper or lower half. Ideally we would
531 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
532 // span the midpoint are probably relatively rare, so don't worry about them
533 // for now.
534 if (Subtarget->hasBFE())
536
537 // Clamp modifier on add/sub
538 if (Subtarget->hasIntClamp())
540
541 if (Subtarget->hasAddNoCarry())
542 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
543 Legal);
544
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
548
549 // These are really only legal for ieee_mode functions. We should be avoiding
550 // them for functions that don't have ieee_mode enabled, so just say they are
551 // legal.
552 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
553 {MVT::f32, MVT::f64}, Legal);
554
555 if (Subtarget->haveRoundOpsF64())
556 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
557 Legal);
558 else
559 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
560 MVT::f64, Custom);
561
562 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
563 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
564 Legal);
565 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
566
567 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
569
570 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
576 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
594 setOperationAction(ISD::LOAD, MVT::i16, Custom);
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
598 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
614 setOperationAction(ISD::LOAD, MVT::f16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 setOperationAction(ISD::STORE, MVT::f16, Promote);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
620 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 setOperationAction(ISD::STORE, MVT::bf16, Promote);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
627 ISD::FSIN, ISD::FROUND},
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
632 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
633
636
637 // F16 - VOP2 Actions.
638 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
639 Expand);
640 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
641 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
643
644 // F16 - VOP3 Actions.
646 if (STI.hasMadF16())
648
649 for (MVT VT :
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
653 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
654 switch (Op) {
655 case ISD::LOAD:
656 case ISD::STORE:
658 case ISD::BITCAST:
659 case ISD::UNDEF:
664 case ISD::IS_FPCLASS:
665 break;
669 break;
670 default:
672 break;
673 }
674 }
675 }
676
677 // v_perm_b32 can handle either of these.
678 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
680
681 // XXX - Do these do anything? Vector constants turn into build_vector.
682 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
683
684 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
685 Legal);
686
687 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
690 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
691
692 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::AND, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::OR, MVT::v2i16, Promote);
700 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
701 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
702 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
703
704 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
705 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
706 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
708 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
710
711 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
712 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
713 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
715 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
717
718 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
719 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
720 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
722 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
724
725 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
727 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
729
730 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
732 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
734 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
736
737 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
738 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
739 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
740 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
741 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
743
744 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
745 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
746 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
747 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
748 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
749 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
750
751 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
752 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
753 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
755 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
756 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
757
758 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
759 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
760 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
761 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
762 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
763 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
764
766 MVT::v2i32, Expand);
767 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
768
770 MVT::v4i32, Expand);
771
773 MVT::v8i32, Expand);
774
775 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
776 Subtarget->hasVOP3PInsts() ? Legal : Custom);
777
778 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
779 // This isn't really legal, but this avoids the legalizer unrolling it (and
780 // allows matching fneg (fabs x) patterns)
781 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
782
783 // Can do this in one BFI plus a constant materialize.
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
788 Custom);
789
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
792 MVT::f16, Custom);
793 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
794
795 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
796 ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
798 Custom);
799
800 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
802 Expand);
803
804 for (MVT Vec16 :
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
809 Vec16, Custom);
811 }
812 }
813
814 if (Subtarget->hasVOP3PInsts()) {
818 MVT::v2i16, Legal);
819
820 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
821 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
822 MVT::v2f16, Legal);
823
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
826
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
831 Custom);
832
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
834 // Split vector operations.
839 VT, Custom);
840
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
842 // Split vector operations.
844 VT, Custom);
845
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
849
850 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
851 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852 Custom);
853
854 if (Subtarget->hasPackedFP32Ops()) {
856 MVT::v2f32, Legal);
858 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
859 Custom);
860 }
861 }
862
863 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
864
865 if (Subtarget->has16BitInsts()) {
867 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
869 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
870 } else {
871 // Legalization hack.
872 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
873
874 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
875 }
876
878 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
879 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
880 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
881 MVT::v32f16, MVT::v32bf16},
882 Custom);
883
885
886 if (Subtarget->hasVectorMulU64())
888 else if (Subtarget->hasScalarSMulU64())
890
891 if (Subtarget->hasMad64_32())
893
894 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
896
897 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
899 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
900 } else {
901 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
902 if (Subtarget->hasMinimum3Maximum3F32())
903 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
904
905 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
907
908 // If only the vector form is available, we need to widen to a vector.
909 if (!Subtarget->hasMinimum3Maximum3F16())
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
911 }
912 }
913
914 if (Subtarget->hasVOP3PInsts()) {
915 // We want to break these into v2f16 pieces, not scalarize.
916 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
917 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
918 Custom);
919 }
920
921 if (Subtarget->hasIntMinMax64())
923 Legal);
924
926 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
927 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
928 MVT::i8},
929 Custom);
930
932 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
933 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
934 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
935 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
936 Custom);
937
939 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
940 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
941 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
942 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
943 Custom);
944
945 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
947 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
948 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
949 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
950
951 // TODO: Could move this to custom lowering, could benefit from combines on
952 // extract of relevant bits.
953 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
954
956
957 if (Subtarget->hasBF16ConversionInsts()) {
958 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
960 }
961
962 if (Subtarget->hasBF16PackedInsts()) {
964 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
965 MVT::v2bf16, Legal);
966 }
967
968 if (Subtarget->hasBF16TransInsts()) {
969 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
970 }
971
972 if (Subtarget->hasCvtPkF16F32Inst()) {
974 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
975 Custom);
976 }
977
979 ISD::PTRADD,
981 ISD::SUB,
983 ISD::MUL,
984 ISD::FADD,
985 ISD::FSUB,
986 ISD::FDIV,
987 ISD::FMUL,
988 ISD::FMINNUM,
989 ISD::FMAXNUM,
990 ISD::FMINNUM_IEEE,
991 ISD::FMAXNUM_IEEE,
992 ISD::FMINIMUM,
993 ISD::FMAXIMUM,
994 ISD::FMINIMUMNUM,
995 ISD::FMAXIMUMNUM,
996 ISD::FMA,
997 ISD::SMIN,
998 ISD::SMAX,
999 ISD::UMIN,
1000 ISD::UMAX,
1001 ISD::SETCC,
1003 ISD::SMIN,
1004 ISD::SMAX,
1005 ISD::UMIN,
1006 ISD::UMAX,
1007 ISD::AND,
1008 ISD::OR,
1009 ISD::XOR,
1010 ISD::SHL,
1011 ISD::SRL,
1012 ISD::SRA,
1013 ISD::FSHR,
1023
1024 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1026
1027 // All memory operations. Some folding on the pointer operand is done to help
1028 // matching the constant offsets in the addressing modes.
1029 setTargetDAGCombine({ISD::LOAD,
1030 ISD::STORE,
1031 ISD::ATOMIC_LOAD,
1032 ISD::ATOMIC_STORE,
1033 ISD::ATOMIC_CMP_SWAP,
1034 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_SWAP,
1036 ISD::ATOMIC_LOAD_ADD,
1037 ISD::ATOMIC_LOAD_SUB,
1038 ISD::ATOMIC_LOAD_AND,
1039 ISD::ATOMIC_LOAD_OR,
1040 ISD::ATOMIC_LOAD_XOR,
1041 ISD::ATOMIC_LOAD_NAND,
1042 ISD::ATOMIC_LOAD_MIN,
1043 ISD::ATOMIC_LOAD_MAX,
1044 ISD::ATOMIC_LOAD_UMIN,
1045 ISD::ATOMIC_LOAD_UMAX,
1046 ISD::ATOMIC_LOAD_FADD,
1047 ISD::ATOMIC_LOAD_FMIN,
1048 ISD::ATOMIC_LOAD_FMAX,
1049 ISD::ATOMIC_LOAD_UINC_WRAP,
1050 ISD::ATOMIC_LOAD_UDEC_WRAP,
1053
1054 // FIXME: In other contexts we pretend this is a per-function property.
1056
1058}
1059
1060const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1061
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1064 return RCRegs;
1065}
1066
1067//===----------------------------------------------------------------------===//
1068// TargetLowering queries
1069//===----------------------------------------------------------------------===//
1070
1071// v_mad_mix* support a conversion from f16 to f32.
1072//
1073// There is only one special case where this would also be OK to use with
1074// denormals enabled, but we don't currently handle it.
1075bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1076 EVT DestVT, EVT SrcVT) const {
1077 return DestVT.getScalarType() == MVT::f32 &&
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 SrcVT.getScalarType() == MVT::f16) ||
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1082 SrcVT.getScalarType() == MVT::bf16)) &&
1083 // TODO: This probably only requires no input flushing?
1085}
1086
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 DestTy.getScalarSizeInBits() == 32 &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1093 // TODO: This probably only requires no input flushing?
1094 denormalModeIsFlushAllF32(*MI.getMF());
1095}
1096
1098 // SI has some legal vector types, but no legal vector operations. Say no
1099 // shuffles are legal in order to prefer scalarizing some vector operations.
1100 return false;
1101}
1102
1104 CallingConv::ID CC,
1105 EVT VT) const {
1107 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1108
1109 if (VT.isVector()) {
1110 EVT ScalarVT = VT.getScalarType();
1111 unsigned Size = ScalarVT.getSizeInBits();
1112 if (Size == 16) {
1113 if (Subtarget->has16BitInsts()) {
1114 if (VT.isInteger())
1115 return MVT::v2i16;
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 }
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1119 }
1120
1121 if (Size < 16)
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1123 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1124 }
1125
1126 if (VT.getSizeInBits() > 32)
1127 return MVT::i32;
1128
1129 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1130}
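// Illustrative example (added commentary, not upstream): for a subtarget with
// 16-bit instructions and a non-kernel calling convention, a v4f16 value uses
// the packed v2f16 register type, while bf16 vectors fall back to i32. The
// query below is a sketch; the calling convention and type are arbitrary
// examples.
#if 0
  MVT RegVT = TLI.getRegisterTypeForCallingConv(
      Ctx, CallingConv::AMDGPU_PS, EVT(MVT::v4f16)); // RegVT == MVT::v2f16
#endif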
1131
1133 CallingConv::ID CC,
1134 EVT VT) const {
1136 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1137
1138 if (VT.isVector()) {
1139 unsigned NumElts = VT.getVectorNumElements();
1140 EVT ScalarVT = VT.getScalarType();
1141 unsigned Size = ScalarVT.getSizeInBits();
1142
1143 // FIXME: Should probably promote 8-bit vectors to i16.
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1146
1147 if (Size <= 32)
1148 return NumElts;
1149
1150 if (Size > 32)
1151 return NumElts * ((Size + 31) / 32);
1152 } else if (VT.getSizeInBits() > 32)
1153 return (VT.getSizeInBits() + 31) / 32;
1154
1155 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1156}
1157
1159 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
1161 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1162 unsigned NumElts = VT.getVectorNumElements();
1163 EVT ScalarVT = VT.getScalarType();
1164 unsigned Size = ScalarVT.getSizeInBits();
1165 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1166 // support, but unless we can properly handle 3-vectors, it will still be
1167 // inconsistent.
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1172 } else {
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1175 }
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size == 32) {
1181 RegisterVT = ScalarVT.getSimpleVT();
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size < 16 && Subtarget->has16BitInsts()) {
1188 // FIXME: Should probably form v2i16 pieces
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size != 16 && Size <= 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size > 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1207 }
1208 }
1209
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1212}
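// Worked example for the breakdown above (added commentary): with 16-bit
// instructions and a non-kernel calling convention, a v3f16 value splits into
// NumIntermediates = (3 + 1) / 2 = 2 pieces of IntermediateVT = v2f16 (two
// 32-bit registers), while a v3i64 value takes the Size > 32 path and becomes
// 3 * (64 / 32) = 6 i32 registers.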
1213
1215 const DataLayout &DL, Type *Ty,
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1218
1219 LLVMContext &Ctx = Ty->getContext();
1220 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1222 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1223 NumElts);
1224 }
1225
1226 return TLI.getValueType(DL, Ty);
1227}
1228
1229// Peek through TFE struct returns to only use the data size.
1231 const DataLayout &DL, Type *Ty,
1232 unsigned MaxNumLanes) {
1233 auto *ST = dyn_cast<StructType>(Ty);
1234 if (!ST)
1235 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1236
1237 // TFE intrinsics return an aggregate type.
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1240 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1241}
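// Example of the TFE peeling above (added commentary): an image/buffer load
// intrinsic declared to return { <4 x float>, i32 } (data plus TFE status
// word) reports a memVT of v4f32; if MaxNumLanes is 2, the data type is
// further trimmed to v2f32.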
1242
1243/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1244/// in-memory representation. This return value is a custom type because there
1245/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1246/// could cause issues during codegen, these address space 7 pointers will be
1247/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1248/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1249/// for cost modeling, to work. (This also sets us up decently for doing the
1250/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1252 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1258}
1259/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1260/// v8i32 when padding is added.
1261/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1262/// also v8i32 with padding.
1264 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1268 return MVT::v8i32;
1270}
1271
1272static unsigned getIntrMemWidth(unsigned IntrID) {
1273 switch (IntrID) {
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 return 8;
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 return 32;
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 return 64;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1295 return 128;
1296 default:
1297 llvm_unreachable("Unknown width");
1298 }
1299}
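// Illustrative use of the width table above (added commentary): callers build
// the memory type from the returned bit width, e.g. for
// amdgcn_global_load_async_to_lds_b64 this yields a 64-bit integer memVT:
#if 0
  Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
#endif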
1300
1301static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1303 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1304 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1305 switch (AtomicOrderingCABI(Ord)) {
1308 break;
1311 break;
1314 break;
1315 default:
1317 break;
1318 }
1319
1320 Info.flags =
1322 Info.flags |= MOCooperative;
1323
1324 MDNode *ScopeMD = cast<MDNode>(
1325 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1326 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1327 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1328}
1329
1331 const CallInst &CI,
1332 MachineFunction &MF,
1333 unsigned IntrID) const {
1334 Info.flags = MachineMemOperand::MONone;
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1336 Info.flags |= MachineMemOperand::MOInvariant;
1337 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1339 Info.flags |= getTargetMMOFlags(CI);
1340
1341 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1343 AttributeSet Attr =
1345 MemoryEffects ME = Attr.getMemoryEffects();
1346 if (ME.doesNotAccessMemory())
1347 return false;
1348
1349 // TODO: Should images get their own address space?
1350 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1351
1352 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1353 if (RsrcIntr->IsImage) {
1354 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1356 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1357 Info.align.reset();
1358 }
1359
1360 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1361 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1362 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1363 // We conservatively set the memory operand of a buffer intrinsic to the
1364 // base resource pointer, so that we can access alias information about
1365 // those pointers. Cases like "this points at the same value
1366 // but with a different offset" are handled in
1367 // areMemAccessesTriviallyDisjoint.
1368 Info.ptrVal = RsrcArg;
1369 }
1370
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1372 if (!IsSPrefetch) {
1373 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1374 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1375 Info.flags |= MachineMemOperand::MOVolatile;
1376 }
1377
1379 if (ME.onlyReadsMemory()) {
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1382
1383 if (!BaseOpcode->Gather4) {
1384 // If this isn't a gather, we may have excess loaded elements in the
1385 // IR type. Check the dmask for the real number of elements loaded.
1386 unsigned DMask =
1387 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1388 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1389 }
1390
1391 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1392 CI.getType(), MaxNumLanes);
1393 } else {
1394 Info.memVT =
1396 std::numeric_limits<unsigned>::max());
1397 }
1398
1399 // FIXME: What does alignment mean for an image?
1400 Info.opc = ISD::INTRINSIC_W_CHAIN;
1401 Info.flags |= MachineMemOperand::MOLoad;
1402 } else if (ME.onlyWritesMemory()) {
1403 Info.opc = ISD::INTRINSIC_VOID;
1404
1405 Type *DataTy = CI.getArgOperand(0)->getType();
1406 if (RsrcIntr->IsImage) {
1407 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1408 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1409 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1410 DMaskLanes);
1411 } else
1412 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1413
1414 Info.flags |= MachineMemOperand::MOStore;
1415 } else {
1416 // Atomic, NoReturn Sampler or prefetch
1417 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1419 Info.flags |=
1421
1422 if (!IsSPrefetch)
1423 Info.flags |= MachineMemOperand::MOStore;
1424
1425 switch (IntrID) {
1426 default:
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1428 // Fake memory access type for no return sampler intrinsics
1429 Info.memVT = MVT::i32;
1430 } else {
1431 // XXX - Should this be volatile without known ordering?
1432 Info.flags |= MachineMemOperand::MOVolatile;
1433 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1434 }
1435 break;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1449 Info.memVT =
1451 std::numeric_limits<unsigned>::max());
1452 Info.flags &= ~MachineMemOperand::MOStore;
1453 return true;
1454 }
1455 }
1456 }
1457 return true;
1458 }
1459
1460 switch (IntrID) {
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1463 Info.opc = ISD::INTRINSIC_W_CHAIN;
1464 Info.memVT = MVT::getVT(CI.getType());
1465 Info.ptrVal = CI.getOperand(0);
1466 Info.align.reset();
1468
1469 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1470 if (!Vol->isZero())
1471 Info.flags |= MachineMemOperand::MOVolatile;
1472
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1479 Info.ptrVal = nullptr;
1480 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1482 return true;
1483 }
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1486 Info.opc = ISD::INTRINSIC_W_CHAIN;
1487 Info.memVT = MVT::getVT(CI.getType());
1488 Info.ptrVal = CI.getOperand(0);
1489 Info.align.reset();
1491
1492 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1493 if (!Vol->isZero())
1494 Info.flags |= MachineMemOperand::MOVolatile;
1495
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1503 Info.memVT = MVT::getVT(CI.getType());
1504 Info.ptrVal = CI.getOperand(0);
1505 Info.memVT = MVT::i64;
1506 Info.size = 8;
1507 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_global_atomic_csub: {
1512 Info.opc = ISD::INTRINSIC_W_CHAIN;
1513 Info.memVT = MVT::getVT(CI.getType());
1514 Info.ptrVal = CI.getOperand(0);
1515 Info.align.reset();
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1522 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1523 Info.opc = ISD::INTRINSIC_W_CHAIN;
1524 Info.memVT =
1525 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1526 ? CI.getType()
1528 ->getElementType(0)); // XXX: what is correct VT?
1529
1530 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1531 Info.align.reset();
1532 Info.flags |=
1534 return true;
1535 }
1536 case Intrinsic::amdgcn_global_atomic_fmin_num:
1537 case Intrinsic::amdgcn_global_atomic_fmax_num:
1538 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1539 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1540 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1541 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1542 Info.opc = ISD::INTRINSIC_W_CHAIN;
1543 Info.memVT = MVT::getVT(CI.getType());
1544 Info.ptrVal = CI.getOperand(0);
1545 Info.align.reset();
1549 return true;
1550 }
1551 case Intrinsic::amdgcn_flat_load_monitor_b32:
1552 case Intrinsic::amdgcn_flat_load_monitor_b64:
1553 case Intrinsic::amdgcn_flat_load_monitor_b128:
1554 case Intrinsic::amdgcn_global_load_monitor_b32:
1555 case Intrinsic::amdgcn_global_load_monitor_b64:
1556 case Intrinsic::amdgcn_global_load_monitor_b128:
1557 case Intrinsic::amdgcn_cluster_load_b32:
1558 case Intrinsic::amdgcn_cluster_load_b64:
1559 case Intrinsic::amdgcn_cluster_load_b128:
1560 case Intrinsic::amdgcn_ds_load_tr6_b96:
1561 case Intrinsic::amdgcn_ds_load_tr4_b64:
1562 case Intrinsic::amdgcn_ds_load_tr8_b64:
1563 case Intrinsic::amdgcn_ds_load_tr16_b128:
1564 case Intrinsic::amdgcn_global_load_tr6_b96:
1565 case Intrinsic::amdgcn_global_load_tr4_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b64:
1567 case Intrinsic::amdgcn_global_load_tr_b128:
1568 case Intrinsic::amdgcn_ds_read_tr4_b64:
1569 case Intrinsic::amdgcn_ds_read_tr6_b96:
1570 case Intrinsic::amdgcn_ds_read_tr8_b64:
1571 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1572 Info.opc = ISD::INTRINSIC_W_CHAIN;
1573 Info.memVT = MVT::getVT(CI.getType());
1574 Info.ptrVal = CI.getOperand(0);
1575 Info.align.reset();
1576 Info.flags |= MachineMemOperand::MOLoad;
1577 return true;
1578 }
1579 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1582 Info.opc = ISD::INTRINSIC_W_CHAIN;
1583 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1584 Info.ptrVal = CI.getOperand(0);
1585 Info.align.reset();
1586 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1592 Info.opc = ISD::INTRINSIC_VOID;
1593 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1594 Info.ptrVal = CI.getArgOperand(0);
1595 Info.align.reset();
1596 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1597 return true;
1598 }
1599 case Intrinsic::amdgcn_ds_gws_init:
1600 case Intrinsic::amdgcn_ds_gws_barrier:
1601 case Intrinsic::amdgcn_ds_gws_sema_v:
1602 case Intrinsic::amdgcn_ds_gws_sema_br:
1603 case Intrinsic::amdgcn_ds_gws_sema_p:
1604 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1605 Info.opc = ISD::INTRINSIC_VOID;
1606
1607 const GCNTargetMachine &TM =
1608 static_cast<const GCNTargetMachine &>(getTargetMachine());
1609
1611 Info.ptrVal = MFI->getGWSPSV(TM);
1612
1613 // This is an abstract access, but we need to specify a type and size.
1614 Info.memVT = MVT::i32;
1615 Info.size = 4;
1616 Info.align = Align(4);
1617
1618 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1619 Info.flags |= MachineMemOperand::MOLoad;
1620 else
1621 Info.flags |= MachineMemOperand::MOStore;
1622 return true;
1623 }
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1632 Info.opc = ISD::INTRINSIC_VOID;
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1634 Info.ptrVal = CI.getArgOperand(1);
1636 return true;
1637 }
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1642 Info.opc = ISD::INTRINSIC_VOID;
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1644 Info.ptrVal = CI.getArgOperand(0);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_load_to_lds:
1649 case Intrinsic::amdgcn_global_load_lds: {
1650 Info.opc = ISD::INTRINSIC_VOID;
1651 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1652 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1653 Info.ptrVal = CI.getArgOperand(1);
1655 return true;
1656 }
1657 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1661 Info.opc = ISD::INTRINSIC_W_CHAIN;
1662
1663 const GCNTargetMachine &TM =
1664 static_cast<const GCNTargetMachine &>(getTargetMachine());
1665
1667 Info.ptrVal = MFI->getGWSPSV(TM);
1668
1669 // This is an abstract access, but we need to specify a type and size.
1670 Info.memVT = MVT::i32;
1671 Info.size = 4;
1672 Info.align = Align(4);
1673
1675 return true;
1676 }
1677 case Intrinsic::amdgcn_s_prefetch_data:
1678 case Intrinsic::amdgcn_flat_prefetch:
1679 case Intrinsic::amdgcn_global_prefetch: {
1680 Info.opc = ISD::INTRINSIC_VOID;
1681 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1682 Info.ptrVal = CI.getArgOperand(0);
1683 Info.flags |= MachineMemOperand::MOLoad;
1684 return true;
1685 }
1686 default:
1687 return false;
1688 }
1689}
1690
1692 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1694 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1695 // The DAG's ValueType loses the addrspaces.
1696 // Add them as 2 extra Constant operands "from" and "to".
1697 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1698 unsigned DstAS = I.getType()->getPointerAddressSpace();
1699 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1700 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1701 break;
1702 }
1703 default:
1704 break;
1705 }
1706}
1707
1710 Type *&AccessTy) const {
1711 Value *Ptr = nullptr;
1712 switch (II->getIntrinsicID()) {
1713 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1714 case Intrinsic::amdgcn_cluster_load_b128:
1715 case Intrinsic::amdgcn_cluster_load_b64:
1716 case Intrinsic::amdgcn_cluster_load_b32:
1717 case Intrinsic::amdgcn_ds_append:
1718 case Intrinsic::amdgcn_ds_consume:
1719 case Intrinsic::amdgcn_ds_load_tr8_b64:
1720 case Intrinsic::amdgcn_ds_load_tr16_b128:
1721 case Intrinsic::amdgcn_ds_load_tr4_b64:
1722 case Intrinsic::amdgcn_ds_load_tr6_b96:
1723 case Intrinsic::amdgcn_ds_read_tr4_b64:
1724 case Intrinsic::amdgcn_ds_read_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr8_b64:
1726 case Intrinsic::amdgcn_ds_read_tr16_b64:
1727 case Intrinsic::amdgcn_ds_ordered_add:
1728 case Intrinsic::amdgcn_ds_ordered_swap:
1729 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1730 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1731 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1733 case Intrinsic::amdgcn_flat_load_monitor_b128:
1734 case Intrinsic::amdgcn_flat_load_monitor_b32:
1735 case Intrinsic::amdgcn_flat_load_monitor_b64:
1736 case Intrinsic::amdgcn_global_atomic_csub:
1737 case Intrinsic::amdgcn_global_atomic_fmax_num:
1738 case Intrinsic::amdgcn_global_atomic_fmin_num:
1739 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1740 case Intrinsic::amdgcn_global_load_monitor_b128:
1741 case Intrinsic::amdgcn_global_load_monitor_b32:
1742 case Intrinsic::amdgcn_global_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b64:
1744 case Intrinsic::amdgcn_global_load_tr_b128:
1745 case Intrinsic::amdgcn_global_load_tr4_b64:
1746 case Intrinsic::amdgcn_global_load_tr6_b96:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1751 Ptr = II->getArgOperand(0);
1752 break;
1753 case Intrinsic::amdgcn_load_to_lds:
1754 case Intrinsic::amdgcn_global_load_lds:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1763 Ptr = II->getArgOperand(1);
1764 break;
1765 default:
1766 return false;
1767 }
1768 AccessTy = II->getType();
1769 Ops.push_back(Ptr);
1770 return true;
1771}
1772
1774 unsigned AddrSpace) const {
1775 if (!Subtarget->hasFlatInstOffsets()) {
1776 // Flat instructions do not have offsets, and only have the register
1777 // address.
1778 return AM.BaseOffs == 0 && AM.Scale == 0;
1779 }
1780
1781 decltype(SIInstrFlags::FLAT) FlatVariant =
1785
1786 return AM.Scale == 0 &&
1787 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1788 AM.BaseOffs, AddrSpace, FlatVariant));
1789}
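A minimal standalone sketch of the flat addressing-mode check above, assuming a hypothetical signed offset bit-width in place of the subtarget's isLegalFLATOffset query (the struct and function names here are illustrative, not LLVM APIs): flat addressing allows no scaled index, and any immediate must fit the encoding, or be zero on targets without flat offsets.

#include <cstdint>

struct SimpleAddrMode {
  int64_t BaseOffs = 0; // immediate byte offset
  int64_t Scale = 0;    // scaled-index factor; flat addressing has none
};

static bool fitsSignedBits(int64_t V, unsigned Bits) {
  int64_t Min = -(int64_t(1) << (Bits - 1));
  int64_t Max = (int64_t(1) << (Bits - 1)) - 1;
  return V >= Min && V <= Max;
}

// OffsetBits == 0 models a target without flat instruction offsets.
static bool isLegalFlatLikeAddrMode(const SimpleAddrMode &AM,
                                    unsigned OffsetBits) {
  if (OffsetBits == 0)
    return AM.BaseOffs == 0 && AM.Scale == 0;
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || fitsSignedBits(AM.BaseOffs, OffsetBits));
}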
1790
1792 if (Subtarget->hasFlatGlobalInsts())
1794
1795 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1796 // Assume that we will use FLAT for all global memory accesses
1797 // on VI.
1798 // FIXME: This assumption is currently wrong. On VI we still use
1799 // MUBUF instructions for the r + i addressing mode. As currently
1800 // implemented, the MUBUF instructions only work on buffer < 4GB.
1801 // It may be possible to support > 4GB buffers with MUBUF instructions,
1802 // by setting the stride value in the resource descriptor which would
1803 // increase the size limit to (stride * 4GB). However, this is risky,
1804 // because it has never been validated.
1806 }
1807
1808 return isLegalMUBUFAddressingMode(AM);
1809}
1810
1811bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1812 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1813 // additionally can do r + r + i with addr64. 32-bit has more addressing
1814 // mode options. Depending on the resource constant, it can also do
1815 // (i64 r0) + (i32 r1) * (i14 i).
1816 //
1817 // Private arrays end up using a scratch buffer most of the time, so also
1818 // assume those use MUBUF instructions. Scratch loads / stores are currently
1819 // implemented as mubuf instructions with offen bit set, so slightly
1820 // different than the normal addr64.
1821 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1822 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1823 return false;
1824
1825 // FIXME: Since we can split immediate into soffset and immediate offset,
1826 // would it make sense to allow any immediate?
1827
1828 switch (AM.Scale) {
1829 case 0: // r + i or just i, depending on HasBaseReg.
1830 return true;
1831 case 1:
1832 return true; // We have r + r or r + i.
1833 case 2:
1834 if (AM.HasBaseReg) {
1835 // Reject 2 * r + r.
1836 return false;
1837 }
1838
1839 // Allow 2 * r as r + r,
1840 // and 2 * r + i as r + r + i.
1841 return true;
1842 default: // Don't allow n * r
1843 return false;
1844 }
1845}
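The Scale handling above can be summarized by the small standalone predicate below. It is only a sketch: the 12-bit unsigned immediate test is a simplification of isLegalMUBUFImmOffset (which varies by generation), and the names are not LLVM APIs.

#include <cstdint>

static bool isLegalMUBUFLikeAddrMode(int64_t BaseOffs, int64_t Scale,
                                     bool HasBaseReg) {
  // Simplified stand-in for the 12-bit unsigned byte offset check.
  if (BaseOffs < 0 || BaseOffs >= (int64_t(1) << 12))
    return false;

  switch (Scale) {
  case 0: // r + i or just i
  case 1: // r + r or r + i
    return true;
  case 2:
    return !HasBaseReg; // 2*r folds to r + r, but 2*r + r does not
  default:
    return false; // no n*r addressing
  }
}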
1846
1848 const AddrMode &AM, Type *Ty,
1849 unsigned AS,
1850 Instruction *I) const {
1851 // No global is ever allowed as a base.
1852 if (AM.BaseGV)
1853 return false;
1854
1855 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1856 return isLegalGlobalAddressingMode(AM);
1857
1858 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1862 // If the offset isn't a multiple of 4, it probably isn't going to be
1863 // correctly aligned.
1864 // FIXME: Can we get the real alignment here?
1865 if (AM.BaseOffs % 4 != 0)
1866 return isLegalMUBUFAddressingMode(AM);
1867
1868 if (!Subtarget->hasScalarSubwordLoads()) {
1869 // There are no SMRD extloads, so if we have to do a small type access we
1870 // will use a MUBUF load.
1871 // FIXME?: We also need to do this if unaligned, but we don't know the
1872 // alignment here.
1873 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1874 return isLegalGlobalAddressingMode(AM);
1875 }
1876
1877 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1878 // SMRD instructions have an 8-bit, dword offset on SI.
1879 if (!isUInt<8>(AM.BaseOffs / 4))
1880 return false;
1881 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1882 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1883 // in 8-bits, it can use a smaller encoding.
1884 if (!isUInt<32>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1887 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1888 if (!isUInt<20>(AM.BaseOffs))
1889 return false;
1890 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1891 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1892 // for S_BUFFER_* instructions).
1893 if (!isInt<21>(AM.BaseOffs))
1894 return false;
1895 } else {
1896 // On GFX12, all offsets are signed 24-bit in bytes.
1897 if (!isInt<24>(AM.BaseOffs))
1898 return false;
1899 }
1900
1901 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1903 AM.BaseOffs < 0) {
1904 // Scalar (non-buffer) loads can only use a negative offset if
1905 // soffset+offset is non-negative. Since the compiler can only prove that
1906 // in a few special cases, it is safer to claim that negative offsets are
1907 // not supported.
1908 return false;
1909 }
1910
1911 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1912 return true;
1913
1914 if (AM.Scale == 1 && AM.HasBaseReg)
1915 return true;
1916
1917 return false;
1918 }
1919
1920 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1921 return Subtarget->enableFlatScratch()
1923 : isLegalMUBUFAddressingMode(AM);
1924
1925 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1926 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1927 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1928 // field.
1929 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1930 // an 8-bit dword offset but we don't know the alignment here.
1931 if (!isUInt<16>(AM.BaseOffs))
1932 return false;
1933
1934 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1935 return true;
1936
1937 if (AM.Scale == 1 && AM.HasBaseReg)
1938 return true;
1939
1940 return false;
1941 }
1942
1944 // For an unknown address space, this usually means that this is for some
1945 // reason being used for pure arithmetic, and not based on some addressing
1946 // computation. We don't have instructions that compute pointers with any
1947 // addressing modes, so treat them as having no offset like flat
1948 // instructions.
1950 }
1951
1952 // Assume a user alias of global for unknown address spaces.
1953 return isLegalGlobalAddressingMode(AM);
1954}
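The per-generation scalar-load offset ranges checked above can be collected into one helper, shown below as a sketch. The Gen enum is a stand-in for AMDGPUSubtarget::Generation, and the GFX9 and GFX12 cases here cover the "GFX9 up to GFX12" and "GFX12 and later" branches respectively.

#include <cstdint>

enum class Gen { SI, CI, VI, GFX9, GFX12 };

static bool isUIntN(unsigned N, int64_t V) {
  return V >= 0 && (N >= 63 || V < (int64_t(1) << N));
}
static bool isIntN(unsigned N, int64_t V) {
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}

static bool smrdOffsetFits(Gen G, int64_t ByteOffset) {
  switch (G) {
  case Gen::SI:    return isUIntN(8, ByteOffset / 4);  // 8-bit dword offset
  case Gen::CI:    return isUIntN(32, ByteOffset / 4); // 32-bit literal dword offset
  case Gen::VI:    return isUIntN(20, ByteOffset);     // 20-bit byte offset
  case Gen::GFX9:  return isIntN(21, ByteOffset);      // signed 21-bit byte offset
  case Gen::GFX12: return isIntN(24, ByteOffset);      // signed 24-bit byte offset
  }
  return false;
}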
1955
1957 const MachineFunction &MF) const {
1959 return (MemVT.getSizeInBits() <= 4 * 32);
1960 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1961 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1962 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1963 }
1965 return (MemVT.getSizeInBits() <= 2 * 32);
1966 return true;
1967}
1968
1970 unsigned Size, unsigned AddrSpace, Align Alignment,
1971 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1972 if (IsFast)
1973 *IsFast = 0;
1974
1975 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1976 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1977 // Check if alignment requirements for ds_read/write instructions are
1978 // disabled.
1979 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1980 return false;
1981
1982 Align RequiredAlignment(
1983 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1984 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1985 Alignment < RequiredAlignment)
1986 return false;
1987
1988 // Either the alignment requirements are "enabled", or there is an
1989 // unaligned LDS access related hardware bug even though alignment
1990 // requirements are "disabled". In either case, we need to check for proper
1991 // alignment requirements.
1992 //
1993 switch (Size) {
1994 case 64:
1995 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1996 // address is negative, then the instruction is incorrectly treated as
1997 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1998 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1999 // load later in the SILoadStoreOptimizer.
2000 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2001 return false;
2002
2003 // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2004 // can do a 4 byte aligned, 8 byte access in a single operation using
2005 // ds_read2/write2_b32 with adjacent offsets.
2006 RequiredAlignment = Align(4);
2007
2008 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2009 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2010 // ds_write2_b32 depending on the alignment. In either case with either
2011 // alignment there is no faster way of doing this.
2012
2013 // The numbers returned here and below are not additive; they form a
2014 // 'speed rank', only meant to be compared to decide whether one way of
2015 // lowering an operation is faster than another (see the sketch after this
2016 // function). A naturally aligned operation gets its bit size to indicate
2017 // that "it operates with a speed comparable to an N-bit wide load". With
2018 // full alignment ds128 is slower than ds96, for example. If underaligned,
2019 // it is comparable to the speed of a single dword access, which would then
2020 // mean 32 < 128 and it is faster to issue a wide load regardless.
2021 // 1 simply means "slow, don't do it": comparing an aligned load to a wider
2022 // load which would no longer be aligned, the latter is slower.
2023 if (IsFast)
2024 *IsFast = (Alignment >= RequiredAlignment) ? 64
2025 : (Alignment < Align(4)) ? 32
2026 : 1;
2027 return true;
2028 }
2029
2030 break;
2031 case 96:
2032 if (!Subtarget->hasDS96AndDS128())
2033 return false;
2034
2035 // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
2036 // gfx8 and older.
2037
2038 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2039 // Naturally aligned access is fastest. However, also report it is Fast
2040 // if memory is aligned to less than a dword. A narrow load or store will
2041 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2042 // be more of them, so overall we will pay less penalty issuing a single
2043 // instruction.
2044
2045 // See comment on the values above.
2046 if (IsFast)
2047 *IsFast = (Alignment >= RequiredAlignment) ? 96
2048 : (Alignment < Align(4)) ? 32
2049 : 1;
2050 return true;
2051 }
2052
2053 break;
2054 case 128:
2055 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2056 return false;
2057
2058 // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
2059 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
2060 // single operation using ds_read2/write2_b64.
2061 RequiredAlignment = Align(8);
2062
2063 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2064 // Naturally aligned access is fastest. However, also report it is Fast
2065 // if memory is aligned to less than a dword. A narrow load or store will
2066 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2067 // will be more of them, so overall we will pay less penalty issuing a
2068 // single instruction.
2069
2070 // See comment on the values above.
2071 if (IsFast)
2072 *IsFast = (Alignment >= RequiredAlignment) ? 128
2073 : (Alignment < Align(4)) ? 32
2074 : 1;
2075 return true;
2076 }
2077
2078 break;
2079 default:
2080 if (Size > 32)
2081 return false;
2082
2083 break;
2084 }
2085
2086 // See comment on the values above.
2087 // Note that we have a single-dword or sub-dword here, so if underaligned
2088 // it is the slowest possible access, hence the returned value is 0.
2089 if (IsFast)
2090 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2091
2092 return Alignment >= RequiredAlignment ||
2093 Subtarget->hasUnalignedDSAccessEnabled();
2094 }
2095
2096 // FIXME: We have to be conservative here and assume that flat operations
2097 // will access scratch. If we had access to the IR function, then we
2098 // could determine if any private memory was used in the function.
2099 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2100 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2101 bool AlignedBy4 = Alignment >= Align(4);
2102 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2103 if (IsFast)
2104 *IsFast = AlignedBy4 ? Size : 1;
2105 return true;
2106 }
2107
2108 if (IsFast)
2109 *IsFast = AlignedBy4;
2110
2111 return AlignedBy4;
2112 }
2113
2114 // So long as they are correct, wide global memory operations perform better
2115 // than multiple smaller memory ops -- even when misaligned
2116 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2117 if (IsFast)
2118 *IsFast = Size;
2119
2120 return Alignment >= Align(4) ||
2121 Subtarget->hasUnalignedBufferAccessEnabled();
2122 }
2123
2124 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2125 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2126 // out-of-bounds behavior, but in the edge case where an access starts
2127 // out-of-bounds and then enters in-bounds, the entire access would be treated
2128 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2129 // natural alignment of buffer accesses.
2130 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2131 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2132 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2133 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2134 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2135 return false;
2136 }
2137
2138 // Smaller than dword value must be aligned.
2139 if (Size < 32)
2140 return false;
2141
2142 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2143 // byte-address are ignored, thus forcing Dword alignment.
2144 // This applies to private, global, and constant memory.
2145 if (IsFast)
2146 *IsFast = 1;
2147
2148 return Size >= 32 && Alignment >= Align(4);
2149}
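The *IsFast values reported above are only meaningful under comparison, as the 'speed rank' comment explains. The sketch below (not LLVM code; the names are invented for illustration) shows the intended use: for example, a b128 DS access at 4-byte alignment reports 32, the same as a single b32 access, so the wide form is never worse and wins the tie.

struct LoweringOption {
  unsigned BitWidth;  // access width under consideration
  unsigned SpeedRank; // value the query above would report via *IsFast
};

// Prefer the wider access unless its rank says it is strictly slower.
static LoweringOption pickFaster(LoweringOption Narrow, LoweringOption Wide) {
  return Wide.SpeedRank >= Narrow.SpeedRank ? Wide : Narrow;
}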
2150
2152 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2153 unsigned *IsFast) const {
2155 Alignment, Flags, IsFast);
2156}
2157
2159 LLVMContext &Context, const MemOp &Op,
2160 const AttributeList &FuncAttributes) const {
2161 // FIXME: Should account for address space here.
2162
2163 // The default fallback uses the private pointer size as a guess for a type to
2164 // use. Make sure we switch these to 64-bit accesses.
2165
2166 if (Op.size() >= 16 &&
2167 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2168 return MVT::v4i32;
2169
2170 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2171 return MVT::v2i32;
2172
2173 // Use the default.
2174 return MVT::Other;
2175}
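As a rough sketch of the thresholds above (not LLVM code), the memory-op type choice amounts to picking a 16-byte or 8-byte element whenever the copy is large enough and the destination is at least dword aligned:

#include <cstddef>

// Returns the access width in bytes; 0 means "use the generic default"
// (the MVT::Other case above).
static unsigned optimalMemOpBytes(std::size_t Size, unsigned DstAlign) {
  if (Size >= 16 && DstAlign >= 4)
    return 16; // corresponds to v4i32
  if (Size >= 8 && DstAlign >= 4)
    return 8;  // corresponds to v2i32
  return 0;
}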
2176
2178 const MemSDNode *MemNode = cast<MemSDNode>(N);
2179 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2180}
2181
2186
2188 unsigned DestAS) const {
2189 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2190 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2191 Subtarget->hasGloballyAddressableScratch()) {
2192 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2193 return false;
2194 }
2195
2196 // Flat -> private/local is a simple truncate.
2197 // Flat -> global is no-op
2198 return true;
2199 }
2200
2201 const GCNTargetMachine &TM =
2202 static_cast<const GCNTargetMachine &>(getTargetMachine());
2203 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2204}
2205
2213
2215 Type *Ty) const {
2216 // FIXME: Could be smarter if called for vector constants.
2217 return true;
2218}
2219
2221 unsigned Index) const {
2223 return false;
2224
2225 // TODO: Add more cases that are cheap.
2226 return Index == 0;
2227}
2228
2229bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2230 // TODO: This should be more aggressive, particular for 16-bit element
2231 // vectors. However there are some mixed improvements and regressions.
2232 EVT EltTy = VT.getVectorElementType();
2233 return EltTy.getSizeInBits() % 32 == 0;
2234}
2235
2237 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2238 switch (Op) {
2239 case ISD::LOAD:
2240 case ISD::STORE:
2241 return true;
2242 default:
2243 return false;
2244 }
2245 }
2246
2247 // SimplifySetCC uses this function to determine whether or not it should
2248 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2249 if (VT == MVT::i1 && Op == ISD::SETCC)
2250 return false;
2251
2253}
2254
2255SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2256 const SDLoc &SL,
2257 SDValue Chain,
2258 uint64_t Offset) const {
2259 const DataLayout &DL = DAG.getDataLayout();
2263
2264 auto [InputPtrReg, RC, ArgTy] =
2265 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2266
2267 // We may not have the kernarg segment argument if we have no kernel
2268 // arguments.
2269 if (!InputPtrReg)
2270 return DAG.getConstant(Offset, SL, PtrVT);
2271
2273 SDValue BasePtr = DAG.getCopyFromReg(
2274 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2275
2276 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2277}
2278
2279SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2283 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2284}
2285
2286SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2287 const SDLoc &SL) const {
2288
2290 std::optional<uint32_t> KnownSize =
2292 if (KnownSize.has_value())
2293 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2294 return SDValue();
2295}
2296
2297SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2298 const SDLoc &SL, SDValue Val,
2299 bool Signed,
2300 const ISD::InputArg *Arg) const {
2301 // First, if it is a widened vector, narrow it.
2302 if (VT.isVector() &&
2304 EVT NarrowedVT =
2307 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2308 DAG.getConstant(0, SL, MVT::i32));
2309 }
2310
2311 // Then convert the vector elements or scalar value.
2312 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2313 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2314 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2315 }
2316
2317 if (MemVT.isFloatingPoint())
2318 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2319 else if (Signed)
2320 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2321 else
2322 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2323
2324 return Val;
2325}
2326
2327SDValue SITargetLowering::lowerKernargMemParameter(
2328 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2329 uint64_t Offset, Align Alignment, bool Signed,
2330 const ISD::InputArg *Arg) const {
2331 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2332
2333 // Try to avoid using an extload by loading earlier than the argument address,
2334 // and extracting the relevant bits. The load should hopefully be merged with
2335 // the previous argument.
2336 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2337 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2338 int64_t AlignDownOffset = alignDown(Offset, 4);
2339 int64_t OffsetDiff = Offset - AlignDownOffset;
2340
2341 EVT IntVT = MemVT.changeTypeToInteger();
2342
2343 // TODO: If we passed in the base kernel offset we could have a better
2344 // alignment than 4, but we don't really need it.
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2346 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2349
2350 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2351 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2352
2353 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2354 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2355 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2356
2357 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2358 }
2359
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2361 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2364
2365 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2366 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2367}
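The sub-dword path above loads the containing dword at an aligned-down offset and then shifts and truncates. The following host-side sketch (illustrative only; assumes a little-endian in-memory layout, as on AMDGPU) shows the same arithmetic on plain integers:

#include <cstdint>
#include <cstring>

static uint32_t extractSubDwordArg(const uint8_t *KernargBase, uint64_t Offset,
                                   unsigned StoreSizeInBytes) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // 0..3 bytes

  uint32_t Dword;
  std::memcpy(&Dword, KernargBase + AlignDownOffset, 4); // aligned dword load

  uint32_t Shifted = Dword >> (OffsetDiff * 8); // the SRL above
  uint32_t Mask = StoreSizeInBytes >= 4 ? ~0u
                                        : (1u << (StoreSizeInBytes * 8)) - 1;
  return Shifted & Mask; // the TRUNCATE above
}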
2368
2369/// Coerce an argument which was passed in a different ABI type to the original
2370/// expected value type.
2371SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2372 SDValue Val,
2373 CCValAssign &VA,
2374 const SDLoc &SL) const {
2375 EVT ValVT = VA.getValVT();
2376
2377 // If this is an 8 or 16-bit value, it is really passed promoted
2378 // to 32 bits. Insert an assert[sz]ext to capture this, then
2379 // truncate to the right size.
2380 switch (VA.getLocInfo()) {
2381 case CCValAssign::Full:
2382 return Val;
2383 case CCValAssign::BCvt:
2384 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2385 case CCValAssign::SExt:
2386 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2387 DAG.getValueType(ValVT));
2388 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2389 case CCValAssign::ZExt:
2390 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2391 DAG.getValueType(ValVT));
2392 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2393 case CCValAssign::AExt:
2394 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2395 default:
2396 llvm_unreachable("Unknown loc info!");
2397 }
2398}
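For the promoted-integer cases above, the Assert[SZ]ext node only records what the upper bits of the 32-bit location are known to contain; recovering the original value is a truncation in every case. A plain-integer sketch (names are illustrative, not LLVM APIs):

#include <cstdint>

enum class LocKind { SExt, ZExt, AExt };

static uint16_t recoverPromotedI16(uint32_t LocVal, LocKind Kind) {
  switch (Kind) {
  case LocKind::SExt: // bits 16..31 replicate bit 15
  case LocKind::ZExt: // bits 16..31 are zero
  case LocKind::AExt: // bits 16..31 are unspecified
    return static_cast<uint16_t>(LocVal); // the TRUNCATE above
  }
  return 0;
}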
2399
2400SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2401 CCValAssign &VA, const SDLoc &SL,
2402 SDValue Chain,
2403 const ISD::InputArg &Arg) const {
2404 MachineFunction &MF = DAG.getMachineFunction();
2405 MachineFrameInfo &MFI = MF.getFrameInfo();
2406
2407 if (Arg.Flags.isByVal()) {
2408 unsigned Size = Arg.Flags.getByValSize();
2409 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2410 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2411 }
2412
2413 unsigned ArgOffset = VA.getLocMemOffset();
2414 unsigned ArgSize = VA.getValVT().getStoreSize();
2415
2416 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2417
2418 // Create load nodes to retrieve arguments from the stack.
2419 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2420
2421 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2423 MVT MemVT = VA.getValVT();
2424
2425 switch (VA.getLocInfo()) {
2426 default:
2427 break;
2428 case CCValAssign::BCvt:
2429 MemVT = VA.getLocVT();
2430 break;
2431 case CCValAssign::SExt:
2432 ExtType = ISD::SEXTLOAD;
2433 break;
2434 case CCValAssign::ZExt:
2435 ExtType = ISD::ZEXTLOAD;
2436 break;
2437 case CCValAssign::AExt:
2438 ExtType = ISD::EXTLOAD;
2439 break;
2440 }
2441
2442 SDValue ArgValue = DAG.getExtLoad(
2443 ExtType, SL, VA.getLocVT(), Chain, FIN,
2445
2446 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2447 if (ConvertedVal == ArgValue)
2448 return ConvertedVal;
2449
2450 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2451}
2452
2453SDValue SITargetLowering::lowerWorkGroupId(
2454 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2457 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2458 if (!Subtarget->hasClusters())
2459 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460
2461 // Clusters are supported. Return the global position in the grid. If clusters
2462 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2463
2464 // WorkGroupIdXYZ = ClusterId == 0 ?
2465 // ClusterIdXYZ :
2466 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2467 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2468 SDLoc SL(ClusterIdXYZ);
2469 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2470 SDValue One = DAG.getConstant(1, SL, VT);
2471 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2472 SDValue ClusterWorkGroupIdXYZ =
2473 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2474 SDValue GlobalIdXYZ =
2475 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2476 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2477
2478 switch (MFI.getClusterDims().getKind()) {
2481 return GlobalIdXYZ;
2483 return ClusterIdXYZ;
2485 using namespace AMDGPU::Hwreg;
2486 SDValue ClusterIdField =
2487 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2488 SDNode *GetReg =
2489 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2490 SDValue ClusterId(GetReg, 0);
2491 SDValue Zero = DAG.getConstant(0, SL, VT);
2492 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2493 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2494 }
2495 }
2496
2497 llvm_unreachable("nothing should reach here");
2498}
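The reconstruction above, for one dimension, amounts to the arithmetic below. This is a sketch with plain integers standing in for the SGPR values; ClustersInUse models the runtime ClusterId != 0 check in the whole-grid case.

#include <cstdint>

static uint32_t globalWorkGroupId(uint32_t ClusterIdXYZ,
                                  uint32_t ClusterMaxIdXYZ,
                                  uint32_t ClusterWorkGroupIdXYZ,
                                  bool ClustersInUse) {
  // Without clusters, the "cluster id" register already holds the workgroup id.
  if (!ClustersInUse)
    return ClusterIdXYZ;

  uint32_t ClusterSizeXYZ = ClusterMaxIdXYZ + 1;
  return ClusterIdXYZ * ClusterSizeXYZ + ClusterWorkGroupIdXYZ;
}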
2499
2500SDValue SITargetLowering::getPreloadedValue(
2501 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2503 const ArgDescriptor *Reg = nullptr;
2504 const TargetRegisterClass *RC;
2505 LLT Ty;
2506
2508 const ArgDescriptor WorkGroupIDX =
2509 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2510 // If GridZ is not programmed in an entry function then the hardware will set
2511 // it to all zeros, so there is no need to mask the GridY value in the low
2512 // order bits.
2513 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2514 AMDGPU::TTMP7,
2515 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2516 const ArgDescriptor WorkGroupIDZ =
2517 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2518 const ArgDescriptor ClusterWorkGroupIDX =
2519 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2520 const ArgDescriptor ClusterWorkGroupIDY =
2521 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2522 const ArgDescriptor ClusterWorkGroupIDZ =
2523 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2524 const ArgDescriptor ClusterWorkGroupMaxIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2526 const ArgDescriptor ClusterWorkGroupMaxIDY =
2527 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2528 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2529 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2530 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2531 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2532
2533 auto LoadConstant = [&](unsigned N) {
2534 return DAG.getConstant(N, SDLoc(), VT);
2535 };
2536
2537 if (Subtarget->hasArchitectedSGPRs() &&
2539 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2540 bool HasFixedDims = ClusterDims.isFixedDims();
2541
2542 switch (PVID) {
2544 Reg = &WorkGroupIDX;
2545 RC = &AMDGPU::SReg_32RegClass;
2546 Ty = LLT::scalar(32);
2547 break;
2549 Reg = &WorkGroupIDY;
2550 RC = &AMDGPU::SReg_32RegClass;
2551 Ty = LLT::scalar(32);
2552 break;
2554 Reg = &WorkGroupIDZ;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDX;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDY;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDZ;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[0] - 1);
2582 Reg = &ClusterWorkGroupMaxIDX;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[1] - 1);
2589 Reg = &ClusterWorkGroupMaxIDY;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 if (HasFixedDims)
2595 return LoadConstant(ClusterDims.getDims()[2] - 1);
2596 Reg = &ClusterWorkGroupMaxIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2601 Reg = &ClusterWorkGroupMaxFlatID;
2602 RC = &AMDGPU::SReg_32RegClass;
2603 Ty = LLT::scalar(32);
2604 break;
2605 default:
2606 break;
2607 }
2608 }
2609
2610 if (!Reg)
2611 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2612 if (!Reg) {
2614 // It's possible for a kernarg intrinsic call to appear in a kernel with
2615 // no allocated segment, in which case we do not add the user sgpr
2616 // argument, so just return null.
2617 return DAG.getConstant(0, SDLoc(), VT);
2618 }
2619
2620 // It's undefined behavior if a function marked with the amdgpu-no-*
2621 // attributes uses the corresponding intrinsic.
2622 return DAG.getPOISON(VT);
2623 }
2624
2625 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2626}
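The TTMP6 masks listed above imply a packed nibble layout for the cluster workgroup fields. The helper below (illustrative only, not LLVM code) unpacks a TTMP6-like value using exactly those masks:

#include <cstdint>

struct ClusterFields {
  uint32_t IdX, IdY, IdZ;
  uint32_t MaxIdX, MaxIdY, MaxIdZ;
  uint32_t MaxFlatId;
};

static uint32_t extractField(uint32_t Reg, uint32_t Mask) {
  unsigned Shift = 0;
  while (((Mask >> Shift) & 1u) == 0)
    ++Shift; // shift down to the mask's lowest set bit
  return (Reg & Mask) >> Shift;
}

static ClusterFields unpackTTMP6(uint32_t Reg) {
  return {extractField(Reg, 0x0000000Fu), extractField(Reg, 0x000000F0u),
          extractField(Reg, 0x00000F00u), extractField(Reg, 0x0000F000u),
          extractField(Reg, 0x000F0000u), extractField(Reg, 0x00F00000u),
          extractField(Reg, 0x0F000000u)};
}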
2627
2629 CallingConv::ID CallConv,
2630 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2631 FunctionType *FType,
2633 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2634 const ISD::InputArg *Arg = &Ins[I];
2635
2636 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2637 "vector type argument should have been split");
2638
2639 // First check if it's a PS input addr.
2640 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2641 PSInputNum <= 15) {
2642 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2643
2644 // Inconveniently only the first part of the split is marked as isSplit,
2645 // so skip to the end. We only want to increment PSInputNum once for the
2646 // entire split argument.
2647 if (Arg->Flags.isSplit()) {
2648 while (!Arg->Flags.isSplitEnd()) {
2649 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2650 "unexpected vector split in ps argument type");
2651 if (!SkipArg)
2652 Splits.push_back(*Arg);
2653 Arg = &Ins[++I];
2654 }
2655 }
2656
2657 if (SkipArg) {
2658 // We can safely skip PS inputs.
2659 Skipped.set(Arg->getOrigArgIndex());
2660 ++PSInputNum;
2661 continue;
2662 }
2663
2664 Info->markPSInputAllocated(PSInputNum);
2665 if (Arg->Used)
2666 Info->markPSInputEnabled(PSInputNum);
2667
2668 ++PSInputNum;
2669 }
2670
2671 Splits.push_back(*Arg);
2672 }
2673}
2674
2675// Allocate special inputs passed in VGPRs.
2677 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) const {
2679 const LLT S32 = LLT::scalar(32);
2681
2682 if (Info.hasWorkItemIDX()) {
2683 Register Reg = AMDGPU::VGPR0;
2684 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2685
2686 CCInfo.AllocateReg(Reg);
2687 unsigned Mask =
2688 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2689 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2690 }
2691
2692 if (Info.hasWorkItemIDY()) {
2693 assert(Info.hasWorkItemIDX());
2694 if (Subtarget->hasPackedTID()) {
2695 Info.setWorkItemIDY(
2696 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2697 } else {
2698 unsigned Reg = AMDGPU::VGPR1;
2699 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2700
2701 CCInfo.AllocateReg(Reg);
2702 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2703 }
2704 }
2705
2706 if (Info.hasWorkItemIDZ()) {
2707 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2708 if (Subtarget->hasPackedTID()) {
2709 Info.setWorkItemIDZ(
2710 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2711 } else {
2712 unsigned Reg = AMDGPU::VGPR2;
2713 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2714
2715 CCInfo.AllocateReg(Reg);
2716 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2717 }
2718 }
2719}
2720
2721 // Try to allocate a VGPR at the end of the argument list, or if no argument
2722 // VGPRs are left, allocate a stack slot.
2723 // If \p Mask is given, it indicates the bitfield position in the register.
2724 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2725static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2726 ArgDescriptor Arg = ArgDescriptor()) {
2727 if (Arg.isSet())
2728 return ArgDescriptor::createArg(Arg, Mask);
2729
2730 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2731 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2732 if (RegIdx == ArgVGPRs.size()) {
2733 // Spill to stack required.
2734 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2735
2736 return ArgDescriptor::createStack(Offset, Mask);
2737 }
2738
2739 unsigned Reg = ArgVGPRs[RegIdx];
2740 Reg = CCInfo.AllocateReg(Reg);
2741 assert(Reg != AMDGPU::NoRegister);
2742
2743 MachineFunction &MF = CCInfo.getMachineFunction();
2744 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2745 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2746 return ArgDescriptor::createRegister(Reg, Mask);
2747}
2748
2750 const TargetRegisterClass *RC,
2751 unsigned NumArgRegs) {
2752 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2753 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2754 if (RegIdx == ArgSGPRs.size())
2755 report_fatal_error("ran out of SGPRs for arguments");
2756
2757 unsigned Reg = ArgSGPRs[RegIdx];
2758 Reg = CCInfo.AllocateReg(Reg);
2759 assert(Reg != AMDGPU::NoRegister);
2760
2761 MachineFunction &MF = CCInfo.getMachineFunction();
2762 MF.addLiveIn(Reg, RC);
2764}
2765
2766// If this has a fixed position, we still should allocate the register in the
2767// CCInfo state. Technically we could get away with this for values passed
2768// outside of the normal argument range.
2770 const TargetRegisterClass *RC,
2771 MCRegister Reg) {
2772 Reg = CCInfo.AllocateReg(Reg);
2773 assert(Reg != AMDGPU::NoRegister);
2774 MachineFunction &MF = CCInfo.getMachineFunction();
2775 MF.addLiveIn(Reg, RC);
2776}
2777
2778static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2779 if (Arg) {
2780 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2781 Arg.getRegister());
2782 } else
2783 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2784}
2785
2786static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2787 if (Arg) {
2788 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2789 Arg.getRegister());
2790 } else
2791 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2792}
2793
2794/// Allocate implicit function VGPR arguments at the end of allocated user
2795/// arguments.
2797 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2798 SIMachineFunctionInfo &Info) const {
2799 const unsigned Mask = 0x3ff;
2800 ArgDescriptor Arg;
2801
2802 if (Info.hasWorkItemIDX()) {
2803 Arg = allocateVGPR32Input(CCInfo, Mask);
2804 Info.setWorkItemIDX(Arg);
2805 }
2806
2807 if (Info.hasWorkItemIDY()) {
2808 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2809 Info.setWorkItemIDY(Arg);
2810 }
2811
2812 if (Info.hasWorkItemIDZ())
2813 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2814}
2815
2816/// Allocate implicit function VGPR arguments in fixed registers.
2818 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2819 SIMachineFunctionInfo &Info) const {
2820 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2821 if (!Reg)
2822 report_fatal_error("failed to allocate VGPR for implicit arguments");
2823
2824 const unsigned Mask = 0x3ff;
2825 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2826 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2827 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2828}
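Both the fixed-register path above and the packed-TID path earlier use the same 10-bit-per-dimension layout (x in bits 0..9, y in 10..19, z in 20..29). A sketch of how a packed VGPR value decomposes (illustrative, not LLVM code):

#include <cstdint>

struct WorkItemId {
  uint32_t X, Y, Z;
};

static WorkItemId unpackWorkItemId(uint32_t PackedVGPR) {
  const uint32_t Mask = 0x3ff; // 10 bits per dimension
  return {PackedVGPR & Mask, (PackedVGPR >> 10) & Mask,
          (PackedVGPR >> 20) & Mask};
}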
2829
2831 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2832 SIMachineFunctionInfo &Info) const {
2833 auto &ArgInfo = Info.getArgInfo();
2834 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2835
2836 // TODO: Unify handling with private memory pointers.
2837 if (UserSGPRInfo.hasDispatchPtr())
2838 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2839
2840 if (UserSGPRInfo.hasQueuePtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2842
2843 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2844 // constant offset from the kernarg segment.
2845 if (Info.hasImplicitArgPtr())
2846 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2847
2848 if (UserSGPRInfo.hasDispatchID())
2849 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2850
2851 // flat_scratch_init is not applicable for non-kernel functions.
2852
2853 if (Info.hasWorkGroupIDX())
2854 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2855
2856 if (Info.hasWorkGroupIDY())
2857 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2858
2859 if (Info.hasWorkGroupIDZ())
2860 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2861
2862 if (Info.hasLDSKernelId())
2863 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2864}
2865
2866// Allocate special inputs passed in user SGPRs.
2868 MachineFunction &MF,
2869 const SIRegisterInfo &TRI,
2870 SIMachineFunctionInfo &Info) const {
2871 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2872 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2873 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2874 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2875 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2876 }
2877
2878 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2879 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2880 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2881 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2882 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2883 }
2884
2885 if (UserSGPRInfo.hasDispatchPtr()) {
2886 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2887 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2888 CCInfo.AllocateReg(DispatchPtrReg);
2889 }
2890
2891 if (UserSGPRInfo.hasQueuePtr()) {
2892 Register QueuePtrReg = Info.addQueuePtr(TRI);
2893 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2894 CCInfo.AllocateReg(QueuePtrReg);
2895 }
2896
2897 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2899 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2900 CCInfo.AllocateReg(InputPtrReg);
2901
2902 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2903 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2904 }
2905
2906 if (UserSGPRInfo.hasDispatchID()) {
2907 Register DispatchIDReg = Info.addDispatchID(TRI);
2908 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2909 CCInfo.AllocateReg(DispatchIDReg);
2910 }
2911
2912 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2913 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2914 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2915 CCInfo.AllocateReg(FlatScratchInitReg);
2916 }
2917
2918 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2919 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2920 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2921 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2922 }
2923
2924 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2925 // these from the dispatch pointer.
2926}
2927
2928 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2929 // sequential, starting from the first argument.
2931 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2933 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2934 Function &F = MF.getFunction();
2935 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2936 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2937 bool InPreloadSequence = true;
2938 unsigned InIdx = 0;
2939 bool AlignedForImplictArgs = false;
2940 unsigned ImplicitArgOffset = 0;
2941 for (auto &Arg : F.args()) {
2942 if (!InPreloadSequence || !Arg.hasInRegAttr())
2943 break;
2944
2945 unsigned ArgIdx = Arg.getArgNo();
2946 // Don't preload non-original args or parts not in the current preload
2947 // sequence.
2948 if (InIdx < Ins.size() &&
2949 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2950 break;
2951
2952 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2953 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2954 InIdx++) {
2955 assert(ArgLocs[ArgIdx].isMemLoc());
2956 auto &ArgLoc = ArgLocs[InIdx];
2957 const Align KernelArgBaseAlign = Align(16);
2958 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2959 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2960 unsigned NumAllocSGPRs =
2961 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2962
2963 // Fix alignment for hidden arguments.
2964 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2965 if (!AlignedForImplictArgs) {
2966 ImplicitArgOffset =
2967 alignTo(LastExplicitArgOffset,
2968 Subtarget->getAlignmentForImplicitArgPtr()) -
2969 LastExplicitArgOffset;
2970 AlignedForImplictArgs = true;
2971 }
2972 ArgOffset += ImplicitArgOffset;
2973 }
2974
2975 // Arg is preloaded into the previous SGPR.
2976 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2977 assert(InIdx >= 1 && "No previous SGPR");
2978 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2979 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2980 continue;
2981 }
2982
2983 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2984 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2985 // Check for free user SGPRs for preloading.
2986 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2987 InPreloadSequence = false;
2988 break;
2989 }
2990
2991 // Preload this argument.
2992 const TargetRegisterClass *RC =
2993 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2994 SmallVectorImpl<MCRegister> *PreloadRegs =
2995 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2996
2997 if (PreloadRegs->size() > 1)
2998 RC = &AMDGPU::SGPR_32RegClass;
2999 for (auto &Reg : *PreloadRegs) {
3000 assert(Reg);
3001 MF.addLiveIn(Reg, RC);
3002 CCInfo.AllocateReg(Reg);
3003 }
3004
3005 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3006 }
3007 }
3008}
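The SGPR accounting in the loop above, for one argument, is sketched below with alignTo spelled out on plain integers (the helper name and the 0-means-does-not-fit convention are mine, not LLVM's):

#include <cstdint>

static uint64_t alignToPow2(uint64_t V, uint64_t A) {
  return (V + A - 1) & ~(A - 1);
}

// Number of user SGPRs an argument of SizeInBits at ArgOffset consumes,
// including padding SGPRs needed to skip from the end of the previous
// argument; 0 if it would not fit in the remaining free user SGPRs.
static unsigned preloadSGPRCost(unsigned SizeInBits, unsigned ArgOffset,
                                unsigned LastExplicitArgOffset,
                                unsigned FreeSGPRs) {
  unsigned NumAllocSGPRs = alignToPow2(SizeInBits, 32) / 32;
  unsigned PaddingSGPRs = alignToPow2(ArgOffset - LastExplicitArgOffset, 4) / 4;
  if (PaddingSGPRs + NumAllocSGPRs > FreeSGPRs)
    return 0;
  return PaddingSGPRs + NumAllocSGPRs;
}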
3009
3011 const SIRegisterInfo &TRI,
3012 SIMachineFunctionInfo &Info) const {
3013 // Always allocate this last since it is a synthetic preload.
3014 if (Info.hasLDSKernelId()) {
3015 Register Reg = Info.addLDSKernelId();
3016 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3017 CCInfo.AllocateReg(Reg);
3018 }
3019}
3020
3021// Allocate special input registers that are initialized per-wave.
3024 CallingConv::ID CallConv,
3025 bool IsShader) const {
3026 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3027 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3028 // Note: user SGPRs are handled by the front-end for graphics shaders
3029 // Pad up the used user SGPRs with dead inputs.
3030
3031 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3032 // before enabling architected SGPRs for workgroup IDs.
3033 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3034
3035 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3036 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3037 // rely on it to reach 16 since if we end up having no stack usage, it will
3038 // not really be added.
3039 unsigned NumRequiredSystemSGPRs =
3040 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3041 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3042 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3043 Register Reg = Info.addReservedUserSGPR();
3044 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3045 CCInfo.AllocateReg(Reg);
3046 }
3047 }
3048
3049 if (!HasArchitectedSGPRs) {
3050 if (Info.hasWorkGroupIDX()) {
3051 Register Reg = Info.addWorkGroupIDX();
3052 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3053 CCInfo.AllocateReg(Reg);
3054 }
3055
3056 if (Info.hasWorkGroupIDY()) {
3057 Register Reg = Info.addWorkGroupIDY();
3058 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3059 CCInfo.AllocateReg(Reg);
3060 }
3061
3062 if (Info.hasWorkGroupIDZ()) {
3063 Register Reg = Info.addWorkGroupIDZ();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067 }
3068
3069 if (Info.hasWorkGroupInfo()) {
3070 Register Reg = Info.addWorkGroupInfo();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074
3075 if (Info.hasPrivateSegmentWaveByteOffset()) {
3076 // Scratch wave offset passed in system SGPR.
3077 unsigned PrivateSegmentWaveByteOffsetReg;
3078
3079 if (IsShader) {
3080 PrivateSegmentWaveByteOffsetReg =
3081 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3082
3083 // This is true if the scratch wave byte offset doesn't have a fixed
3084 // location.
3085 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3086 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3087 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3088 }
3089 } else
3090 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3091
3092 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3093 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3094 }
3095
3096 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3097 Info.getNumPreloadedSGPRs() >= 16);
3098}
3099
3101 MachineFunction &MF,
3102 const SIRegisterInfo &TRI,
3104 // Now that we've figured out where the scratch register inputs are, see if
3105 // we should reserve the arguments and use them directly.
3106 MachineFrameInfo &MFI = MF.getFrameInfo();
3107 bool HasStackObjects = MFI.hasStackObjects();
3108 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3109
3110 // Record that we know we have non-spill stack objects so we don't need to
3111 // check all stack objects later.
3112 if (HasStackObjects)
3113 Info.setHasNonSpillStackObjects(true);
3114
3115 // Everything live out of a block is spilled with fast regalloc, so it's
3116 // almost certain that spilling will be required.
3117 if (TM.getOptLevel() == CodeGenOptLevel::None)
3118 HasStackObjects = true;
3119
3120 // For now assume stack access is needed in any callee functions, so we need
3121 // the scratch registers to pass in.
3122 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3123
3124 if (!ST.enableFlatScratch()) {
3125 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3126 // If we have stack objects, we unquestionably need the private buffer
3127 // resource. For the Code Object V2 ABI, this will be the first 4 user
3128 // SGPR inputs. We can reserve those and use them directly.
3129
3130 Register PrivateSegmentBufferReg =
3132 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3133 } else {
3134 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3135 // We tentatively reserve the last registers (skipping those which may
3136 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3137 // we'll replace these with the ones immediately after those which were
3138 // really allocated. In the prologue copies will be inserted from the
3139 // argument to these reserved registers.
3140
3141 // Without HSA, relocations are used for the scratch pointer and the
3142 // buffer resource setup is always inserted in the prologue. Scratch wave
3143 // offset is still in an input SGPR.
3144 Info.setScratchRSrcReg(ReservedBufferReg);
3145 }
3146 }
3147
3149
3150 // For entry functions we have to set up the stack pointer if we use it,
3151 // whereas non-entry functions get this "for free". This means there is no
3152 // intrinsic advantage to using S32 over S34 in cases where we do not have
3153 // calls but do need a frame pointer (i.e. if we are requested to have one
3154 // because frame pointer elimination is disabled). To keep things simple we
3155 // only ever use S32 as the call ABI stack pointer, and so using it does not
3156 // imply we need a separate frame pointer.
3157 //
3158 // Try to use s32 as the SP, but move it if it would interfere with input
3159 // arguments. This won't work with calls though.
3160 //
3161 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3162 // registers.
3163 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3164 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3165 } else {
3167
3168 if (MFI.hasCalls())
3169 report_fatal_error("call in graphics shader with too many input SGPRs");
3170
3171 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3172 if (!MRI.isLiveIn(Reg)) {
3173 Info.setStackPtrOffsetReg(Reg);
3174 break;
3175 }
3176 }
3177
3178 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3179 report_fatal_error("failed to find register for SP");
3180 }
3181
3182 // hasFP should be accurate for entry functions even before the frame is
3183 // finalized, because it does not rely on the known stack size, only
3184 // properties like whether variable sized objects are present.
3185 if (ST.getFrameLowering()->hasFP(MF)) {
3186 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3187 }
3188}
3189
3192 return !Info->isEntryFunction();
3193}
3194
3196
3198 MachineBasicBlock *Entry,
3199 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3201
3202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3203 if (!IStart)
3204 return;
3205
3206 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3207 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3208 MachineBasicBlock::iterator MBBI = Entry->begin();
3209 for (const MCPhysReg *I = IStart; *I; ++I) {
3210 const TargetRegisterClass *RC = nullptr;
3211 if (AMDGPU::SReg_64RegClass.contains(*I))
3212 RC = &AMDGPU::SGPR_64RegClass;
3213 else if (AMDGPU::SReg_32RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_32RegClass;
3215 else
3216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3217
3218 Register NewVR = MRI->createVirtualRegister(RC);
3219 // Create copy from CSR to a virtual register.
3220 Entry->addLiveIn(*I);
3221 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3222 .addReg(*I);
3223
3224 // Insert the copy-back instructions right before the terminator.
3225 for (auto *Exit : Exits)
3226 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3227 TII->get(TargetOpcode::COPY), *I)
3228 .addReg(NewVR);
3229 }
3230}
3231
3233 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3234 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3235 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3237
3239 const Function &Fn = MF.getFunction();
3242 bool IsError = false;
3243
3244 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3246 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3247 IsError = true;
3248 }
3249
3252 BitVector Skipped(Ins.size());
3253 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3254 *DAG.getContext());
3255
3256 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3257 bool IsKernel = AMDGPU::isKernel(CallConv);
3258 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3259
3260 if (IsGraphics) {
3261 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3262 assert(!UserSGPRInfo.hasDispatchPtr() &&
3263 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3264 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3265 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3266 (void)UserSGPRInfo;
3267 if (!Subtarget->enableFlatScratch())
3268 assert(!UserSGPRInfo.hasFlatScratchInit());
3269 if ((CallConv != CallingConv::AMDGPU_CS &&
3270 CallConv != CallingConv::AMDGPU_Gfx &&
3271 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3272 !Subtarget->hasArchitectedSGPRs())
3273 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3274 !Info->hasWorkGroupIDZ());
3275 }
3276
3277 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3278
3279 if (CallConv == CallingConv::AMDGPU_PS) {
3280 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3281
3282 // At least one interpolation mode must be enabled or else the GPU will
3283 // hang.
3284 //
3285 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3286 // set PSInputAddr, the user wants to enable some bits after the compilation
3287 // based on run-time states. Since we can't know what the final PSInputEna
3288 // will look like, we shouldn't do anything here and the user should take
3289 // responsibility for the correct programming.
3290 //
3291 // Otherwise, the following restrictions apply:
3292 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3293 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3294 // enabled too.
3295 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3296 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3297 CCInfo.AllocateReg(AMDGPU::VGPR0);
3298 CCInfo.AllocateReg(AMDGPU::VGPR1);
3299 Info->markPSInputAllocated(0);
3300 Info->markPSInputEnabled(0);
3301 }
3302 if (Subtarget->isAmdPalOS()) {
3303 // For isAmdPalOS, the user does not enable some bits after compilation
3304 // based on run-time states; the register values being generated here are
3305 // the final ones set in hardware. Therefore we need to apply the
3306 // workaround to PSInputAddr and PSInputEnable together. (The case where
3307 // a bit is set in PSInputAddr but not PSInputEnable is where the
3308 // frontend set up an input arg for a particular interpolation mode, but
3309 // nothing uses that input arg. Really we should have an earlier pass
3310 // that removes such an arg.)
3311 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3312 if ((PsInputBits & 0x7F) == 0 ||
3313 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3314 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3315 }
3316 } else if (IsKernel) {
3317 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3318 } else {
3319 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3320 Ins.end());
3321 }
3322
3323 if (IsKernel)
3324 analyzeFormalArgumentsCompute(CCInfo, Ins);
3325
3326 if (IsEntryFunc) {
3327 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3328 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3329 if (IsKernel && Subtarget->hasKernargPreload())
3330 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3331
3332 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3333 } else if (!IsGraphics) {
3334 // For the fixed ABI, pass workitem IDs in the last argument register.
3335 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3336
3337 // FIXME: Sink this into allocateSpecialInputSGPRs
3338 if (!Subtarget->enableFlatScratch())
3339 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3340
3341 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3342 }
3343
3344 if (!IsKernel) {
3345 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3346 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3347
3348 // This assumes the registers are allocated by CCInfo in ascending order
3349 // with no gaps.
3350 Info->setNumWaveDispatchSGPRs(
3351 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3352 Info->setNumWaveDispatchVGPRs(
3353 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3354 } else if (Info->getNumKernargPreloadedSGPRs()) {
3355 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3356 }
3357
3359
3360 if (IsWholeWaveFunc) {
3362 {MVT::i1, MVT::Other}, Chain);
3363 InVals.push_back(Setup.getValue(0));
3364 Chains.push_back(Setup.getValue(1));
3365 }
3366
3367 // FIXME: This is the minimum kernel argument alignment. We should improve
3368 // this to the maximum alignment of the arguments.
3369 //
3370 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3371 // kern arg offset.
3372 const Align KernelArgBaseAlign = Align(16);
3373
3374 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3375 ++i) {
3376 const ISD::InputArg &Arg = Ins[i];
3377 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3378 InVals.push_back(DAG.getPOISON(Arg.VT));
3379 continue;
3380 }
3381
3382 CCValAssign &VA = ArgLocs[ArgIdx++];
3383 MVT VT = VA.getLocVT();
3384
3385 if (IsEntryFunc && VA.isMemLoc()) {
3386 VT = Ins[i].VT;
3387 EVT MemVT = VA.getLocVT();
3388
3389 const uint64_t Offset = VA.getLocMemOffset();
3390 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3391
3392 if (Arg.Flags.isByRef()) {
3393 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3394
3395 const GCNTargetMachine &TM =
3396 static_cast<const GCNTargetMachine &>(getTargetMachine());
3397 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3398 Arg.Flags.getPointerAddrSpace())) {
3401 }
3402
3403 InVals.push_back(Ptr);
3404 continue;
3405 }
3406
3407 SDValue NewArg;
3408 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3409 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3410 // In this case the argument is packed into the previous preload SGPR.
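// For example, an i16 argument at kernarg offset 10 is covered by the dword
// preloaded for offset 8: AlignDownOffset = 8 and OffsetDiff = 2, so the value
// is recovered by shifting the preloaded i32 right by 16 bits and truncating.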
3411 int64_t AlignDownOffset = alignDown(Offset, 4);
3412 int64_t OffsetDiff = Offset - AlignDownOffset;
3413 EVT IntVT = MemVT.changeTypeToInteger();
3414
3415 const SIMachineFunctionInfo *Info =
3418 Register Reg =
3419 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3420
3421 assert(Reg);
3422 Register VReg = MRI.getLiveInVirtReg(Reg);
3423 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3424
3425 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3426 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3427
3428 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3429 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3430 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3431 Ins[i].Flags.isSExt(), &Ins[i]);
3432
3433 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3434 } else {
3435 const SIMachineFunctionInfo *Info =
3438 const SmallVectorImpl<MCRegister> &PreloadRegs =
3439 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3440
3441 SDValue Copy;
3442 if (PreloadRegs.size() == 1) {
3443 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3444 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3445 NewArg = DAG.getCopyFromReg(
3446 Chain, DL, VReg,
3448 TRI->getRegSizeInBits(*RC)));
3449
3450 } else {
3451 // If the kernarg alignment does not match the alignment of the SGPR
3452 // tuple RC that can accommodate this argument, it will be built up
3453 // via copies from the individual SGPRs that the argument was
3454 // preloaded to.
3456 for (auto Reg : PreloadRegs) {
3457 Register VReg = MRI.getLiveInVirtReg(Reg);
3458 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3459 Elts.push_back(Copy);
3460 }
3461 NewArg =
3462 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3463 PreloadRegs.size()),
3464 DL, Elts);
3465 }
3466
3467 // If the argument was preloaded to multiple consecutive 32-bit
3468 // registers because of misalignment between addressable SGPR tuples
3469 // and the argument size, we can still assume, because of kernarg
3470 // segment alignment restrictions, that NewArg's size is the same as
3471 // MemVT and just do a bitcast. If MemVT is less than 32 bits, we add a
3472 // truncate since we cannot preload to less than a single SGPR and the
3473 // MemVT may be smaller.
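// For instance, an argument preloaded into three SGPRs is rebuilt here as a
// v3i32 build_vector of the three copies and then bitcast to MemVT; the
// truncate is only needed when MemVT is narrower than the built vector.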
3474 EVT MemVTInt =
3476 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3477 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3478
3479 NewArg = DAG.getBitcast(MemVT, NewArg);
3480 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3481 Ins[i].Flags.isSExt(), &Ins[i]);
3482 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3483 }
3484 } else {
3485 // Hidden arguments that are in the kernel signature must be preloaded
3486 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3487 // the argument list and is not preloaded.
3488 if (Arg.isOrigArg()) {
3489 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3490 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3492 *OrigArg->getParent(),
3493 "hidden argument in kernel signature was not preloaded",
3494 DL.getDebugLoc()));
3495 }
3496 }
3497
3498 NewArg =
3499 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3500 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3501 }
3502 Chains.push_back(NewArg.getValue(1));
3503
3504 auto *ParamTy =
3505 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3506 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3507 ParamTy &&
3508 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3509 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3510 // On SI, local pointers are just offsets into LDS, so they always
3511 // fit in 16 bits. On CI and newer they could potentially be
3512 // real pointers, so we can't guarantee their size.
3513 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3514 DAG.getValueType(MVT::i16));
3515 }
3516
3517 InVals.push_back(NewArg);
3518 continue;
3519 }
3520 if (!IsEntryFunc && VA.isMemLoc()) {
3521 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3522 InVals.push_back(Val);
3523 if (!Arg.Flags.isByVal())
3524 Chains.push_back(Val.getValue(1));
3525 continue;
3526 }
3527
3528 assert(VA.isRegLoc() && "Parameter must be in a register!");
3529
3530 Register Reg = VA.getLocReg();
3531 const TargetRegisterClass *RC = nullptr;
3532 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3533 RC = &AMDGPU::VGPR_32RegClass;
3534 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::SGPR_32RegClass;
3536 else
3537 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3538
3539 Reg = MF.addLiveIn(Reg, RC);
3540 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3541
3542 if (Arg.Flags.isSRet()) {
3543 // The return object should be reasonably addressable.
3544
3545 // FIXME: This helps when the return is a real sret. If it is an
3546 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3547 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3548 unsigned NumBits =
3550 Val = DAG.getNode(
3551 ISD::AssertZext, DL, VT, Val,
3552 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3553 }
3554
3555 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3556 InVals.push_back(Val);
3557 }
3558
3559 // Start adding system SGPRs.
3560 if (IsEntryFunc)
3561 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3562
3563 // DAG.getPass() returns nullptr when using new pass manager.
3564 // TODO: Use DAG.getMFAM() to access analysis result.
3565 if (DAG.getPass()) {
3566 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3567 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3568 }
3569
3570 unsigned StackArgSize = CCInfo.getStackSize();
3571 Info->setBytesInStackArgArea(StackArgSize);
3572
3573 return Chains.empty() ? Chain
3574 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3575}
3576
3577// TODO: If return values can't fit in registers, we should return as many as
3578// possible in registers before passing on stack.
3580 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3581 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3582 const Type *RetTy) const {
3583 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3584 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3585 // for shaders. Vector types should be explicitly handled by CC.
3586 if (AMDGPU::isEntryFunctionCC(CallConv))
3587 return true;
3588
3590 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3591 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3592 return false;
3593
3594 // We must use the stack if return would require unavailable registers.
3595 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3596 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3597 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3598 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3599 return false;
3600
3601 return true;
3602}
3603
3604SDValue
3606 bool isVarArg,
3608 const SmallVectorImpl<SDValue> &OutVals,
3609 const SDLoc &DL, SelectionDAG &DAG) const {
3613
3614 if (AMDGPU::isKernel(CallConv)) {
3615 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3616 OutVals, DL, DAG);
3617 }
3618
3619 bool IsShader = AMDGPU::isShader(CallConv);
3620
3621 Info->setIfReturnsVoid(Outs.empty());
3622 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3623
3624 // CCValAssign - represent the assignment of the return value to a location.
3626
3627 // CCState - Info about the registers and stack slots.
3628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3629 *DAG.getContext());
3630
3631 // Analyze outgoing return values.
3632 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3633
3634 SDValue Glue;
3636 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3637
3638 SDValue ReadFirstLane =
3639 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3640 // Copy the result values into the output registers.
3641 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3642 ++I, ++RealRVLocIdx) {
3643 CCValAssign &VA = RVLocs[I];
3644 assert(VA.isRegLoc() && "Can only return in registers!");
3645 // TODO: Partially return in registers if return values don't fit.
3646 SDValue Arg = OutVals[RealRVLocIdx];
3647
3648 // Copied from other backends.
3649 switch (VA.getLocInfo()) {
3650 case CCValAssign::Full:
3651 break;
3652 case CCValAssign::BCvt:
3653 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3654 break;
3655 case CCValAssign::SExt:
3656 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3657 break;
3658 case CCValAssign::ZExt:
3659 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3660 break;
3661 case CCValAssign::AExt:
3662 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3663 break;
3664 default:
3665 llvm_unreachable("Unknown loc info!");
3666 }
3667 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3669 ReadFirstLane, Arg);
3670 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3671 Glue = Chain.getValue(1);
3672 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3673 }
3674
3675 // FIXME: Does sret work properly?
3676 if (!Info->isEntryFunction()) {
3677 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3678 const MCPhysReg *I =
3679 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3680 if (I) {
3681 for (; *I; ++I) {
3682 if (AMDGPU::SReg_64RegClass.contains(*I))
3683 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3684 else if (AMDGPU::SReg_32RegClass.contains(*I))
3685 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3686 else
3687 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3688 }
3689 }
3690 }
3691
3692 // Update chain and glue.
3693 RetOps[0] = Chain;
3694 if (Glue.getNode())
3695 RetOps.push_back(Glue);
3696
3697 unsigned Opc = AMDGPUISD::ENDPGM;
3698 if (!IsWaveEnd)
3699 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3700 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3702 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3703}
3704
3706 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3707 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3708 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3709 SDValue ThisVal) const {
3710 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3711
3712 // Assign locations to each value returned by this call.
3714 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3715 *DAG.getContext());
3716 CCInfo.AnalyzeCallResult(Ins, RetCC);
3717
3718 // Copy all of the result registers out of their specified physreg.
3719 for (CCValAssign VA : RVLocs) {
3720 SDValue Val;
3721
3722 if (VA.isRegLoc()) {
3723 Val =
3724 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3725 Chain = Val.getValue(1);
3726 InGlue = Val.getValue(2);
3727 } else if (VA.isMemLoc()) {
3728 report_fatal_error("TODO: return values in memory");
3729 } else
3730 llvm_unreachable("unknown argument location type");
3731
3732 switch (VA.getLocInfo()) {
3733 case CCValAssign::Full:
3734 break;
3735 case CCValAssign::BCvt:
3736 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3737 break;
3738 case CCValAssign::ZExt:
3739 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3740 DAG.getValueType(VA.getValVT()));
3741 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3742 break;
3743 case CCValAssign::SExt:
3744 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3745 DAG.getValueType(VA.getValVT()));
3746 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3747 break;
3748 case CCValAssign::AExt:
3749 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3750 break;
3751 default:
3752 llvm_unreachable("Unknown loc info!");
3753 }
3754
3755 InVals.push_back(Val);
3756 }
3757
3758 return Chain;
3759}
3760
3761 // Add code to pass the special inputs required by the features in use,
3762 // separate from the explicit user arguments present in the IR.
3764 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3765 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3766 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3767 // If we don't have a call site, this was a call inserted by
3768 // legalization. These can never use special inputs.
3769 if (!CLI.CB)
3770 return;
3771
3772 SelectionDAG &DAG = CLI.DAG;
3773 const SDLoc &DL = CLI.DL;
3774 const Function &F = DAG.getMachineFunction().getFunction();
3775
3776 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3777 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3778
3779 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3781 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3782 // DAG.getPass() returns nullptr when using new pass manager.
3783 // TODO: Use DAG.getMFAM() to access analysis result.
3784 if (DAG.getPass()) {
3785 auto &ArgUsageInfo =
3787 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3788 }
3789 }
3790
3791 // TODO: Unify with private memory register handling. This is complicated by
3792 // the fact that at least in kernels, the input argument is not necessarily
3793 // in the same location as the input.
3794 // clang-format off
3795 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3796 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3797 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3798 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3799 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3800 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3803 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3804 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3805 };
3806 // clang-format on
3807
3808 for (auto [InputID, Attrs] : ImplicitAttrs) {
3809 // If the callee does not use the attribute value, skip copying the value.
3810 if (all_of(Attrs, [&](StringRef Attr) {
3811 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3812 }))
3813 continue;
3814
3815 const auto [OutgoingArg, ArgRC, ArgTy] =
3816 CalleeArgInfo->getPreloadedValue(InputID);
3817 if (!OutgoingArg)
3818 continue;
3819
3820 const auto [IncomingArg, IncomingArgRC, Ty] =
3821 CallerArgInfo.getPreloadedValue(InputID);
3822 assert(IncomingArgRC == ArgRC);
3823
3824 // All special arguments are ints for now.
3825 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3826 SDValue InputReg;
3827
3828 if (IncomingArg) {
3829 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3830 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3831 // The implicit arg ptr is special because it doesn't have a corresponding
3832 // input for kernels, and is computed from the kernarg segment pointer.
3833 InputReg = getImplicitArgPtr(DAG, DL);
3834 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3835 std::optional<uint32_t> Id =
3837 if (Id.has_value()) {
3838 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3839 } else {
3840 InputReg = DAG.getPOISON(ArgVT);
3841 }
3842 } else {
3843 // We may have proven the input wasn't needed, although the ABI
3844 // requires it. We just need to allocate the register appropriately.
3845 InputReg = DAG.getPOISON(ArgVT);
3846 }
3847
3848 if (OutgoingArg->isRegister()) {
3849 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3850 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3851 report_fatal_error("failed to allocate implicit input argument");
3852 } else {
3853 unsigned SpecialArgOffset =
3854 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3855 SDValue ArgStore =
3856 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3857 MemOpChains.push_back(ArgStore);
3858 }
3859 }
3860
3861 // Pack workitem IDs into a single register, or pass them as-is if already
3862 // packed.
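// When packing is needed, the convention below is X in bits [9:0], Y in bits
// [19:10] and Z in bits [29:20] of a single i32, matching the shifts by 10 and
// 20 used for the Y and Z components.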
3863
3864 auto [OutgoingArg, ArgRC, Ty] =
3866 if (!OutgoingArg)
3867 std::tie(OutgoingArg, ArgRC, Ty) =
3869 if (!OutgoingArg)
3870 std::tie(OutgoingArg, ArgRC, Ty) =
3872 if (!OutgoingArg)
3873 return;
3874
3875 const ArgDescriptor *IncomingArgX = std::get<0>(
3877 const ArgDescriptor *IncomingArgY = std::get<0>(
3879 const ArgDescriptor *IncomingArgZ = std::get<0>(
3881
3882 SDValue InputReg;
3883 SDLoc SL;
3884
3885 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3886 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3887 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3888
3889 // If incoming ids are not packed we need to pack them.
3890 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3891 NeedWorkItemIDX) {
3892 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3893 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3894 } else {
3895 InputReg = DAG.getConstant(0, DL, MVT::i32);
3896 }
3897 }
3898
3899 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3900 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3901 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3902 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3903 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3904 InputReg = InputReg.getNode()
3905 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3906 : Y;
3907 }
3908
3909 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3910 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3911 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3912 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3913 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3914 InputReg = InputReg.getNode()
3915 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3916 : Z;
3917 }
3918
3919 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3920 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3921 // We're in a situation where the outgoing function requires the workitem
3922 // ID, but the calling function does not have it (e.g. a graphics function
3923 // calling a C calling convention function). This is illegal, but we need
3924 // to produce something.
3925 InputReg = DAG.getPOISON(MVT::i32);
3926 } else {
3927 // Workitem IDs are already packed; any present incoming argument
3928 // will carry all required fields.
3929 ArgDescriptor IncomingArg =
3930 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3931 : IncomingArgY ? *IncomingArgY
3932 : *IncomingArgZ,
3933 ~0u);
3934 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3935 }
3936 }
3937
3938 if (OutgoingArg->isRegister()) {
3939 if (InputReg)
3940 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3941
3942 CCInfo.AllocateReg(OutgoingArg->getRegister());
3943 } else {
3944 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3945 if (InputReg) {
3946 SDValue ArgStore =
3947 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3948 MemOpChains.push_back(ArgStore);
3949 }
3950 }
3951}
3952
3954 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3956 const SmallVectorImpl<SDValue> &OutVals,
3957 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3958 if (AMDGPU::isChainCC(CalleeCC))
3959 return true;
3960
3961 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3962 return false;
3963
3964 // For a divergent call target, we need to do a waterfall loop over the
3966 // possible callees, which precludes us from using a simple jump.
3966 if (Callee->isDivergent())
3967 return false;
3968
3970 const Function &CallerF = MF.getFunction();
3971 CallingConv::ID CallerCC = CallerF.getCallingConv();
3973 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3974
3975 // Kernels aren't callable, and don't have a live-in return address, so it
3976 // doesn't make sense to do a tail call with entry functions.
3977 if (!CallerPreserved)
3978 return false;
3979
3980 bool CCMatch = CallerCC == CalleeCC;
3981
3983 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3984 return true;
3985 return false;
3986 }
3987
3988 // TODO: Can we handle var args?
3989 if (IsVarArg)
3990 return false;
3991
3992 for (const Argument &Arg : CallerF.args()) {
3993 if (Arg.hasByValAttr())
3994 return false;
3995 }
3996
3997 LLVMContext &Ctx = *DAG.getContext();
3998
3999 // Check that the call results are passed in the same way.
4000 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4001 CCAssignFnForCall(CalleeCC, IsVarArg),
4002 CCAssignFnForCall(CallerCC, IsVarArg)))
4003 return false;
4004
4005 // The callee has to preserve all registers the caller needs to preserve.
4006 if (!CCMatch) {
4007 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4008 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4009 return false;
4010 }
4011
4012 // Nothing more to check if the callee is taking no arguments.
4013 if (Outs.empty())
4014 return true;
4015
4017 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4018
4019 // FIXME: We are not allocating special input registers, so we will be
4020 // deciding based on incorrect register assignments.
4021 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4022
4023 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4024 // If the stack arguments for this call do not fit into our own save area,
4025 // then the call cannot be lowered as a tail call.
4026 // TODO: Is this really necessary?
4027 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4028 return false;
4029
4030 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4031 // FIXME: What about inreg arguments that end up passed in memory?
4032 if (!CCVA.isRegLoc())
4033 continue;
4034
4035 // If we are passing an argument in an SGPR, and the value is divergent,
4036 // this call requires a waterfall loop.
4037 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4038 LLVM_DEBUG(
4039 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4040 << printReg(CCVA.getLocReg(), TRI) << '\n');
4041 return false;
4042 }
4043 }
4044
4045 const MachineRegisterInfo &MRI = MF.getRegInfo();
4046 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4047}
4048
4050 if (!CI->isTailCall())
4051 return false;
4052
4053 const Function *ParentFn = CI->getParent()->getParent();
4055 return false;
4056 return true;
4057}
4058
4059namespace {
4060// Chain calls have special arguments that we need to handle. These are
4061// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4062// arguments (index 0 and 1 respectively).
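// With flags == 0 only Exec and Flags are present; if bit 0 of the flags is
// set (dynamic VGPR mode), NumVGPRs, FallbackExec and FallbackCallee follow,
// as checked in LowerCall below.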
4063enum ChainCallArgIdx {
4064 Exec = 2,
4065 Flags,
4066 NumVGPRs,
4067 FallbackExec,
4068 FallbackCallee
4069};
4070} // anonymous namespace
4071
4072// The wave scratch offset register is used as the global base pointer.
4074 SmallVectorImpl<SDValue> &InVals) const {
4075 CallingConv::ID CallConv = CLI.CallConv;
4076 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4077
4078 SelectionDAG &DAG = CLI.DAG;
4079
4080 const SDLoc &DL = CLI.DL;
4081 SDValue Chain = CLI.Chain;
4082 SDValue Callee = CLI.Callee;
4083
4084 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4085 bool UsesDynamicVGPRs = false;
4086 if (IsChainCallConv) {
4087 // The last arguments should be the value that we need to put in EXEC,
4088 // followed by the flags and any other arguments with special meanings.
4089 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4090 // we don't treat them like the "real" arguments.
4091 auto RequestedExecIt =
4092 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4093 return Arg.OrigArgIndex == 2;
4094 });
4095 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4096
4097 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4098 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4099 CLI.OutVals.end());
4100 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4101
4102 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4103 "Haven't popped all the special args");
4104
4105 TargetLowering::ArgListEntry RequestedExecArg =
4106 CLI.Args[ChainCallArgIdx::Exec];
4107 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4108 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4109
4110 // Convert constants into TargetConstants, so they become immediate operands
4111 // instead of being selected into S_MOV.
4112 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4113 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4114 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4115 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4116 } else
4117 ChainCallSpecialArgs.push_back(Arg.Node);
4118 };
4119
4120 PushNodeOrTargetConstant(RequestedExecArg);
4121
4122 // Process any other special arguments depending on the value of the flags.
4123 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4124
4125 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4126 if (FlagsValue.isZero()) {
4127 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4128 return lowerUnhandledCall(CLI, InVals,
4129 "no additional args allowed if flags == 0");
4130 } else if (FlagsValue.isOneBitSet(0)) {
4131 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4132 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4133 }
4134
4135 if (!Subtarget->isWave32()) {
4136 return lowerUnhandledCall(
4137 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4138 }
4139
4140 UsesDynamicVGPRs = true;
4141 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4142 CLI.Args.end(), PushNodeOrTargetConstant);
4143 }
4144 }
4145
4147 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4149 bool &IsTailCall = CLI.IsTailCall;
4150 bool IsVarArg = CLI.IsVarArg;
4151 bool IsSibCall = false;
4153
4154 if (Callee.isUndef() || isNullConstant(Callee)) {
4155 if (!CLI.IsTailCall) {
4156 for (ISD::InputArg &Arg : CLI.Ins)
4157 InVals.push_back(DAG.getPOISON(Arg.VT));
4158 }
4159
4160 return Chain;
4161 }
4162
4163 if (IsVarArg) {
4164 return lowerUnhandledCall(CLI, InVals,
4165 "unsupported call to variadic function ");
4166 }
4167
4168 if (!CLI.CB)
4169 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4170
4171 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4172 return lowerUnhandledCall(CLI, InVals,
4173 "unsupported required tail call to function ");
4174 }
4175
4176 if (IsTailCall) {
4177 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4178 Outs, OutVals, Ins, DAG);
4179 if (!IsTailCall &&
4180 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4181 report_fatal_error("failed to perform tail call elimination on a call "
4182 "site marked musttail or on llvm.amdgcn.cs.chain");
4183 }
4184
4185 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4186
4187 // A sibling call is one where we're under the usual C ABI and not planning
4188 // to change that but can still do a tail call:
4189 if (!TailCallOpt && IsTailCall)
4190 IsSibCall = true;
4191
4192 if (IsTailCall)
4193 ++NumTailCalls;
4194 }
4195
4198 SmallVector<SDValue, 8> MemOpChains;
4199
4200 // Analyze operands of the call, assigning locations to each operand.
4202 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4203 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4204
4205 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4207 // With a fixed ABI, allocate fixed registers before user arguments.
4208 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4209 }
4210
4211 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4212
4213 // Get a count of how many bytes are to be pushed on the stack.
4214 unsigned NumBytes = CCInfo.getStackSize();
4215
4216 if (IsSibCall) {
4217 // Since we're not changing the ABI to make this a tail call, the memory
4218 // operands are already available in the caller's incoming argument space.
4219 NumBytes = 0;
4220 }
4221
4222 // FPDiff is the byte offset of the call's argument area from the callee's.
4223 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4224 // by this amount for a tail call. In a sibling call it must be 0 because the
4225 // caller will deallocate the entire stack and the callee still expects its
4226 // arguments to begin at SP+0. Completely unused for non-tail calls.
4227 int32_t FPDiff = 0;
4228 MachineFrameInfo &MFI = MF.getFrameInfo();
4229 auto *TRI = Subtarget->getRegisterInfo();
4230
4231 // Adjust the stack pointer for the new arguments...
4232 // These operations are automatically eliminated by the prolog/epilog pass
4233 if (!IsSibCall)
4234 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4235
4236 if (!IsSibCall || IsChainCallConv) {
4237 if (!Subtarget->enableFlatScratch()) {
4238 SmallVector<SDValue, 4> CopyFromChains;
4239
4240 // In the HSA case, this should be an identity copy.
4241 SDValue ScratchRSrcReg =
4242 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4243 RegsToPass.emplace_back(IsChainCallConv
4244 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4245 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4246 ScratchRSrcReg);
4247 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4248 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4249 }
4250 }
4251
4252 const unsigned NumSpecialInputs = RegsToPass.size();
4253
4254 MVT PtrVT = MVT::i32;
4255
4256 // Walk the register/memloc assignments, inserting copies/loads.
4257 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4258 CCValAssign &VA = ArgLocs[i];
4259 SDValue Arg = OutVals[i];
4260
4261 // Promote the value if needed.
4262 switch (VA.getLocInfo()) {
4263 case CCValAssign::Full:
4264 break;
4265 case CCValAssign::BCvt:
4266 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4267 break;
4268 case CCValAssign::ZExt:
4269 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4270 break;
4271 case CCValAssign::SExt:
4272 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4273 break;
4274 case CCValAssign::AExt:
4275 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4276 break;
4277 case CCValAssign::FPExt:
4278 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4279 break;
4280 default:
4281 llvm_unreachable("Unknown loc info!");
4282 }
4283
4284 if (VA.isRegLoc()) {
4285 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4286 } else {
4287 assert(VA.isMemLoc());
4288
4289 SDValue DstAddr;
4290 MachinePointerInfo DstInfo;
4291
4292 unsigned LocMemOffset = VA.getLocMemOffset();
4293 int32_t Offset = LocMemOffset;
4294
4295 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4296 MaybeAlign Alignment;
4297
4298 if (IsTailCall) {
4299 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4300 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4301 : VA.getValVT().getStoreSize();
4302
4303 // FIXME: We can have better than the minimum byval required alignment.
4304 Alignment =
4305 Flags.isByVal()
4306 ? Flags.getNonZeroByValAlign()
4307 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4308
4309 Offset = Offset + FPDiff;
4310 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4311
4312 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4313 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4314
4315 // Make sure any stack arguments overlapping with where we're storing
4316 // are loaded before this eventual operation. Otherwise they'll be
4317 // clobbered.
4318
4319 // FIXME: Why is this really necessary? This seems to just result in a
4320 // lot of code to copy the stack and write them back to the same
4321 // locations, which are supposed to be immutable?
4322 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4323 } else {
4324 // Stores to the argument stack area are relative to the stack pointer.
4325 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4326 MVT::i32);
4327 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4328 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4329 Alignment =
4330 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4331 }
4332
4333 if (Outs[i].Flags.isByVal()) {
4334 SDValue SizeNode =
4335 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4336 SDValue Cpy =
4337 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4338 Outs[i].Flags.getNonZeroByValAlign(),
4339 /*isVol = */ false, /*AlwaysInline = */ true,
4340 /*CI=*/nullptr, std::nullopt, DstInfo,
4342
4343 MemOpChains.push_back(Cpy);
4344 } else {
4345 SDValue Store =
4346 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4347 MemOpChains.push_back(Store);
4348 }
4349 }
4350 }
4351
4352 if (!MemOpChains.empty())
4353 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4354
4355 SDValue ReadFirstLaneID =
4356 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4357
4358 SDValue TokenGlue;
4359 if (CLI.ConvergenceControlToken) {
4360 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4362 }
4363
4364 // Build a sequence of copy-to-reg nodes chained together with token chain
4365 // and flag operands which copy the outgoing args into the appropriate regs.
4366 SDValue InGlue;
4367
4368 unsigned ArgIdx = 0;
4369 for (auto [Reg, Val] : RegsToPass) {
4370 if (ArgIdx++ >= NumSpecialInputs &&
4371 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4372 // For chain calls, the inreg arguments are required to be
4373 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4374 // they are uniform.
4375 //
4376 // For other calls, if an inreg argument is known to be uniform,
4377 // speculatively insert a readfirstlane in case it is in a VGPR.
4378 //
4379 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4380 // value, so let that continue to produce invalid code.
4381
4382 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4383 if (TokenGlue)
4384 ReadfirstlaneArgs.push_back(TokenGlue);
4386 ReadfirstlaneArgs);
4387 }
4388
4389 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4390 InGlue = Chain.getValue(1);
4391 }
4392
4393 // We don't usually want to end the call-sequence here because we would tidy
4394 // the frame up *after* the call; however, in the ABI-changing tail-call case
4395 // we've carefully laid out the parameters so that when sp is reset they'll be
4396 // in the correct location.
4397 if (IsTailCall && !IsSibCall) {
4398 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4399 InGlue = Chain.getValue(1);
4400 }
4401
4402 std::vector<SDValue> Ops({Chain});
4403
4404 // Add a redundant copy of the callee global which will not be legalized, as
4405 // we need direct access to the callee later.
4407 const GlobalValue *GV = GSD->getGlobal();
4408 Ops.push_back(Callee);
4409 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4410 } else {
4411 if (IsTailCall) {
4412 // isEligibleForTailCallOptimization considered whether the call target is
4413 // divergent, but we may still end up with a uniform value in a VGPR.
4414 // Insert a readfirstlane just in case.
4415 SDValue ReadFirstLaneID =
4416 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4417
4418 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4419 if (TokenGlue)
4420 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4421 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4422 ReadfirstlaneArgs);
4423 }
4424
4425 Ops.push_back(Callee);
4426 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4427 }
4428
4429 if (IsTailCall) {
4430 // Each tail call may have to adjust the stack by a different amount, so
4431 // this information must travel along with the operation for eventual
4432 // consumption by emitEpilogue.
4433 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4434 }
4435
4436 if (IsChainCallConv)
4437 llvm::append_range(Ops, ChainCallSpecialArgs);
4438
4439 // Add argument registers to the end of the list so that they are known live
4440 // into the call.
4441 for (auto &[Reg, Val] : RegsToPass)
4442 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4443
4444 // Add a register mask operand representing the call-preserved registers.
4445 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4446 assert(Mask && "Missing call preserved mask for calling convention");
4447 Ops.push_back(DAG.getRegisterMask(Mask));
4448
4449 if (SDValue Token = CLI.ConvergenceControlToken) {
4451 GlueOps.push_back(Token);
4452 if (InGlue)
4453 GlueOps.push_back(InGlue);
4454
4455 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4456 MVT::Glue, GlueOps),
4457 0);
4458 }
4459
4460 if (InGlue)
4461 Ops.push_back(InGlue);
4462
4463 // If we're doing a tail call, use a TC_RETURN here rather than an
4464 // actual call instruction.
4465 if (IsTailCall) {
4466 MFI.setHasTailCall();
4467 unsigned OPC = AMDGPUISD::TC_RETURN;
4468 switch (CallConv) {
4471 break;
4474 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4476 break;
4477 }
4478
4479 // If the caller is a whole wave function, we need to use a special opcode
4480 // so we can patch up EXEC.
4481 if (Info->isWholeWaveFunction())
4483
4484 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4485 }
4486
4487 // Returns a chain and a flag for retval copy to use.
4488 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4489 Chain = Call.getValue(0);
4490 InGlue = Call.getValue(1);
4491
4492 uint64_t CalleePopBytes = NumBytes;
4493 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4494 if (!Ins.empty())
4495 InGlue = Chain.getValue(1);
4496
4497 // Handle result values, copying them out of physregs into vregs that we
4498 // return.
4499 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4500 InVals, /*IsThisReturn=*/false, SDValue());
4501}
4502
4503// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4504// except for:
4505 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4506 // 2. Size scaling, where scaled size = wave-reduction(alloca-size) * wave-size
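// For example, with a wave size of 64 a per-lane alloca of 16 bytes consumes
// 16 * 64 = 1024 bytes of scratch, which is why the size is shifted left by
// getWavefrontSizeLog2() below.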
4508 SelectionDAG &DAG) const {
4509 const MachineFunction &MF = DAG.getMachineFunction();
4511
4512 SDLoc dl(Op);
4513 EVT VT = Op.getValueType();
4514 SDValue Chain = Op.getOperand(0);
4515 Register SPReg = Info->getStackPtrOffsetReg();
4516
4517 // Chain the dynamic stack allocation so that it doesn't modify the stack
4518 // pointer when other instructions are using the stack.
4519 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4520
4521 SDValue Size = Op.getOperand(1);
4522 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4523 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4524
4525 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4527 "Stack grows upwards for AMDGPU");
4528
4529 Chain = BaseAddr.getValue(1);
4530 Align StackAlign = TFL->getStackAlign();
4531 if (Alignment > StackAlign) {
4532 uint64_t ScaledAlignment = Alignment.value()
4533 << Subtarget->getWavefrontSizeLog2();
4534 uint64_t StackAlignMask = ScaledAlignment - 1;
4535 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4536 DAG.getConstant(StackAlignMask, dl, VT));
4537 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4538 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4539 }
4540
4541 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4542 SDValue NewSP;
4544 // For a constant-sized alloca, scale the alloca size by the wave size.
4545 SDValue ScaledSize = DAG.getNode(
4546 ISD::SHL, dl, VT, Size,
4547 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4548 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4549 } else {
4550 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4551 // max of the (divergent) alloca size and then scale it by the wave size.
4552 SDValue WaveReduction =
4553 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4554 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4555 Size, DAG.getConstant(0, dl, MVT::i32));
4556 SDValue ScaledSize = DAG.getNode(
4557 ISD::SHL, dl, VT, Size,
4558 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4559 NewSP =
4560 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4561 SDValue ReadFirstLaneID =
4562 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4563 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4564 NewSP);
4565 }
4566
4567 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4568 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4569
4570 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4571}
4572
4574 if (Op.getValueType() != MVT::i32)
4575 return Op; // Defer to cannot select error.
4576
4578 SDLoc SL(Op);
4579
4580 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4581
4582 // Convert from wave uniform to swizzled vector address. This should protect
4583 // from any edge cases where the stacksave result isn't directly used with
4584 // stackrestore.
4585 SDValue VectorAddress =
4586 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4587 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4588}
4589
4591 SelectionDAG &DAG) const {
4592 SDLoc SL(Op);
4593 assert(Op.getValueType() == MVT::i32);
4594
4595 uint32_t BothRoundHwReg =
4597 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4598
4599 SDValue IntrinID =
4600 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4601 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4602 Op.getOperand(0), IntrinID, GetRoundBothImm);
4603
4604 // There are two rounding modes, one for f32 and one for f64/f16. We only
4605 // report in the standard value range if both are the same.
4606 //
4607 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4608 // ties away from zero is not supported, and the other values are rotated by
4609 // 1.
4610 //
4611 // If the two rounding modes are not the same, report a target defined value.
4612
4613 // Mode register rounding mode fields:
4614 //
4615 // [1:0] Single-precision round mode.
4616 // [3:2] Double/Half-precision round mode.
4617 //
4618 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4619 //
4620 //                 Hardware   Spec
4621 //  Toward-0           3        0
4622 //  Nearest Even       0        1
4623 //  +Inf               1        2
4624 //  -Inf               2        3
4625 //  NearestAway0      N/A       4
4626 //
4627 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4628 // table we can index by the raw hardware mode.
4629 //
4630 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
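// For example, when both fields select round-to-nearest-even the raw mode is
// 0, so the lookup returns the table's low nibble, which per the mapping above
// should be 1 (the standard FLT_ROUNDS value for round-to-nearest).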
4631
4632 SDValue BitTable =
4634
4635 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4636 SDValue RoundModeTimesNumBits =
4637 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4638
4639 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4640 // knew only one mode was demanded.
4641 SDValue TableValue =
4642 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4643 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4644
4645 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4646 SDValue TableEntry =
4647 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4648
4649 // There's a gap between the 4-bit encoded table values and the actual enum
4650 // values, so offset if it's an extended value.
4651 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4652 SDValue IsStandardValue =
4653 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4654 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4655 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4656 TableEntry, EnumOffset);
4657
4658 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4659}
4660
4662 SelectionDAG &DAG) const {
4663 SDLoc SL(Op);
4664
4665 SDValue NewMode = Op.getOperand(1);
4666 assert(NewMode.getValueType() == MVT::i32);
4667
4668 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4669 // hardware MODE.fp_round values.
4670 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4671 uint32_t ClampedVal = std::min(
4672 static_cast<uint32_t>(ConstMode->getZExtValue()),
4674 NewMode = DAG.getConstant(
4675 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4676 } else {
4677 // If we know the input can only be one of the supported standard modes in
4678 // the range 0-3, we can use a simplified mapping to hardware values.
4679 KnownBits KB = DAG.computeKnownBits(NewMode);
4680 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4681 // The supported standard values are 0-3. The extended values start at 8. We
4682 // need to offset by 4 if the value is in the extended range.
4683
4684 if (UseReducedTable) {
4685 // Only the low 16 bits of the table (the four standard entries) are needed.
4686 SDValue BitTable = DAG.getConstant(
4687 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4688
4689 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4690 SDValue RoundModeTimesNumBits =
4691 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4692
4693 NewMode =
4694 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4695
4696 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4697 // the table extracted bits into inline immediates.
4698 } else {
4699 // table_index = umin(value, value - 4)
4700 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
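// The umin relies on unsigned wrap: for the standard values 0-3, value - 4
// wraps to a huge number so the index stays 0-3, while the extended values
// (8 and up) map to indices 4 and up, skipping the unused gap.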
4701 SDValue BitTable =
4703
4704 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4705 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4706 SDValue IndexVal =
4707 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4708
4709 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4710 SDValue RoundModeTimesNumBits =
4711 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4712
4713 SDValue TableValue =
4714 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4715 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4716
4717 // No need to mask out the high bits since the setreg will ignore them
4718 // anyway.
4719 NewMode = TruncTable;
4720 }
4721
4722 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4723 // earlier and keep more operations scalar, but that interferes with
4724 // combining the source.
4725 SDValue ReadFirstLaneID =
4726 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4727 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4728 ReadFirstLaneID, NewMode);
4729 }
4730
4731 // N.B. The setreg will be later folded into s_round_mode on supported
4732 // targets.
4733 SDValue IntrinID =
4734 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4735 uint32_t BothRoundHwReg =
4737 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4738
4739 SDValue SetReg =
4740 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4741 IntrinID, RoundBothImm, NewMode);
4742
4743 return SetReg;
4744}
4745
4747 if (Op->isDivergent() &&
4748 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4749 // Cannot do I$ prefetch with divergent pointer.
4750 return SDValue();
4751
4752 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4756 break;
4758 if (Subtarget->hasSafeSmemPrefetch())
4759 break;
4760 [[fallthrough]];
4761 default:
4762 return SDValue();
4763 }
4764
4765 // I$ prefetch
4766 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4767 return SDValue();
4768
4769 return Op;
4770}
4771
4772 // Work around DAG legality rules that are based only on the result type.
4774 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4775 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4776 EVT SrcVT = Src.getValueType();
4777
4778 if (SrcVT.getScalarType() != MVT::bf16)
4779 return Op;
4780
4781 SDLoc SL(Op);
4782 SDValue BitCast =
4783 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4784
4785 EVT DstVT = Op.getValueType();
4786 if (IsStrict)
4787 llvm_unreachable("Need STRICT_BF16_TO_FP");
4788
4789 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4790}
4791
4793 SDLoc SL(Op);
4794 if (Op.getValueType() != MVT::i64)
4795 return Op;
4796
4797 uint32_t ModeHwReg =
4799 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4800 uint32_t TrapHwReg =
4802 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4803
4804 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4805 SDValue IntrinID =
4806 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4807 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4808 Op.getOperand(0), IntrinID, ModeHwRegImm);
4809 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4810 Op.getOperand(0), IntrinID, TrapHwRegImm);
4811 SDValue TokenReg =
4812 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4813 GetTrapReg.getValue(1));
4814
4815 SDValue CvtPtr =
4816 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4817 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4818
4819 return DAG.getMergeValues({Result, TokenReg}, SL);
4820}
4821
4823 SDLoc SL(Op);
4824 if (Op.getOperand(1).getValueType() != MVT::i64)
4825 return Op;
4826
4827 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4828 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4829 DAG.getConstant(0, SL, MVT::i32));
4830 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4831 DAG.getConstant(1, SL, MVT::i32));
4832
4833 SDValue ReadFirstLaneID =
4834 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4835 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4836 ReadFirstLaneID, NewModeReg);
4837 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4838 ReadFirstLaneID, NewTrapReg);
4839
4840 unsigned ModeHwReg =
4842 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4843 unsigned TrapHwReg =
4845 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4846
4847 SDValue IntrinID =
4848 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4849 SDValue SetModeReg =
4850 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4851 IntrinID, ModeHwRegImm, NewModeReg);
4852 SDValue SetTrapReg =
4853 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4854 IntrinID, TrapHwRegImm, NewTrapReg);
4855 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4856}
4857
4859 const MachineFunction &MF) const {
4860 const Function &Fn = MF.getFunction();
4861
4863 .Case("m0", AMDGPU::M0)
4864 .Case("exec", AMDGPU::EXEC)
4865 .Case("exec_lo", AMDGPU::EXEC_LO)
4866 .Case("exec_hi", AMDGPU::EXEC_HI)
4867 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4868 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4869 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4870 .Default(Register());
4871 if (!Reg)
4872 return Reg;
4873
4874 if (!Subtarget->hasFlatScrRegister() &&
4875 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4876 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4877 "\" for subtarget."));
4878 }
4879
4880 switch (Reg) {
4881 case AMDGPU::M0:
4882 case AMDGPU::EXEC_LO:
4883 case AMDGPU::EXEC_HI:
4884 case AMDGPU::FLAT_SCR_LO:
4885 case AMDGPU::FLAT_SCR_HI:
4886 if (VT.getSizeInBits() == 32)
4887 return Reg;
4888 break;
4889 case AMDGPU::EXEC:
4890 case AMDGPU::FLAT_SCR:
4891 if (VT.getSizeInBits() == 64)
4892 return Reg;
4893 break;
4894 default:
4895 llvm_unreachable("missing register type checking");
4896 }
4897
4899 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4900}
4901
4902// If kill is not the last instruction, split the block so kill is always a
4903// proper terminator.
4906 MachineBasicBlock *BB) const {
4907 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4909 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4910 return SplitBB;
4911}
4912
4913 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4914// \p MI will be the only instruction in the loop body block. Otherwise, it will
4915// be the first instruction in the remainder block.
4916//
4917/// \returns { LoopBody, Remainder }
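// The resulting CFG is: the original block falls through into LoopBody, which
// has itself and Remainder as successors (the caller inserts the actual
// backedge branch), and Remainder inherits the original block's successors.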
4918static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4920 MachineFunction *MF = MBB.getParent();
4922
4923 // To insert the loop we need to split the block. Move everything after this
4924 // point to a new block, and insert a new empty block between the two.
4926 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4928 ++MBBI;
4929
4930 MF->insert(MBBI, LoopBB);
4931 MF->insert(MBBI, RemainderBB);
4932
4933 LoopBB->addSuccessor(LoopBB);
4934 LoopBB->addSuccessor(RemainderBB);
4935
4936 // Move the rest of the block into a new block.
4937 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4938
4939 if (InstInLoop) {
4940 auto Next = std::next(I);
4941
4942 // Move instruction to loop body.
4943 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4944
4945 // Move the rest of the block.
4946 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4947 } else {
4948 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4949 }
4950
4951 MBB.addSuccessor(LoopBB);
4952
4953 return std::pair(LoopBB, RemainderBB);
4954}
4955
4956/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4958 MachineBasicBlock *MBB = MI.getParent();
4960 auto I = MI.getIterator();
4961 auto E = std::next(I);
4962
4963 // clang-format off
4964 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4965 .addImm(0);
4966 // clang-format on
4967
4968 MIBundleBuilder Bundler(*MBB, I, E);
4969 finalizeBundle(*MBB, Bundler.begin());
4970}
4971
4974 MachineBasicBlock *BB) const {
4975 const DebugLoc &DL = MI.getDebugLoc();
4976
4978
4980
4981 // Apparently kill flags are only valid if the def is in the same block?
4982 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4983 Src->setIsKill(false);
4984
4985 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4986
4987 MachineBasicBlock::iterator I = LoopBB->end();
4988
4989 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4991
4992 // Clear TRAP_STS.MEM_VIOL
4993 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4994 .addImm(0)
4995 .addImm(EncodedReg);
4996
4998
4999 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5000
5001 // Load and check TRAP_STS.MEM_VIOL
5002 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5003 .addImm(EncodedReg);
5004
5005 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5006 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5007 .addReg(Reg, RegState::Kill)
5008 .addImm(0);
5009 // clang-format off
5010 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5011 .addMBB(LoopBB);
5012 // clang-format on
5013
5014 return RemainderBB;
5015}
5016
5017// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5018// wavefront. If the value is uniform and just happens to be in a VGPR, this
5019// will only do one iteration. In the worst case, this will loop 64 times.
5020//
5021// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
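//
// As a rough sketch (assuming wave64 and M0 mode; register names are
// placeholders), the loop emitted below looks like:
//
// loop:
//   v_readfirstlane_b32 s_idx, v_idx          ; pick the index of one lane
//   v_cmp_eq_u32        vcc, s_idx, v_idx     ; all lanes with that same index
//   s_and_saveexec_b64  s_old, vcc            ; s_old = exec, exec &= vcc
//   s_mov_b32           m0, s_idx             ; (or s_add_i32 / GPR-index mode)
//   ...                                       ; caller inserts the indexed op here
//   s_xor_b64           exec, exec, s_old     ; drop the lanes just handled
//   s_cbranch_execnz    loop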
5024 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5025 const DebugLoc &DL, const MachineOperand &Idx,
5026 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5027 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5028 Register &SGPRIdxReg) {
5029
5030 MachineFunction *MF = OrigBB.getParent();
5031 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5035
5036 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5037 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5038 Register NewExec = MRI.createVirtualRegister(BoolRC);
5039 Register CurrentIdxReg =
5040 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5041 Register CondReg = MRI.createVirtualRegister(BoolRC);
5042
5043 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5044 .addReg(InitReg)
5045 .addMBB(&OrigBB)
5046 .addReg(ResultReg)
5047 .addMBB(&LoopBB);
5048
5049 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5050 .addReg(InitSaveExecReg)
5051 .addMBB(&OrigBB)
5052 .addReg(NewExec)
5053 .addMBB(&LoopBB);
5054
5055 // Read the next variant <- also loop target.
5056 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5057 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5058
5059 // Compare the just read M0 value to all possible Idx values.
5060 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5061 .addReg(CurrentIdxReg)
5062 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5063
5064 // Update EXEC, save the original EXEC value to VCC.
5065 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5066 .addReg(CondReg, RegState::Kill);
5067
5068 MRI.setSimpleHint(NewExec, CondReg);
5069
5070 if (UseGPRIdxMode) {
5071 if (Offset == 0) {
5072 SGPRIdxReg = CurrentIdxReg;
5073 } else {
5074 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5075 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5076 .addReg(CurrentIdxReg, RegState::Kill)
5077 .addImm(Offset);
5078 }
5079 } else {
5080 // Move index from VCC into M0
5081 if (Offset == 0) {
5082 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5083 .addReg(CurrentIdxReg, RegState::Kill);
5084 } else {
5085 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5086 .addReg(CurrentIdxReg, RegState::Kill)
5087 .addImm(Offset);
5088 }
5089 }
5090
5091 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5092 MachineInstr *InsertPt =
5093 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5094 .addReg(LMC.ExecReg)
5095 .addReg(NewExec);
5096
5097 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5098 // s_cbranch_scc0?
5099
5100 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5101 // clang-format off
5102 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5103 .addMBB(&LoopBB);
5104 // clang-format on
5105
5106 return InsertPt->getIterator();
5107}
5108
5109// This has slightly sub-optimal regalloc when the source vector is killed by
5110// the read. The register allocator does not understand that the kill is
5111// per-workitem, so the vector is kept alive for the whole loop and we end up
5112// not re-using a subregister from it, using one more VGPR than necessary. That
5113// VGPR was saved when this was expanded after register allocation.
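//
// In outline (a sketch; on wave32 the 32-bit forms are used instead):
//
//   s_mov_b64  s_save, exec          ; save EXEC
//   <waterfall loop from emitLoadM0FromVGPRLoop>
//   s_mov_b64  exec, s_save          ; restore EXEC in a new landing-pad block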
5116 unsigned InitResultReg, unsigned PhiReg, int Offset,
5117 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5118 MachineFunction *MF = MBB.getParent();
5119 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5120 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5122 const DebugLoc &DL = MI.getDebugLoc();
5124
5125 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5126 Register DstReg = MI.getOperand(0).getReg();
5127 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5128 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5130
5131 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5132
5133 // Save the EXEC mask
5134 // clang-format off
5135 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5136 .addReg(LMC.ExecReg);
5137 // clang-format on
5138
5139 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5140
5141 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5142
5143 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5144 InitResultReg, DstReg, PhiReg, TmpExec,
5145 Offset, UseGPRIdxMode, SGPRIdxReg);
5146
5147 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5149 ++MBBI;
5150 MF->insert(MBBI, LandingPad);
5151 LoopBB->removeSuccessor(RemainderBB);
5152 LandingPad->addSuccessor(RemainderBB);
5153 LoopBB->addSuccessor(LandingPad);
5154 MachineBasicBlock::iterator First = LandingPad->begin();
5155 // clang-format off
5156 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5157 .addReg(SaveExec);
5158 // clang-format on
5159
5160 return InsPt;
5161}
5162
5163// Returns subreg index, offset
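// For example, with a 128-bit (4 x 32-bit) super-register:
//   Offset 2 -> { sub2, 0 }  (statically in range, folded into the subreg index)
//   Offset 5 -> { sub0, 5 }  (out of bounds, kept as a runtime offset)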
5164static std::pair<unsigned, int>
5166 const TargetRegisterClass *SuperRC, unsigned VecReg,
5167 int Offset) {
5168 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5169
5170 // Skip out of bounds offsets, or else we would end up using an undefined
5171 // register.
5172 if (Offset >= NumElts || Offset < 0)
5173 return std::pair(AMDGPU::sub0, Offset);
5174
5175 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5176}
5177
5180 int Offset) {
5181 MachineBasicBlock *MBB = MI.getParent();
5182 const DebugLoc &DL = MI.getDebugLoc();
5184
5185 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5186
5187 assert(Idx->getReg() != AMDGPU::NoRegister);
5188
5189 if (Offset == 0) {
5190 // clang-format off
5191 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5192 .add(*Idx);
5193 // clang-format on
5194 } else {
5195 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5196 .add(*Idx)
5197 .addImm(Offset);
5198 }
5199}
5200
5203 int Offset) {
5204 MachineBasicBlock *MBB = MI.getParent();
5205 const DebugLoc &DL = MI.getDebugLoc();
5207
5208 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5209
5210 if (Offset == 0)
5211 return Idx->getReg();
5212
5213 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5214 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5215 .add(*Idx)
5216 .addImm(Offset);
5217 return Tmp;
5218}
5219
5222 const GCNSubtarget &ST) {
5223 const SIInstrInfo *TII = ST.getInstrInfo();
5224 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5225 MachineFunction *MF = MBB.getParent();
5227
5228 Register Dst = MI.getOperand(0).getReg();
5229 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5230 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5231 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5232
5233 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5234 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5235
5236 unsigned SubReg;
5237 std::tie(SubReg, Offset) =
5238 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5239
5240 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5241
5242 // Check for a SGPR index.
5243 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5245 const DebugLoc &DL = MI.getDebugLoc();
5246
5247 if (UseGPRIdxMode) {
5248 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5249 // to avoid interfering with other uses, so probably requires a new
5250 // optimization pass.
5252
5253 const MCInstrDesc &GPRIDXDesc =
5254 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5255 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5256 .addReg(SrcReg)
5257 .addReg(Idx)
5258 .addImm(SubReg);
5259 } else {
5261
5262 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5263 .addReg(SrcReg, 0, SubReg)
5264 .addReg(SrcReg, RegState::Implicit);
5265 }
5266
5267 MI.eraseFromParent();
5268
5269 return &MBB;
5270 }
5271
5272 // Control flow needs to be inserted if indexing with a VGPR.
5273 const DebugLoc &DL = MI.getDebugLoc();
5275
5276 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5277 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5278
5279 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5280
5281 Register SGPRIdxReg;
5282 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5283 UseGPRIdxMode, SGPRIdxReg);
5284
5285 MachineBasicBlock *LoopBB = InsPt->getParent();
5286
5287 if (UseGPRIdxMode) {
5288 const MCInstrDesc &GPRIDXDesc =
5289 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5290
5291 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5292 .addReg(SrcReg)
5293 .addReg(SGPRIdxReg)
5294 .addImm(SubReg);
5295 } else {
5296 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5297 .addReg(SrcReg, 0, SubReg)
5298 .addReg(SrcReg, RegState::Implicit);
5299 }
5300
5301 MI.eraseFromParent();
5302
5303 return LoopBB;
5304}
5305
5308 const GCNSubtarget &ST) {
5309 const SIInstrInfo *TII = ST.getInstrInfo();
5310 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5311 MachineFunction *MF = MBB.getParent();
5313
5314 Register Dst = MI.getOperand(0).getReg();
5315 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5316 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5317 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5318 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5319 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5320 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5321
5322 // This can be an immediate, but will be folded later.
5323 assert(Val->getReg());
5324
5325 unsigned SubReg;
5326 std::tie(SubReg, Offset) =
5327 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5328 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5329
5330 if (Idx->getReg() == AMDGPU::NoRegister) {
5332 const DebugLoc &DL = MI.getDebugLoc();
5333
5334 assert(Offset == 0);
5335
5336 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5337 .add(*SrcVec)
5338 .add(*Val)
5339 .addImm(SubReg);
5340
5341 MI.eraseFromParent();
5342 return &MBB;
5343 }
5344
5345 // Check for a SGPR index.
5346 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 const DebugLoc &DL = MI.getDebugLoc();
5349
5350 if (UseGPRIdxMode) {
5352
5353 const MCInstrDesc &GPRIDXDesc =
5354 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5355 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5356 .addReg(SrcVec->getReg())
5357 .add(*Val)
5358 .addReg(Idx)
5359 .addImm(SubReg);
5360 } else {
5362
5363 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5364 TRI.getRegSizeInBits(*VecRC), 32, false);
5365 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5366 .addReg(SrcVec->getReg())
5367 .add(*Val)
5368 .addImm(SubReg);
5369 }
5370 MI.eraseFromParent();
5371 return &MBB;
5372 }
5373
5374 // Control flow needs to be inserted if indexing with a VGPR.
5375 if (Val->isReg())
5376 MRI.clearKillFlags(Val->getReg());
5377
5378 const DebugLoc &DL = MI.getDebugLoc();
5379
5380 Register PhiReg = MRI.createVirtualRegister(VecRC);
5381
5382 Register SGPRIdxReg;
5383 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5384 UseGPRIdxMode, SGPRIdxReg);
5385 MachineBasicBlock *LoopBB = InsPt->getParent();
5386
5387 if (UseGPRIdxMode) {
5388 const MCInstrDesc &GPRIDXDesc =
5389 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5390
5391 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5392 .addReg(PhiReg)
5393 .add(*Val)
5394 .addReg(SGPRIdxReg)
5395 .addImm(SubReg);
5396 } else {
5397 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5398 TRI.getRegSizeInBits(*VecRC), 32, false);
5399 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5400 .addReg(PhiReg)
5401 .add(*Val)
5402 .addImm(SubReg);
5403 }
5404
5405 MI.eraseFromParent();
5406 return LoopBB;
5407}
5408
5410 MachineBasicBlock *BB) {
5411 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5412 // For GFX12, we emit s_add_u64 and s_sub_u64.
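  // A sketch of the pre-GFX12 expansion for an add (operand names are
  // placeholders); subtraction uses s_sub_u32 / s_subb_u32 in the same way:
  //
  //   s_add_u32  dst.lo, src0.lo, src1.lo   ; sets SCC to the carry-out
  //   s_addc_u32 dst.hi, src0.hi, src1.hi   ; consumes SCC as the carry-in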
5413 MachineFunction *MF = BB->getParent();
5414 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5415 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5417 const DebugLoc &DL = MI.getDebugLoc();
5418 MachineOperand &Dest = MI.getOperand(0);
5419 MachineOperand &Src0 = MI.getOperand(1);
5420 MachineOperand &Src1 = MI.getOperand(2);
5421 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5422 if (ST.hasScalarAddSub64()) {
5423 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5424 // clang-format off
5425 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5426 .add(Src0)
5427 .add(Src1);
5428 // clang-format on
5429 } else {
5430 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5431 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5432
5433 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435
5436 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5438 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5439 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5440
5441 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5443 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5444 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5445
5446 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5447 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5448 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5449 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5450 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5451 .addReg(DestSub0)
5452 .addImm(AMDGPU::sub0)
5453 .addReg(DestSub1)
5454 .addImm(AMDGPU::sub1);
5455 }
5456 MI.eraseFromParent();
5457 return BB;
5458}
5459
5461 switch (Opc) {
5462 case AMDGPU::S_MIN_U32:
5463 return std::numeric_limits<uint32_t>::max();
5464 case AMDGPU::S_MIN_I32:
5465 return std::numeric_limits<int32_t>::max();
5466 case AMDGPU::S_MAX_U32:
5467 return std::numeric_limits<uint32_t>::min();
5468 case AMDGPU::S_MAX_I32:
5469 return std::numeric_limits<int32_t>::min();
5470 case AMDGPU::S_ADD_I32:
5471 case AMDGPU::S_SUB_I32:
5472 case AMDGPU::S_OR_B32:
5473 case AMDGPU::S_XOR_B32:
5474 return std::numeric_limits<uint32_t>::min();
5475 case AMDGPU::S_AND_B32:
5476 return std::numeric_limits<uint32_t>::max();
5477 default:
5479 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5480 }
5481}
5482
5484 switch (Opc) {
5485 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5486 return std::numeric_limits<uint64_t>::max();
5487 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5488 return std::numeric_limits<int64_t>::max();
5489 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5490 return std::numeric_limits<uint64_t>::min();
5491 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5492 return std::numeric_limits<int64_t>::min();
5493 case AMDGPU::S_ADD_U64_PSEUDO:
5494 case AMDGPU::S_SUB_U64_PSEUDO:
5495 case AMDGPU::S_OR_B64:
5496 case AMDGPU::S_XOR_B64:
5497 return std::numeric_limits<uint64_t>::min();
5498 case AMDGPU::S_AND_B64:
5499 return std::numeric_limits<uint64_t>::max();
5500 default:
5502 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5503 }
5504}
5505
5506static bool is32bitWaveReduceOperation(unsigned Opc) {
5507 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5508 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5509 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5510 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5511 Opc == AMDGPU::S_XOR_B32;
5512}
5513
5516 const GCNSubtarget &ST,
5517 unsigned Opc) {
5519 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5520 const DebugLoc &DL = MI.getDebugLoc();
5521 const SIInstrInfo *TII = ST.getInstrInfo();
5522
5523 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5524 Register SrcReg = MI.getOperand(1).getReg();
5525 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5526 Register DstReg = MI.getOperand(0).getReg();
5527 MachineBasicBlock *RetBB = nullptr;
5528 if (isSGPR) {
5529 switch (Opc) {
5530 case AMDGPU::S_MIN_U32:
5531 case AMDGPU::S_MIN_I32:
5532 case AMDGPU::S_MAX_U32:
5533 case AMDGPU::S_MAX_I32:
5534 case AMDGPU::S_AND_B32:
5535 case AMDGPU::S_OR_B32: {
5536 // Idempotent operations.
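      // (With a uniform input, every active lane contributes the same value,
      // so the min/max/and/or over the lanes is just that value.)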
5537 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5538 RetBB = &BB;
5539 break;
5540 }
5541 case AMDGPU::V_CMP_LT_U64_e64: // umin
5542 case AMDGPU::V_CMP_LT_I64_e64: // min
5543 case AMDGPU::V_CMP_GT_U64_e64: // umax
5544 case AMDGPU::V_CMP_GT_I64_e64: // max
5545 case AMDGPU::S_AND_B64:
5546 case AMDGPU::S_OR_B64: {
5547 // Idempotent operations.
5548 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5549 RetBB = &BB;
5550 break;
5551 }
5552 case AMDGPU::S_XOR_B32:
5553 case AMDGPU::S_XOR_B64:
5554 case AMDGPU::S_ADD_I32:
5555 case AMDGPU::S_ADD_U64_PSEUDO:
5556 case AMDGPU::S_SUB_I32:
5557 case AMDGPU::S_SUB_U64_PSEUDO: {
5558 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5559 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5560 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5561 Register NumActiveLanes =
5562 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563
5564 bool IsWave32 = ST.isWave32();
5565 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5566 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5567 unsigned BitCountOpc =
5568 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5569
5570 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5571
5572 auto NewAccumulator =
5573 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5574 .addReg(ExecMask);
5575
5576 switch (Opc) {
5577 case AMDGPU::S_XOR_B32:
5578 case AMDGPU::S_XOR_B64: {
5579 // Performing an XOR operation on a uniform value
5580 // depends on the parity of the number of active lanes.
5581 // For even parity, the result will be 0, for odd
5582 // parity the result will be the same as the input value.
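        // For example, with a uniform value x and 5 active lanes the result is
        // x ^ x ^ x ^ x ^ x == x, while with 4 active lanes it is 0; hence
        // result = x * (NumActiveLanes & 1).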
5583 Register ParityRegister =
5584 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5585
5586 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5587 .addReg(NewAccumulator->getOperand(0).getReg())
5588 .addImm(1)
5589 .setOperandDead(3); // Dead scc
5590 if (Opc == AMDGPU::S_XOR_B32) {
5591 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5592 .addReg(SrcReg)
5593 .addReg(ParityRegister);
5594 } else {
5595 Register DestSub0 =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597 Register DestSub1 =
5598 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5599
5600 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5601 const TargetRegisterClass *SrcSubRC =
5602 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5603
5604 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5606 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5607 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5608
5609 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5610 .add(Op1L)
5611 .addReg(ParityRegister);
5612
5613 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5614 .add(Op1H)
5615 .addReg(ParityRegister);
5616
5617 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5618 .addReg(DestSub0)
5619 .addImm(AMDGPU::sub0)
5620 .addReg(DestSub1)
5621 .addImm(AMDGPU::sub1);
5622 }
5623 break;
5624 }
5625 case AMDGPU::S_SUB_I32: {
5626 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5627
5628 // Take the negation of the source operand.
5629 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5630 .addImm(0)
5631 .addReg(SrcReg);
5632 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5633 .addReg(NegatedVal)
5634 .addReg(NewAccumulator->getOperand(0).getReg());
5635 break;
5636 }
5637 case AMDGPU::S_ADD_I32: {
5638 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5639 .addReg(SrcReg)
5640 .addReg(NewAccumulator->getOperand(0).getReg());
5641 break;
5642 }
5643 case AMDGPU::S_ADD_U64_PSEUDO:
5644 case AMDGPU::S_SUB_U64_PSEUDO: {
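      // For a uniform 64-bit input, the add reduction over N active lanes is
      // src * N, and the sub reduction is src * -N. The wide multiply below is
      // built out of 32-bit scalar multiplies (s_mul_i32 / s_mul_hi_u32) and
      // carry adds.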
5645 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647 Register Op1H_Op0L_Reg =
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register Op1L_Op0H_Reg =
5650 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register NegatedValLo =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655 Register NegatedValHi =
5656 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5657
5658 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5659 const TargetRegisterClass *Src1SubRC =
5660 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5661
5662 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5664 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5665 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5666
5667 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5668 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5669 .addImm(0)
5670 .addReg(NewAccumulator->getOperand(0).getReg())
5671 .setOperandDead(3); // Dead scc
5672 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5673 .addReg(NegatedValLo)
5674 .addImm(31)
5675 .setOperandDead(3); // Dead scc
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5677 .add(Op1L)
5678 .addReg(NegatedValHi);
5679 }
5680 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5681 ? NegatedValLo
5682 : NewAccumulator->getOperand(0).getReg();
5683 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5684 .add(Op1L)
5685 .addReg(LowOpcode);
5686 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5687 .add(Op1L)
5688 .addReg(LowOpcode);
5689 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5690 .add(Op1H)
5691 .addReg(LowOpcode);
5692
5693 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5694 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5695 .addReg(CarryReg)
5696 .addReg(Op1H_Op0L_Reg)
5697 .setOperandDead(3); // Dead scc
5698
5699 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5700 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5701 .addReg(HiVal)
5702 .addReg(Op1L_Op0H_Reg)
5703 .setOperandDead(3); // Dead scc
5704 }
5705 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5706 .addReg(DestSub0)
5707 .addImm(AMDGPU::sub0)
5708 .addReg(DestSub1)
5709 .addImm(AMDGPU::sub1);
5710 break;
5711 }
5712 }
5713 RetBB = &BB;
5714 }
5715 }
5716 } else {
5718 // TODO: Implement the DPP strategy and switch based on the immediate
5719 // strategy operand. For now, for all the cases (Default, Iterative and
5720 // DPP) we use the iterative approach by default.
5720
5721 // To reduce the VGPR using the iterative approach, we need to iterate
5722 // over all the active lanes. Lowering consists of a ComputeLoop, which
5723 // iterates over only the active lanes. We use a copy of the EXEC register
5724 // as the induction variable, and every active lane clears its bit with
5725 // bitset0 so that we get the next active lane for the next iteration.
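    //
    // A rough sketch of the loop for a 32-bit reduction (assuming wave64;
    // register names are placeholders and the PHI nodes are elided):
    //
    //   s_mov_b64  s_iter, exec
    //   s_mov_b32  s_acc,  <identity value for Opc>
    // ComputeLoop:
    //   s_ff1_i32_b64   s_lane, s_iter         ; index of the next active lane
    //   v_readlane_b32  s_val,  v_src, s_lane  ; read that lane's value
    //   <Opc>           s_acc,  s_acc, s_val   ; accumulate
    //   s_bitset0_b64   s_iter, s_lane         ; mark the lane as done
    //   s_cmp_lg_u64    s_iter, 0
    //   s_cbranch_scc1  ComputeLoop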
5727 Register SrcReg = MI.getOperand(1).getReg();
5728 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5729
5730 // Create Control flow for loop
5731 // Split MI's Machine Basic block into For loop
5732 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5733
5734 // Create virtual registers required for lowering.
5735 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5736 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5737 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5738 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5739 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5740 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5741 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5742 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5743 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5744
5745 bool IsWave32 = ST.isWave32();
5746 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5747 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5748
5749 // Create initial values of induction variable from Exec, Accumulator and
5750 // insert branch instr to newly created ComputeBlock
5751 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5752 if (is32BitOpc) {
5754 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5755 .addImm(IdentityValue);
5756 } else {
5758 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5759 .addImm(IdentityValue);
5760 }
5761 // clang-format off
5762 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5763 .addMBB(ComputeLoop);
5764 // clang-format on
5765
5766 // Start constructing ComputeLoop
5767 I = ComputeLoop->begin();
5768 auto Accumulator =
5769 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5770 .addReg(IdentityValReg)
5771 .addMBB(&BB);
5772 auto ActiveBits =
5773 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5774 .addReg(LoopIterator)
5775 .addMBB(&BB);
5776
5777 I = ComputeLoop->end();
5778 MachineInstr *NewAccumulator;
5779 // Perform the computations
5780 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5781 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5782 .addReg(ActiveBitsReg);
5783 if (is32BitOpc) {
5784 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5785 LaneValueReg)
5786 .addReg(SrcReg)
5787 .addReg(FF1Reg);
5788 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5789 .addReg(Accumulator->getOperand(0).getReg())
5790 .addReg(LaneValueReg);
5791 } else {
5792 Register LaneValueLoReg =
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValueHiReg =
5795 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5796 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5797 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5798 const TargetRegisterClass *SrcSubRC =
5799 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5800 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5802 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5803 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5804 // The lane value input should be in an SGPR.
5805 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5806 LaneValueLoReg)
5807 .add(Op1L)
5808 .addReg(FF1Reg);
5809 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5810 LaneValueHiReg)
5811 .add(Op1H)
5812 .addReg(FF1Reg);
5813 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5814 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5815 .addReg(LaneValueLoReg)
5816 .addImm(AMDGPU::sub0)
5817 .addReg(LaneValueHiReg)
5818 .addImm(AMDGPU::sub1);
5819 switch (Opc) {
5820 case AMDGPU::S_OR_B64:
5821 case AMDGPU::S_AND_B64:
5822 case AMDGPU::S_XOR_B64: {
5823 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5824 .addReg(Accumulator->getOperand(0).getReg())
5825 .addReg(LaneValue->getOperand(0).getReg())
5826 .setOperandDead(3); // Dead scc
5827 break;
5828 }
5829 case AMDGPU::V_CMP_GT_I64_e64:
5830 case AMDGPU::V_CMP_GT_U64_e64:
5831 case AMDGPU::V_CMP_LT_I64_e64:
5832 case AMDGPU::V_CMP_LT_U64_e64: {
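      // Roughly: compare the 64-bit lane value against the current accumulator
      // with a VALU compare (both operands are scalar, so the resulting mask is
      // uniform over the active lanes), AND it with the remaining-lanes mask to
      // set SCC, and s_cselect the lane value when the comparison holds.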
5833 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5834 Register ComparisonResultReg =
5835 MRI.createVirtualRegister(WaveMaskRegClass);
5836 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5837 const TargetRegisterClass *VSubRegClass =
5838 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5839 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5840 MachineOperand SrcReg0Sub0 =
5841 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5842 VregClass, AMDGPU::sub0, VSubRegClass);
5843 MachineOperand SrcReg0Sub1 =
5844 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5845 VregClass, AMDGPU::sub1, VSubRegClass);
5846 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5847 AccumulatorVReg)
5848 .add(SrcReg0Sub0)
5849 .addImm(AMDGPU::sub0)
5850 .add(SrcReg0Sub1)
5851 .addImm(AMDGPU::sub1);
5852 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5853 .addReg(LaneValue->getOperand(0).getReg())
5854 .addReg(AccumulatorVReg);
5855
5856 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5857 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5858 .addReg(LaneMaskReg)
5859 .addReg(ActiveBitsReg);
5860
5861 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5862 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5863 .addReg(LaneValue->getOperand(0).getReg())
5864 .addReg(Accumulator->getOperand(0).getReg());
5865 break;
5866 }
5867 case AMDGPU::S_ADD_U64_PSEUDO:
5868 case AMDGPU::S_SUB_U64_PSEUDO: {
5869 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5870 .addReg(Accumulator->getOperand(0).getReg())
5871 .addReg(LaneValue->getOperand(0).getReg());
5872 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5873 break;
5874 }
5875 }
5876 }
5877 // Manipulate the iterator to get the next active lane
5878 unsigned BITSETOpc =
5879 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5880 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5881 .addReg(FF1Reg)
5882 .addReg(ActiveBitsReg);
5883
5884 // Add phi nodes
5885 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5886 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5887
5888 // Creating branching
5889 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5890 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5891 .addReg(NewActiveBitsReg)
5892 .addImm(0);
5893 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5894 .addMBB(ComputeLoop);
5895
5896 RetBB = ComputeEnd;
5897 }
5898 MI.eraseFromParent();
5899 return RetBB;
5900}
5901
5904 MachineBasicBlock *BB) const {
5905 MachineFunction *MF = BB->getParent();
5907 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5909 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5910
5911 switch (MI.getOpcode()) {
5912 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5913 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5914 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5915 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5916 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5917 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5918 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5919 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5920 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5921 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5922 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5923 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5924 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5925 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5926 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5927 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5928 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5929 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5930 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5931 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5932 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5933 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5934 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5935 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5936 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5937 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5938 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5939 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5940 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5941 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5942 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5943 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5944 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5945 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5946 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5947 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5948 case AMDGPU::S_UADDO_PSEUDO:
5949 case AMDGPU::S_USUBO_PSEUDO: {
5950 const DebugLoc &DL = MI.getDebugLoc();
5951 MachineOperand &Dest0 = MI.getOperand(0);
5952 MachineOperand &Dest1 = MI.getOperand(1);
5953 MachineOperand &Src0 = MI.getOperand(2);
5954 MachineOperand &Src1 = MI.getOperand(3);
5955
5956 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5957 ? AMDGPU::S_ADD_U32
5958 : AMDGPU::S_SUB_U32;
5959 // clang-format off
5960 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5961 .add(Src0)
5962 .add(Src1);
5963 // clang-format on
5964
5965 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5966 .addImm(1)
5967 .addImm(0);
5968
5969 MI.eraseFromParent();
5970 return BB;
5971 }
5972 case AMDGPU::S_ADD_U64_PSEUDO:
5973 case AMDGPU::S_SUB_U64_PSEUDO: {
5974 return Expand64BitScalarArithmetic(MI, BB);
5975 }
5976 case AMDGPU::V_ADD_U64_PSEUDO:
5977 case AMDGPU::V_SUB_U64_PSEUDO: {
5979 const DebugLoc &DL = MI.getDebugLoc();
5980
5981 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5982
5983 MachineOperand &Dest = MI.getOperand(0);
5984 MachineOperand &Src0 = MI.getOperand(1);
5985 MachineOperand &Src1 = MI.getOperand(2);
5986
5987 if (ST.hasAddSubU64Insts()) {
5988 auto I = BuildMI(*BB, MI, DL,
5989 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5990 : AMDGPU::V_SUB_U64_e64),
5991 Dest.getReg())
5992 .add(Src0)
5993 .add(Src1)
5994 .addImm(0); // clamp
5995 TII->legalizeOperands(*I);
5996 MI.eraseFromParent();
5997 return BB;
5998 }
5999
6000 if (IsAdd && ST.hasLshlAddU64Inst()) {
6001 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6002 Dest.getReg())
6003 .add(Src0)
6004 .addImm(0)
6005 .add(Src1);
6006 TII->legalizeOperands(*Add);
6007 MI.eraseFromParent();
6008 return BB;
6009 }
6010
6011 const auto *CarryRC = TRI->getWaveMaskRegClass();
6012
6013 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6014 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6015
6016 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6017 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6018
6019 const TargetRegisterClass *Src0RC = Src0.isReg()
6020 ? MRI.getRegClass(Src0.getReg())
6021 : &AMDGPU::VReg_64RegClass;
6022 const TargetRegisterClass *Src1RC = Src1.isReg()
6023 ? MRI.getRegClass(Src1.getReg())
6024 : &AMDGPU::VReg_64RegClass;
6025
6026 const TargetRegisterClass *Src0SubRC =
6027 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6028 const TargetRegisterClass *Src1SubRC =
6029 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6030
6031 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6032 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6033 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6034 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6035
6036 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6037 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6038 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6039 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6040
6041 unsigned LoOpc =
6042 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6043 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6044 .addReg(CarryReg, RegState::Define)
6045 .add(SrcReg0Sub0)
6046 .add(SrcReg1Sub0)
6047 .addImm(0); // clamp bit
6048
6049 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6050 MachineInstr *HiHalf =
6051 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6052 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6053 .add(SrcReg0Sub1)
6054 .add(SrcReg1Sub1)
6055 .addReg(CarryReg, RegState::Kill)
6056 .addImm(0); // clamp bit
6057
6058 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6059 .addReg(DestSub0)
6060 .addImm(AMDGPU::sub0)
6061 .addReg(DestSub1)
6062 .addImm(AMDGPU::sub1);
6063 TII->legalizeOperands(*LoHalf);
6064 TII->legalizeOperands(*HiHalf);
6065 MI.eraseFromParent();
6066 return BB;
6067 }
6068 case AMDGPU::S_ADD_CO_PSEUDO:
6069 case AMDGPU::S_SUB_CO_PSEUDO: {
6070 // This pseudo can only be selected
6071 // from a uniform add/subcarry node. All of its VGPR operands are
6072 // therefore assumed to be splat vectors.
6075 const DebugLoc &DL = MI.getDebugLoc();
6076 MachineOperand &Dest = MI.getOperand(0);
6077 MachineOperand &CarryDest = MI.getOperand(1);
6078 MachineOperand &Src0 = MI.getOperand(2);
6079 MachineOperand &Src1 = MI.getOperand(3);
6080 MachineOperand &Src2 = MI.getOperand(4);
6081 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6082 ? AMDGPU::S_ADDC_U32
6083 : AMDGPU::S_SUBB_U32;
6084 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6085 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6086 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6087 .addReg(Src0.getReg());
6088 Src0.setReg(RegOp0);
6089 }
6090 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6091 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6092 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6093 .addReg(Src1.getReg());
6094 Src1.setReg(RegOp1);
6095 }
6096 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6097 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6098 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6099 .addReg(Src2.getReg());
6100 Src2.setReg(RegOp2);
6101 }
6102
6103 if (ST.isWave64()) {
6104 if (ST.hasScalarCompareEq64()) {
6105 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6106 .addReg(Src2.getReg())
6107 .addImm(0);
6108 } else {
6109 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6110 const TargetRegisterClass *SubRC =
6111 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6112 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6113 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6114 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6115 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6116 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6117
6118 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6119 .add(Src2Sub0)
6120 .add(Src2Sub1);
6121
6122 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6123 .addReg(Src2_32, RegState::Kill)
6124 .addImm(0);
6125 }
6126 } else {
6127 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6128 .addReg(Src2.getReg())
6129 .addImm(0);
6130 }
6131
6132 // clang-format off
6133 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6134 .add(Src0)
6135 .add(Src1);
6136 // clang-format on
6137
6138 unsigned SelOpc =
6139 (ST.isWave64()) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6140
6141 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6142 .addImm(-1)
6143 .addImm(0);
6144
6145 MI.eraseFromParent();
6146 return BB;
6147 }
6148 case AMDGPU::SI_INIT_M0: {
6149 MachineOperand &M0Init = MI.getOperand(0);
6150 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6151 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6152 AMDGPU::M0)
6153 .add(M0Init);
6154 MI.eraseFromParent();
6155 return BB;
6156 }
6157 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6158 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6159 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6160 TII->get(AMDGPU::S_CMP_EQ_U32))
6161 .addImm(0)
6162 .addImm(0);
6163 return BB;
6164 }
6165 case AMDGPU::GET_GROUPSTATICSIZE: {
6166 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6167 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6168 DebugLoc DL = MI.getDebugLoc();
6169 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6170 .add(MI.getOperand(0))
6171 .addImm(MFI->getLDSSize());
6172 MI.eraseFromParent();
6173 return BB;
6174 }
6175 case AMDGPU::GET_SHADERCYCLESHILO: {
6178 const DebugLoc &DL = MI.getDebugLoc();
6179 // The algorithm is:
6180 //
6181 // hi1 = getreg(SHADER_CYCLES_HI)
6182 // lo1 = getreg(SHADER_CYCLES_LO)
6183 // hi2 = getreg(SHADER_CYCLES_HI)
6184 //
6185 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6186 // Otherwise there was overflow and the result is hi2:0. In both cases the
6187 // result should represent the actual time at some point during the sequence
6188 // of three getregs.
6189 using namespace AMDGPU::Hwreg;
6190 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6191 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6192 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6193 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6194 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6195 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6196 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6197 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6198 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6200 .addReg(RegHi1)
6201 .addReg(RegHi2);
6202 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6204 .addReg(RegLo1)
6205 .addImm(0);
6206 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6207 .add(MI.getOperand(0))
6208 .addReg(RegLo)
6209 .addImm(AMDGPU::sub0)
6210 .addReg(RegHi2)
6211 .addImm(AMDGPU::sub1);
6212 MI.eraseFromParent();
6213 return BB;
6214 }
6215 case AMDGPU::SI_INDIRECT_SRC_V1:
6216 case AMDGPU::SI_INDIRECT_SRC_V2:
6217 case AMDGPU::SI_INDIRECT_SRC_V4:
6218 case AMDGPU::SI_INDIRECT_SRC_V8:
6219 case AMDGPU::SI_INDIRECT_SRC_V9:
6220 case AMDGPU::SI_INDIRECT_SRC_V10:
6221 case AMDGPU::SI_INDIRECT_SRC_V11:
6222 case AMDGPU::SI_INDIRECT_SRC_V12:
6223 case AMDGPU::SI_INDIRECT_SRC_V16:
6224 case AMDGPU::SI_INDIRECT_SRC_V32:
6225 return emitIndirectSrc(MI, *BB, *getSubtarget());
6226 case AMDGPU::SI_INDIRECT_DST_V1:
6227 case AMDGPU::SI_INDIRECT_DST_V2:
6228 case AMDGPU::SI_INDIRECT_DST_V4:
6229 case AMDGPU::SI_INDIRECT_DST_V8:
6230 case AMDGPU::SI_INDIRECT_DST_V9:
6231 case AMDGPU::SI_INDIRECT_DST_V10:
6232 case AMDGPU::SI_INDIRECT_DST_V11:
6233 case AMDGPU::SI_INDIRECT_DST_V12:
6234 case AMDGPU::SI_INDIRECT_DST_V16:
6235 case AMDGPU::SI_INDIRECT_DST_V32:
6236 return emitIndirectDst(MI, *BB, *getSubtarget());
6237 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6238 case AMDGPU::SI_KILL_I1_PSEUDO:
6239 return splitKillBlock(MI, BB);
6240 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6242
6243 Register Dst = MI.getOperand(0).getReg();
6244 const MachineOperand &Src0 = MI.getOperand(1);
6245 const MachineOperand &Src1 = MI.getOperand(2);
6246 const DebugLoc &DL = MI.getDebugLoc();
6247 Register SrcCond = MI.getOperand(3).getReg();
6248
6249 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6250 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6251 const auto *CondRC = TRI->getWaveMaskRegClass();
6252 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6253
6254 const TargetRegisterClass *Src0RC = Src0.isReg()
6255 ? MRI.getRegClass(Src0.getReg())
6256 : &AMDGPU::VReg_64RegClass;
6257 const TargetRegisterClass *Src1RC = Src1.isReg()
6258 ? MRI.getRegClass(Src1.getReg())
6259 : &AMDGPU::VReg_64RegClass;
6260
6261 const TargetRegisterClass *Src0SubRC =
6262 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6263 const TargetRegisterClass *Src1SubRC =
6264 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6265
6266 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6267 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6268 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6269 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6270
6271 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6272 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6273 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6274 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6275
6276 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6277 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6278 .addImm(0)
6279 .add(Src0Sub0)
6280 .addImm(0)
6281 .add(Src1Sub0)
6282 .addReg(SrcCondCopy);
6283 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6284 .addImm(0)
6285 .add(Src0Sub1)
6286 .addImm(0)
6287 .add(Src1Sub1)
6288 .addReg(SrcCondCopy);
6289
6290 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6291 .addReg(DstLo)
6292 .addImm(AMDGPU::sub0)
6293 .addReg(DstHi)
6294 .addImm(AMDGPU::sub1);
6295 MI.eraseFromParent();
6296 return BB;
6297 }
6298 case AMDGPU::SI_BR_UNDEF: {
6299 const DebugLoc &DL = MI.getDebugLoc();
6300 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6301 .add(MI.getOperand(0));
6302 Br->getOperand(1).setIsUndef(); // read undef SCC
6303 MI.eraseFromParent();
6304 return BB;
6305 }
6306 case AMDGPU::ADJCALLSTACKUP:
6307 case AMDGPU::ADJCALLSTACKDOWN: {
6309 MachineInstrBuilder MIB(*MF, &MI);
6310 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6311 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6312 return BB;
6313 }
6314 case AMDGPU::SI_CALL_ISEL: {
6315 const DebugLoc &DL = MI.getDebugLoc();
6316
6317 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6318
6320 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6321
6322 for (const MachineOperand &MO : MI.operands())
6323 MIB.add(MO);
6324
6325 MIB.cloneMemRefs(MI);
6326 MI.eraseFromParent();
6327 return BB;
6328 }
6329 case AMDGPU::V_ADD_CO_U32_e32:
6330 case AMDGPU::V_SUB_CO_U32_e32:
6331 case AMDGPU::V_SUBREV_CO_U32_e32: {
6332 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6333 const DebugLoc &DL = MI.getDebugLoc();
6334 unsigned Opc = MI.getOpcode();
6335
6336 bool NeedClampOperand = false;
6337 if (TII->pseudoToMCOpcode(Opc) == -1) {
6339 NeedClampOperand = true;
6340 }
6341
6342 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6343 if (TII->isVOP3(*I)) {
6344 I.addReg(TRI->getVCC(), RegState::Define);
6345 }
6346 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6347 if (NeedClampOperand)
6348 I.addImm(0); // clamp bit for e64 encoding
6349
6350 TII->legalizeOperands(*I);
6351
6352 MI.eraseFromParent();
6353 return BB;
6354 }
6355 case AMDGPU::V_ADDC_U32_e32:
6356 case AMDGPU::V_SUBB_U32_e32:
6357 case AMDGPU::V_SUBBREV_U32_e32:
6358 // These instructions have an implicit use of vcc which counts towards the
6359 // constant bus limit.
6360 TII->legalizeOperands(MI);
6361 return BB;
6362 case AMDGPU::DS_GWS_INIT:
6363 case AMDGPU::DS_GWS_SEMA_BR:
6364 case AMDGPU::DS_GWS_BARRIER:
6365 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6366 [[fallthrough]];
6367 case AMDGPU::DS_GWS_SEMA_V:
6368 case AMDGPU::DS_GWS_SEMA_P:
6369 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6370 // A s_waitcnt 0 is required to be the instruction immediately following.
6371 if (getSubtarget()->hasGWSAutoReplay()) {
6373 return BB;
6374 }
6375
6376 return emitGWSMemViolTestLoop(MI, BB);
6377 case AMDGPU::S_SETREG_B32: {
6378 // Try to optimize cases that only set the denormal mode or rounding mode.
6379 //
6380 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6381 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6382 // instead.
6383 //
6384 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6385 // allow you to have a no side effect instruction in the output of a
6386 // sideeffecting pattern.
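    // For example (a sketch): if the written value is a materialized constant
    // 0x3 and the s_setreg_b32 covers exactly the four round-mode bits of the
    // MODE register, it is replaced below by
    //   s_round_mode 0x3
    // A write covering exactly the denorm bits becomes s_denorm_mode; if both
    // fields are written, both instructions are emitted, with the denorm bits
    // taken from the upper nibble of the immediate.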
6387 auto [ID, Offset, Width] =
6388 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6390 return BB;
6391
6392 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6393 const unsigned SetMask = WidthMask << Offset;
6394
6395 if (getSubtarget()->hasDenormModeInst()) {
6396 unsigned SetDenormOp = 0;
6397 unsigned SetRoundOp = 0;
6398
6399 // The dedicated instructions can only set the whole denorm or round mode
6400 // at once, not a subset of bits in either.
6401 if (SetMask ==
6403 // If this fully sets both the round and denorm mode, emit the two
6404 // dedicated instructions for these.
6405 SetRoundOp = AMDGPU::S_ROUND_MODE;
6406 SetDenormOp = AMDGPU::S_DENORM_MODE;
6407 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6408 SetRoundOp = AMDGPU::S_ROUND_MODE;
6409 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6410 SetDenormOp = AMDGPU::S_DENORM_MODE;
6411 }
6412
6413 if (SetRoundOp || SetDenormOp) {
6415 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6416 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6417 unsigned ImmVal = Def->getOperand(1).getImm();
6418 if (SetRoundOp) {
6419 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6420 .addImm(ImmVal & 0xf);
6421
6422 // If we also have the denorm mode, get just the denorm mode bits.
6423 ImmVal >>= 4;
6424 }
6425
6426 if (SetDenormOp) {
6427 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6428 .addImm(ImmVal & 0xf);
6429 }
6430
6431 MI.eraseFromParent();
6432 return BB;
6433 }
6434 }
6435 }
6436
6437 // If only FP bits are touched, use the no-side-effects pseudo.
6438 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6439 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6440 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6441
6442 return BB;
6443 }
6444 case AMDGPU::S_INVERSE_BALLOT_U32:
6445 case AMDGPU::S_INVERSE_BALLOT_U64:
6446 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6447 // necessary. After that they are equivalent to a COPY.
6448 MI.setDesc(TII->get(AMDGPU::COPY));
6449 return BB;
6450 case AMDGPU::ENDPGM_TRAP: {
6451 const DebugLoc &DL = MI.getDebugLoc();
6452 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6453 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6454 MI.addOperand(MachineOperand::CreateImm(0));
6455 return BB;
6456 }
6457
6458 // We need a block split to make the real endpgm a terminator. We also don't
6459 // want to break phis in successor blocks, so we can't just delete to the
6460 // end of the block.
6461
6462 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6464 MF->push_back(TrapBB);
6465 // clang-format off
6466 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6467 .addImm(0);
6468 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6469 .addMBB(TrapBB);
6470 // clang-format on
6471
6472 BB->addSuccessor(TrapBB);
6473 MI.eraseFromParent();
6474 return SplitBB;
6475 }
6476 case AMDGPU::SIMULATED_TRAP: {
6477 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6479 MachineBasicBlock *SplitBB =
6480 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6481 MI.eraseFromParent();
6482 return SplitBB;
6483 }
6484 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6485 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6487
6488 // During ISel, it's difficult to propagate the original EXEC mask to use as
6489 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6490 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6491 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6492 Register OriginalExec = Setup->getOperand(0).getReg();
6493 MF->getRegInfo().clearKillFlags(OriginalExec);
6494 MI.getOperand(0).setReg(OriginalExec);
6495 return BB;
6496 }
6497 default:
6498 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6499 if (!MI.mayStore())
6501 return BB;
6502 }
6504 }
6505}
6506
6508 // This currently forces unfolding various combinations of fsub into fma with
6509 // free fneg'd operands. As long as we have fast FMA (controlled by
6510 // isFMAFasterThanFMulAndFAdd), we should perform these.
6511
6512 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6513 // most of these combines appear to be cycle neutral but save on instruction
6514 // count / code size.
6515 return true;
6516}
6517
6519
6521 EVT VT) const {
6522 if (!VT.isVector()) {
6523 return MVT::i1;
6524 }
6525 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6526}
6527
6529 // TODO: Should i16 be used always if legal? For now it would force VALU
6530 // shifts.
6531 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6532}
6533
6535 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6536 ? Ty.changeElementSize(16)
6537 : Ty.changeElementSize(32);
6538}
6539
6540// Answering this is somewhat tricky and depends on the specific device, as
6541// different devices have different rates for fma and for f64 operations.
6542//
6543// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6544// regardless of which device (although the number of cycles differs between
6545// devices), so it is always profitable for f64.
6546//
6547// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6548// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6549// which we can always do even without fused FP ops since it returns the same
6550// result as the separate operations and since it is always full
6551// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6552// however does not support denormals, so we do report fma as faster if we have
6553// a fast fma device and require denormals.
6554//
6556 EVT VT) const {
6557 VT = VT.getScalarType();
6558
6559 switch (VT.getSimpleVT().SimpleTy) {
6560 case MVT::f32: {
6561 // If mad is not available this depends only on if f32 fma is full rate.
6562 if (!Subtarget->hasMadMacF32Insts())
6563 return Subtarget->hasFastFMAF32();
6564
6565 // Otherwise f32 mad is always full rate and returns the same result as
6566 // the separate operations, so it should be preferred over fma.
6567 // However, it does not support denormals.
6569 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6570
6571 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6572 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6573 }
6574 case MVT::f64:
6575 return true;
6576 case MVT::f16:
6577 case MVT::bf16:
6578 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6579 default:
6580 break;
6581 }
6582
6583 return false;
6584}
6585
6587 LLT Ty) const {
6588 switch (Ty.getScalarSizeInBits()) {
6589 case 16:
6590 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6591 case 32:
6592 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6593 case 64:
6594 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6595 default:
6596 break;
6597 }
6598
6599 return false;
6600}
6601
6603 if (!Ty.isScalar())
6604 return false;
6605
6606 if (Ty.getScalarSizeInBits() == 16)
6607 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6608 if (Ty.getScalarSizeInBits() == 32)
6609 return Subtarget->hasMadMacF32Insts() &&
6610 denormalModeIsFlushAllF32(*MI.getMF());
6611
6612 return false;
6613}
6614
6616 const SDNode *N) const {
6617 // TODO: Check future ftz flag
6618 // v_mad_f32/v_mac_f32 do not support denormals.
6619 EVT VT = N->getValueType(0);
6620 if (VT == MVT::f32)
6621 return Subtarget->hasMadMacF32Insts() &&
6623 if (VT == MVT::f16) {
6624 return Subtarget->hasMadF16() &&
6626 }
6627
6628 return false;
6629}
6630
6631//===----------------------------------------------------------------------===//
6632// Custom DAG Lowering Operations
6633//===----------------------------------------------------------------------===//
6634
6635// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6636// wider vector type is legal.
6638 SelectionDAG &DAG) const {
6639 unsigned Opc = Op.getOpcode();
6640 EVT VT = Op.getValueType();
6641 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6642 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6643 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6644 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6645
6646 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6647
6648 SDLoc SL(Op);
6649 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6650 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6651
6652 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6653}
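// For example, (fabs v8f16:x) is emitted as a concat_vectors of fabs on the two
// v4f16 halves rather than being fully scalarized.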
6654
6655// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6656// regression whereby extra unnecessary instructions were added to codegen
6657// for rotr operations, caused by legalising v2i32 or. This resulted in extra
6658// instructions to extract the result from the vector.
6660 [[maybe_unused]] EVT VT = Op.getValueType();
6661
6662 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6663 VT == MVT::v16i32) &&
6664 "Unexpected ValueType.");
6665
6666 return DAG.UnrollVectorOp(Op.getNode());
6667}
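// For example, (rotr v2i32:x, v2i32:y) is unrolled into two scalar rotr nodes
// whose results are rebuilt into a vector, avoiding the extra element extracts
// seen with the generic v2i32 legalization.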
6668
6669// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6670// wider vector type is legal.
6672 SelectionDAG &DAG) const {
6673 unsigned Opc = Op.getOpcode();
6674 EVT VT = Op.getValueType();
6675 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6676 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6677 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6678 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6679 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6680 VT == MVT::v32bf16);
6681
6682 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6683 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6684
6685 SDLoc SL(Op);
6686
6687 SDValue OpLo =
6688 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6689 SDValue OpHi =
6690 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6691
6692 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6693}
6694
6696 SelectionDAG &DAG) const {
6697 unsigned Opc = Op.getOpcode();
6698 EVT VT = Op.getValueType();
6699 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6700 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6701 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6702 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6703 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6704 VT == MVT::v32bf16);
6705
6706 SDValue Op0 = Op.getOperand(0);
6707 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6708 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6709 : std::pair(Op0, Op0);
6710
6711 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6712 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6713
6714 SDLoc SL(Op);
6715 auto ResVT = DAG.GetSplitDestVTs(VT);
6716
6717 SDValue OpLo =
6718 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6719 SDValue OpHi =
6720 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6721
6722 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6723}
6724
6726 switch (Op.getOpcode()) {
6727 default:
6729 case ISD::BRCOND:
6730 return LowerBRCOND(Op, DAG);
6731 case ISD::RETURNADDR:
6732 return LowerRETURNADDR(Op, DAG);
6733 case ISD::LOAD: {
6734 SDValue Result = LowerLOAD(Op, DAG);
6735 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6736 "Load should return a value and a chain");
6737 return Result;
6738 }
6739 case ISD::FSQRT: {
6740 EVT VT = Op.getValueType();
6741 if (VT == MVT::f32)
6742 return lowerFSQRTF32(Op, DAG);
6743 if (VT == MVT::f64)
6744 return lowerFSQRTF64(Op, DAG);
6745 return SDValue();
6746 }
6747 case ISD::FSIN:
6748 case ISD::FCOS:
6749 return LowerTrig(Op, DAG);
6750 case ISD::SELECT:
6751 return LowerSELECT(Op, DAG);
6752 case ISD::FDIV:
6753 return LowerFDIV(Op, DAG);
6754 case ISD::FFREXP:
6755 return LowerFFREXP(Op, DAG);
6756 case ISD::ATOMIC_CMP_SWAP:
6757 return LowerATOMIC_CMP_SWAP(Op, DAG);
6758 case ISD::STORE:
6759 return LowerSTORE(Op, DAG);
6760 case ISD::GlobalAddress: {
6763 return LowerGlobalAddress(MFI, Op, DAG);
6764 }
6766 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6768 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6770 return LowerINTRINSIC_VOID(Op, DAG);
6771 case ISD::ADDRSPACECAST:
6772 return lowerADDRSPACECAST(Op, DAG);
6774 return lowerINSERT_SUBVECTOR(Op, DAG);
6776 return lowerINSERT_VECTOR_ELT(Op, DAG);
6778 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6780 return lowerVECTOR_SHUFFLE(Op, DAG);
6782 return lowerSCALAR_TO_VECTOR(Op, DAG);
6783 case ISD::BUILD_VECTOR:
6784 return lowerBUILD_VECTOR(Op, DAG);
6785 case ISD::FP_ROUND:
6787 return lowerFP_ROUND(Op, DAG);
6788 case ISD::TRAP:
6789 return lowerTRAP(Op, DAG);
6790 case ISD::DEBUGTRAP:
6791 return lowerDEBUGTRAP(Op, DAG);
6792 case ISD::ABS:
6793 case ISD::FABS:
6794 case ISD::FNEG:
6795 case ISD::FCANONICALIZE:
6796 case ISD::BSWAP:
6797 return splitUnaryVectorOp(Op, DAG);
6798 case ISD::FMINNUM:
6799 case ISD::FMAXNUM:
6800 return lowerFMINNUM_FMAXNUM(Op, DAG);
6801 case ISD::FMINIMUMNUM:
6802 case ISD::FMAXIMUMNUM:
6803 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6804 case ISD::FMINIMUM:
6805 case ISD::FMAXIMUM:
6806 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6807 case ISD::FLDEXP:
6808 case ISD::STRICT_FLDEXP:
6809 return lowerFLDEXP(Op, DAG);
6810 case ISD::FMA:
6811 return splitTernaryVectorOp(Op, DAG);
6812 case ISD::FP_TO_SINT:
6813 case ISD::FP_TO_UINT:
6814 return LowerFP_TO_INT(Op, DAG);
6815 case ISD::SHL:
6816 case ISD::SRA:
6817 case ISD::SRL:
6818 case ISD::ADD:
6819 case ISD::SUB:
6820 case ISD::SMIN:
6821 case ISD::SMAX:
6822 case ISD::UMIN:
6823 case ISD::UMAX:
6824 case ISD::FADD:
6825 case ISD::FMUL:
6826 case ISD::FMINNUM_IEEE:
6827 case ISD::FMAXNUM_IEEE:
6828 case ISD::UADDSAT:
6829 case ISD::USUBSAT:
6830 case ISD::SADDSAT:
6831 case ISD::SSUBSAT:
6832 return splitBinaryVectorOp(Op, DAG);
6833 case ISD::FCOPYSIGN:
6834 return lowerFCOPYSIGN(Op, DAG);
6835 case ISD::MUL:
6836 return lowerMUL(Op, DAG);
6837 case ISD::SMULO:
6838 case ISD::UMULO:
6839 return lowerXMULO(Op, DAG);
6840 case ISD::SMUL_LOHI:
6841 case ISD::UMUL_LOHI:
6842 return lowerXMUL_LOHI(Op, DAG);
6843 case ISD::DYNAMIC_STACKALLOC:
6844 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6845 case ISD::STACKSAVE:
6846 return LowerSTACKSAVE(Op, DAG);
6847 case ISD::GET_ROUNDING:
6848 return lowerGET_ROUNDING(Op, DAG);
6849 case ISD::SET_ROUNDING:
6850 return lowerSET_ROUNDING(Op, DAG);
6851 case ISD::PREFETCH:
6852 return lowerPREFETCH(Op, DAG);
6853 case ISD::FP_EXTEND:
6855 return lowerFP_EXTEND(Op, DAG);
6856 case ISD::GET_FPENV:
6857 return lowerGET_FPENV(Op, DAG);
6858 case ISD::SET_FPENV:
6859 return lowerSET_FPENV(Op, DAG);
6860 case ISD::ROTR:
6861 return lowerROTR(Op, DAG);
6862 }
6863 return SDValue();
6864}
6865
6866// Used for D16: Casts the result of an instruction into the right vector,
6867// packs values if loads return unpacked values.
6869 const SDLoc &DL, SelectionDAG &DAG,
6870 bool Unpacked) {
6871 if (!LoadVT.isVector())
6872 return Result;
6873
6874 // Cast back to the original packed type or to a larger type that is a
6875 // multiple of 32 bits for D16. Widening the return type is required for
6876 // legalization.
6877 EVT FittingLoadVT = LoadVT;
6878 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6879 FittingLoadVT =
6881 LoadVT.getVectorNumElements() + 1);
6882 }
6883
6884 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6885 // Truncate to v2i16/v4i16.
6886 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6887
6888 // Workaround legalizer not scalarizing truncate after vector op
6889 // legalization but not creating intermediate vector trunc.
6891 DAG.ExtractVectorElements(Result, Elts);
6892 for (SDValue &Elt : Elts)
6893 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6894
6895 // Pad illegal v1i16/v3i16 to v4i16
6896 if ((LoadVT.getVectorNumElements() % 2) == 1)
6897 Elts.push_back(DAG.getPOISON(MVT::i16));
6898
6899 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6900
6901 // Bitcast to original type (v2f16/v4f16).
6902 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6903 }
6904
6905 // Cast back to the original packed type.
6906 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6907}
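// For example, with unpacked D16 a v4f16 load result arrives as v4i32: each
// element is truncated to i16, rebuilt as v4i16, and bitcast back to v4f16.
// An odd-sized v3f16 result is padded with a poison element up to v4i16 first.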
6908
6909SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6910 SelectionDAG &DAG,
6912 bool IsIntrinsic) const {
6913 SDLoc DL(M);
6914
6915 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6916 EVT LoadVT = M->getValueType(0);
6917
6918 EVT EquivLoadVT = LoadVT;
6919 if (LoadVT.isVector()) {
6920 if (Unpacked) {
6921 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6922 LoadVT.getVectorNumElements());
6923 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6924 // Widen v3f16 to legal type
6925 EquivLoadVT =
6927 LoadVT.getVectorNumElements() + 1);
6928 }
6929 }
6930
6931 // Change from v4f16/v2f16 to EquivLoadVT.
6932 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6933
6935 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6936 M->getMemoryVT(), M->getMemOperand());
6937
6938 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6939
6940 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6941}
6942
6943SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6944 SelectionDAG &DAG,
6945 ArrayRef<SDValue> Ops) const {
6946 SDLoc DL(M);
6947 EVT LoadVT = M->getValueType(0);
6948 EVT EltType = LoadVT.getScalarType();
6949 EVT IntVT = LoadVT.changeTypeToInteger();
6950
6951 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6952
6953 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6954 bool IsTFE = M->getNumValues() == 3;
6955
6956 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6958 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6959 : AMDGPUISD::BUFFER_LOAD;
6960
6961 if (IsD16) {
6962 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6963 }
6964
6965 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6966 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6967 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6968 IsTFE);
6969
6970 if (isTypeLegal(LoadVT)) {
6971 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6972 M->getMemOperand(), DAG);
6973 }
6974
6975 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6976 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6977 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6978 M->getMemOperand(), DAG);
6979 return DAG.getMergeValues(
6980 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6981 DL);
6982}
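// Note: a memory node with three results indicates TFE, where the load also
// returns a status value; this is why the _TFE buffer-load nodes are selected
// above when M->getNumValues() == 3.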
6983
6985 SelectionDAG &DAG) {
6986 EVT VT = N->getValueType(0);
6987 unsigned CondCode = N->getConstantOperandVal(3);
6988 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6989 return DAG.getPOISON(VT);
6990
6991 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6992
6993 SDValue LHS = N->getOperand(1);
6994 SDValue RHS = N->getOperand(2);
6995
6996 SDLoc DL(N);
6997
6998 EVT CmpVT = LHS.getValueType();
6999 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7000 unsigned PromoteOp =
7002 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7003 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7004 }
7005
7006 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7007
7008 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7009 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7010
7011 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7012 DAG.getCondCode(CCOpcode));
7013 if (VT.bitsEq(CCVT))
7014 return SetCC;
7015 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7016}
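// For example, llvm.amdgcn.icmp(i32 a, i32 b, 32 /*ICMP_EQ*/) becomes
// (AMDGPUISD::SETCC a, b, seteq), producing a lane mask as wide as the wavefront
// (i32 on wave32, i64 on wave64) that is then zero-extended or truncated to the
// requested result type.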
7017
7019 SelectionDAG &DAG) {
7020 EVT VT = N->getValueType(0);
7021
7022 unsigned CondCode = N->getConstantOperandVal(3);
7023 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7024 return DAG.getPOISON(VT);
7025
7026 SDValue Src0 = N->getOperand(1);
7027 SDValue Src1 = N->getOperand(2);
7028 EVT CmpVT = Src0.getValueType();
7029 SDLoc SL(N);
7030
7031 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7032 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7033 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7034 }
7035
7036 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7037 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7038 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7039 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7040 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7041 DAG.getCondCode(CCOpcode));
7042 if (VT.bitsEq(CCVT))
7043 return SetCC;
7044 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7045}
7046
7048 SelectionDAG &DAG) {
7049 EVT VT = N->getValueType(0);
7050 SDValue Src = N->getOperand(1);
7051 SDLoc SL(N);
7052
7053 if (Src.getOpcode() == ISD::SETCC) {
7054 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7055 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7056 Src.getOperand(1), Src.getOperand(2));
7057 }
7058 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7059 // (ballot 0) -> 0
7060 if (Arg->isZero())
7061 return DAG.getConstant(0, SL, VT);
7062
7063 // (ballot 1) -> EXEC/EXEC_LO
7064 if (Arg->isOne()) {
7065 Register Exec;
7066 if (VT.getScalarSizeInBits() == 32)
7067 Exec = AMDGPU::EXEC_LO;
7068 else if (VT.getScalarSizeInBits() == 64)
7069 Exec = AMDGPU::EXEC;
7070 else
7071 return SDValue();
7072
7073 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7074 }
7075 }
7076
7077 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7078 // ISD::SETNE)
7079 return DAG.getNode(
7080 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7081 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7082}
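// For example, ballot(i1 true) is just a copy from EXEC/EXEC_LO, ballot(i1 false)
// folds to 0, and ballot(setcc a, b, cc) folds into a single AMDGPUISD::SETCC
// that produces the lane mask directly.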
7083
7085 SelectionDAG &DAG) {
7086 EVT VT = N->getValueType(0);
7087 unsigned ValSize = VT.getSizeInBits();
7088 unsigned IID = N->getConstantOperandVal(0);
7089 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7090 IID == Intrinsic::amdgcn_permlanex16;
7091 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7092 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7093 SDLoc SL(N);
7094 MVT IntVT = MVT::getIntegerVT(ValSize);
7095 const GCNSubtarget *ST = TLI.getSubtarget();
7096 unsigned SplitSize = 32;
7097 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7098 ST->hasDPALU_DPP() &&
7099 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7100 SplitSize = 64;
7101
7102 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7103 SDValue Src2, MVT ValT) -> SDValue {
7105 switch (IID) {
7106 case Intrinsic::amdgcn_permlane16:
7107 case Intrinsic::amdgcn_permlanex16:
7108 case Intrinsic::amdgcn_update_dpp:
7109 Operands.push_back(N->getOperand(6));
7110 Operands.push_back(N->getOperand(5));
7111 Operands.push_back(N->getOperand(4));
7112 [[fallthrough]];
7113 case Intrinsic::amdgcn_writelane:
7114 Operands.push_back(Src2);
7115 [[fallthrough]];
7116 case Intrinsic::amdgcn_readlane:
7117 case Intrinsic::amdgcn_set_inactive:
7118 case Intrinsic::amdgcn_set_inactive_chain_arg:
7119 case Intrinsic::amdgcn_mov_dpp8:
7120 Operands.push_back(Src1);
7121 [[fallthrough]];
7122 case Intrinsic::amdgcn_readfirstlane:
7123 case Intrinsic::amdgcn_permlane64:
7124 Operands.push_back(Src0);
7125 break;
7126 default:
7127 llvm_unreachable("unhandled lane op");
7128 }
7129
7130 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7131 std::reverse(Operands.begin(), Operands.end());
7132
7133 if (SDNode *GL = N->getGluedNode()) {
7134 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7135 GL = GL->getOperand(0).getNode();
7136 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7137 SDValue(GL, 0)));
7138 }
7139
7140 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7141 };
7142
7143 SDValue Src0 = N->getOperand(1);
7144 SDValue Src1, Src2;
7145 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7146 IID == Intrinsic::amdgcn_mov_dpp8 ||
7147 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7148 Src1 = N->getOperand(2);
7149 if (IID == Intrinsic::amdgcn_writelane ||
7150 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7151 Src2 = N->getOperand(3);
7152 }
7153
7154 if (ValSize == SplitSize) {
7155 // Already legal
7156 return SDValue();
7157 }
7158
7159 if (ValSize < 32) {
7160 bool IsFloat = VT.isFloatingPoint();
7161 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7162 SL, MVT::i32);
7163
7164 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7165 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7166 SL, MVT::i32);
7167 }
7168
7169 if (IID == Intrinsic::amdgcn_writelane) {
7170 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7171 SL, MVT::i32);
7172 }
7173
7174 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7175 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7176 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7177 }
7178
7179 if (ValSize % SplitSize != 0)
7180 return SDValue();
7181
7182 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7183 EVT VT = N->getValueType(0);
7184 unsigned NE = VT.getVectorNumElements();
7185 EVT EltVT = VT.getVectorElementType();
7187 unsigned NumOperands = N->getNumOperands();
7188 SmallVector<SDValue, 4> Operands(NumOperands);
7189 SDNode *GL = N->getGluedNode();
7190
7191 // only handle convergencectrl_glue
7192 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7193
7194 for (unsigned i = 0; i != NE; ++i) {
7195 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7196 ++j) {
7197 SDValue Operand = N->getOperand(j);
7198 EVT OperandVT = Operand.getValueType();
7199 if (OperandVT.isVector()) {
7200 // A vector operand; extract a single element.
7201 EVT OperandEltVT = OperandVT.getVectorElementType();
7202 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7203 Operand, DAG.getVectorIdxConstant(i, SL));
7204 } else {
7205 // A scalar operand; just use it as is.
7206 Operands[j] = Operand;
7207 }
7208 }
7209
7210 if (GL)
7211 Operands[NumOperands - 1] =
7212 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7213 SDValue(GL->getOperand(0).getNode(), 0));
7214
7215 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7216 }
7217
7218 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7219 return DAG.getBuildVector(VecVT, SL, Scalars);
7220 };
7221
7222 if (VT.isVector()) {
7223 switch (MVT::SimpleValueType EltTy =
7225 case MVT::i32:
7226 case MVT::f32:
7227 if (SplitSize == 32) {
7228 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7229 return unrollLaneOp(LaneOp.getNode());
7230 }
7231 [[fallthrough]];
7232 case MVT::i16:
7233 case MVT::f16:
7234 case MVT::bf16: {
7235 unsigned SubVecNumElt =
7236 SplitSize / VT.getVectorElementType().getSizeInBits();
7237 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7239 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7240 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7241 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7242 DAG.getConstant(EltIdx, SL, MVT::i32));
7243
7244 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7245 IsPermLane16)
7246 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7247 DAG.getConstant(EltIdx, SL, MVT::i32));
7248
7249 if (IID == Intrinsic::amdgcn_writelane)
7250 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7251 DAG.getConstant(EltIdx, SL, MVT::i32));
7252
7253 Pieces.push_back(
7254 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7255 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7256 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7257 EltIdx += SubVecNumElt;
7258 }
7259 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7260 }
7261 default:
7262 // Handle all other cases by bitcasting to i32 vectors
7263 break;
7264 }
7265 }
7266
7267 MVT VecVT =
7268 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7269 Src0 = DAG.getBitcast(VecVT, Src0);
7270
7271 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7272 Src1 = DAG.getBitcast(VecVT, Src1);
7273
7274 if (IID == Intrinsic::amdgcn_writelane)
7275 Src2 = DAG.getBitcast(VecVT, Src2);
7276
7277 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7278 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7279 return DAG.getBitcast(VT, UnrolledLaneOp);
7280}
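// For example, a 64-bit readlane with a 32-bit split is bitcast to v2i32,
// expanded into two 32-bit readlane nodes, and the results are bitcast back to
// the original type.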
7281
7284 SelectionDAG &DAG) const {
7285 switch (N->getOpcode()) {
7287 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7288 Results.push_back(Res);
7289 return;
7290 }
7292 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7293 Results.push_back(Res);
7294 return;
7295 }
7297 unsigned IID = N->getConstantOperandVal(0);
7298 switch (IID) {
7299 case Intrinsic::amdgcn_make_buffer_rsrc:
7300 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7301 return;
7302 case Intrinsic::amdgcn_cvt_pkrtz: {
7303 SDValue Src0 = N->getOperand(1);
7304 SDValue Src1 = N->getOperand(2);
7305 SDLoc SL(N);
7306 SDValue Cvt =
7307 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7308 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7309 return;
7310 }
7311 case Intrinsic::amdgcn_cvt_pknorm_i16:
7312 case Intrinsic::amdgcn_cvt_pknorm_u16:
7313 case Intrinsic::amdgcn_cvt_pk_i16:
7314 case Intrinsic::amdgcn_cvt_pk_u16: {
7315 SDValue Src0 = N->getOperand(1);
7316 SDValue Src1 = N->getOperand(2);
7317 SDLoc SL(N);
7318 unsigned Opcode;
7319
7320 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7321 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7322 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7323 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7324 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7325 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7326 else
7327 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7328
7329 EVT VT = N->getValueType(0);
7330 if (isTypeLegal(VT))
7331 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7332 else {
7333 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7334 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7335 }
7336 return;
7337 }
7338 case Intrinsic::amdgcn_s_buffer_load: {
7339 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7340 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7341 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7342 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7343 // s_buffer_load_i8.
7344 if (!Subtarget->hasScalarSubwordLoads())
7345 return;
7346 SDValue Op = SDValue(N, 0);
7347 SDValue Rsrc = Op.getOperand(1);
7348 SDValue Offset = Op.getOperand(2);
7349 SDValue CachePolicy = Op.getOperand(3);
7350 EVT VT = Op.getValueType();
7351 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7352 SDLoc DL(Op);
7354 const DataLayout &DataLayout = DAG.getDataLayout();
7355 Align Alignment =
7361 VT.getStoreSize(), Alignment);
7362 SDValue LoadVal;
7363 if (!Offset->isDivergent()) {
7364 SDValue Ops[] = {Rsrc, // source register
7365 Offset, CachePolicy};
7366 SDValue BufferLoad =
7368 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7369 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7370 } else {
7371 SDValue Ops[] = {
7372 DAG.getEntryNode(), // Chain
7373 Rsrc, // rsrc
7374 DAG.getConstant(0, DL, MVT::i32), // vindex
7375 {}, // voffset
7376 {}, // soffset
7377 {}, // offset
7378 CachePolicy, // cachepolicy
7379 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7380 };
7381 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7382 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7383 }
7384 Results.push_back(LoadVal);
7385 return;
7386 }
7387 case Intrinsic::amdgcn_dead: {
7388 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7389 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7390 return;
7391 }
7392 }
7393 break;
7394 }
7396 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7397 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7398 // FIXME: Hacky
7399 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7400 Results.push_back(Res.getOperand(I));
7401 }
7402 } else {
7403 Results.push_back(Res);
7404 Results.push_back(Res.getValue(1));
7405 }
7406 return;
7407 }
7408
7409 break;
7410 }
7411 case ISD::SELECT: {
7412 SDLoc SL(N);
7413 EVT VT = N->getValueType(0);
7414 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7415 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7416 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7417
7418 EVT SelectVT = NewVT;
7419 if (NewVT.bitsLT(MVT::i32)) {
7420 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7421 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7422 SelectVT = MVT::i32;
7423 }
7424
7425 SDValue NewSelect =
7426 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7427
7428 if (NewVT != SelectVT)
7429 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7430 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7431 return;
7432 }
7433 case ISD::FNEG: {
7434 if (N->getValueType(0) != MVT::v2f16)
7435 break;
7436
7437 SDLoc SL(N);
7438 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7439
7440 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7441 DAG.getConstant(0x80008000, SL, MVT::i32));
7442 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7443 return;
7444 }
7445 case ISD::FABS: {
7446 if (N->getValueType(0) != MVT::v2f16)
7447 break;
7448
7449 SDLoc SL(N);
7450 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7451
7452 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7453 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7454 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7455 return;
7456 }
7457 case ISD::FSQRT: {
7458 if (N->getValueType(0) != MVT::f16)
7459 break;
7460 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7461 break;
7462 }
7463 default:
7465 break;
7466 }
7467}
7468
7469/// Helper function for LowerBRCOND
7470static SDNode *findUser(SDValue Value, unsigned Opcode) {
7471
7472 for (SDUse &U : Value->uses()) {
7473 if (U.get() != Value)
7474 continue;
7475
7476 if (U.getUser()->getOpcode() == Opcode)
7477 return U.getUser();
7478 }
7479 return nullptr;
7480}
7481
7482unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7483 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7484 switch (Intr->getConstantOperandVal(1)) {
7485 case Intrinsic::amdgcn_if:
7486 return AMDGPUISD::IF;
7487 case Intrinsic::amdgcn_else:
7488 return AMDGPUISD::ELSE;
7489 case Intrinsic::amdgcn_loop:
7490 return AMDGPUISD::LOOP;
7491 case Intrinsic::amdgcn_end_cf:
7492 llvm_unreachable("should not occur");
7493 default:
7494 return 0;
7495 }
7496 }
7497
7498 // break, if_break, else_break are all only used as inputs to loop, not
7499 // directly as branch conditions.
7500 return 0;
7501}
7502
7509
7511 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7512 return false;
7513
7514 // FIXME: Either avoid relying on address space here or change the default
7515 // address space for functions to avoid the explicit check.
7516 return (GV->getValueType()->isFunctionTy() ||
7519}
7520
7522 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7523}
7524
7526 if (!GV->hasExternalLinkage())
7527 return true;
7528
7529 const auto OS = getTargetMachine().getTargetTriple().getOS();
7530 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7531}
7532
7533/// This transforms the control flow intrinsics to get the branch destination as
7534/// the last parameter; it also switches the branch target with BR if the need arises.
7535SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7536 SDLoc DL(BRCOND);
7537
7538 SDNode *Intr = BRCOND.getOperand(1).getNode();
7539 SDValue Target = BRCOND.getOperand(2);
7540 SDNode *BR = nullptr;
7541 SDNode *SetCC = nullptr;
7542
7543 if (Intr->getOpcode() == ISD::SETCC) {
7544 // As long as we negate the condition everything is fine
7545 SetCC = Intr;
7546 Intr = SetCC->getOperand(0).getNode();
7547
7548 } else {
7549 // Get the target from BR if we don't negate the condition
7550 BR = findUser(BRCOND, ISD::BR);
7551 assert(BR && "brcond missing unconditional branch user");
7552 Target = BR->getOperand(1);
7553 }
7554
7555 unsigned CFNode = isCFIntrinsic(Intr);
7556 if (CFNode == 0) {
7557 // This is a uniform branch so we don't need to legalize.
7558 return BRCOND;
7559 }
7560
7561 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7563
7564 assert(!SetCC ||
7565 (SetCC->getConstantOperandVal(1) == 1 &&
7566 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7567 ISD::SETNE));
7568
7569 // operands of the new intrinsic call
7571 if (HaveChain)
7572 Ops.push_back(BRCOND.getOperand(0));
7573
7574 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7575 Ops.push_back(Target);
7576
7577 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7578
7579 // build the new intrinsic call
7580 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7581
7582 if (!HaveChain) {
7583 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7584
7586 }
7587
7588 if (BR) {
7589 // Give the branch instruction our target
7590 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7591 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7592 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7593 }
7594
7595 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7596
7597 // Copy the intrinsic results to registers
7598 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7599 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7600 if (!CopyToReg)
7601 continue;
7602
7603 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7604 SDValue(Result, i - 1), SDValue());
7605
7606 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7607 }
7608
7609 // Remove the old intrinsic from the chain
7610 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7611 Intr->getOperand(0));
7612
7613 return Chain;
7614}
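// For example, a brcond whose condition comes from llvm.amdgcn.if is rebuilt as
// an AMDGPUISD::IF node with the branch destination appended as the last operand;
// users of the old intrinsic's results are then rewired to the new node.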
7615
7616SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7617 MVT VT = Op.getSimpleValueType();
7618 SDLoc DL(Op);
7619 // Checking the depth
7620 if (Op.getConstantOperandVal(0) != 0)
7621 return DAG.getConstant(0, DL, VT);
7622
7623 MachineFunction &MF = DAG.getMachineFunction();
7624 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7625 // Check for kernel and shader functions
7626 if (Info->isEntryFunction())
7627 return DAG.getConstant(0, DL, VT);
7628
7629 MachineFrameInfo &MFI = MF.getFrameInfo();
7630 // There is a call to @llvm.returnaddress in this function
7631 MFI.setReturnAddressIsTaken(true);
7632
7633 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7634 // Get the return address reg and mark it as an implicit live-in
7635 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7636 getRegClassFor(VT, Op.getNode()->isDivergent()));
7637
7638 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7639}
7640
7641SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7642 const SDLoc &DL, EVT VT) const {
7643 return Op.getValueType().bitsLE(VT)
7644 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7645 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7646 DAG.getTargetConstant(0, DL, MVT::i32));
7647}
7648
7649SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7650 SelectionDAG &DAG) const {
7651 EVT DstVT = Op.getValueType();
7652 unsigned NumElts = DstVT.getVectorNumElements();
7653 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7654
7655 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7656
7657 SDLoc DL(Op);
7658 unsigned Opc = Op.getOpcode();
7659 SDValue Flags = Op.getOperand(1);
7660 EVT HalfDstVT =
7661 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7662 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7663 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7664
7665 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7666}
7667
7668SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7669 SDValue Src = Op.getOperand(0);
7670 EVT SrcVT = Src.getValueType();
7671 EVT DstVT = Op.getValueType();
7672
7673 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7674 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7675 if (SrcVT.getScalarType() != MVT::f32)
7676 return SDValue();
7677 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7678 }
7679
7680 if (SrcVT.getScalarType() != MVT::f64)
7681 return Op;
7682
7683 SDLoc DL(Op);
7684 if (DstVT == MVT::f16) {
7685 // TODO: Handle strictfp
7686 if (Op.getOpcode() != ISD::FP_ROUND)
7687 return Op;
7688
7689 if (!Subtarget->has16BitInsts()) {
7690 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7691 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7692 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7693 }
7694 if (Op->getFlags().hasApproximateFuncs()) {
7695 SDValue Flags = Op.getOperand(1);
7696 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7697 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7698 }
7699 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7700 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7701 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7702 }
7703
7704 assert(DstVT.getScalarType() == MVT::bf16 &&
7705 "custom lower FP_ROUND for f16 or bf16");
7706 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7707
7708 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7709 // hardware f32 -> bf16 instruction.
7710 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7711 MVT::f32;
7712 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7713 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7714 DAG.getTargetConstant(0, DL, MVT::i32));
7715}
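// For example, an f64 -> bf16 round is done by narrowing to f32 with
// round-to-odd (avoiding a double-rounding error) and then using the hardware
// f32 -> bf16 conversion.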
7716
7717SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7718 SelectionDAG &DAG) const {
7719 EVT VT = Op.getValueType();
7720 const MachineFunction &MF = DAG.getMachineFunction();
7721 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7722 bool IsIEEEMode = Info->getMode().IEEE;
7723
7724 // FIXME: Assert during selection that this is only selected for
7725 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7726 // mode functions, but this happens to be OK since it's only done in cases
7727 // where it is known there is no sNaN.
7728 if (IsIEEEMode)
7729 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7730
7731 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7732 VT == MVT::v16bf16)
7733 return splitBinaryVectorOp(Op, DAG);
7734 return Op;
7735}
7736
7737SDValue
7738SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7739 SelectionDAG &DAG) const {
7740 EVT VT = Op.getValueType();
7741 const MachineFunction &MF = DAG.getMachineFunction();
7742 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7743 bool IsIEEEMode = Info->getMode().IEEE;
7744
7745 if (IsIEEEMode)
7746 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7747
7748 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7749 VT == MVT::v16bf16)
7750 return splitBinaryVectorOp(Op, DAG);
7751 return Op;
7752}
7753
7754SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7755 SelectionDAG &DAG) const {
7756 EVT VT = Op.getValueType();
7757 if (VT.isVector())
7758 return splitBinaryVectorOp(Op, DAG);
7759
7760 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7761 !Subtarget->hasMinimum3Maximum3F16() &&
7762 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7763 "should not need to widen f16 minimum/maximum to v2f16");
7764
7765 // Widen f16 operation to v2f16
7766
7767 // fminimum f16:x, f16:y ->
7768 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7769 // (v2f16 (scalar_to_vector y))), 0
7770 SDLoc SL(Op);
7771 SDValue WideSrc0 =
7772 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7773 SDValue WideSrc1 =
7774 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7775
7776 SDValue Widened =
7777 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7778
7779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7780 DAG.getConstant(0, SL, MVT::i32));
7781}
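// For example, (fminimum f16 x, y) on a subtarget that only has the packed f16
// min3/max3 forms is widened to v2f16 via scalar_to_vector, and the scalar
// result is taken from element 0.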
7782
7783SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7784 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7785 EVT VT = Op.getValueType();
7786 assert(VT == MVT::f16);
7787
7788 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7789 EVT ExpVT = Exp.getValueType();
7790 if (ExpVT == MVT::i16)
7791 return Op;
7792
7793 SDLoc DL(Op);
7794
7795 // Correct the exponent type for f16 to i16.
7796 // Clamp the range of the exponent to the instruction's range.
7797
7798 // TODO: This should be a generic narrowing legalization, and can easily be
7799 // done for GlobalISel.
7800
7801 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7802 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7803
7804 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7805 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7806
7807 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7808
7809 if (IsStrict) {
7810 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7811 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7812 }
7813
7814 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7815}
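// For example, ldexp(f16 x, i32 e) is lowered by clamping e to [-32768, 32767]
// with smax/smin and truncating it to i16 to match the instruction's exponent
// operand.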
7816
7818 switch (Op->getOpcode()) {
7819 case ISD::SRA:
7820 case ISD::SMIN:
7821 case ISD::SMAX:
7822 return ISD::SIGN_EXTEND;
7823 case ISD::SRL:
7824 case ISD::UMIN:
7825 case ISD::UMAX:
7826 return ISD::ZERO_EXTEND;
7827 case ISD::ADD:
7828 case ISD::SUB:
7829 case ISD::AND:
7830 case ISD::OR:
7831 case ISD::XOR:
7832 case ISD::SHL:
7833 case ISD::SELECT:
7834 case ISD::MUL:
7835 // operation result won't be influenced by garbage high bits.
7836 // TODO: are all of those cases correct, and are there more?
7837 return ISD::ANY_EXTEND;
7838 case ISD::SETCC: {
7839 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7841 }
7842 default:
7843 llvm_unreachable("unexpected opcode!");
7844 }
7845}
7846
7847SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7848 DAGCombinerInfo &DCI) const {
7849 const unsigned Opc = Op.getOpcode();
7850 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7851 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7852 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7853 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7854 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7855
7856 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7857 : Op->getOperand(0).getValueType();
7858 auto ExtTy = OpTy.changeElementType(MVT::i32);
7859
7860 if (DCI.isBeforeLegalizeOps() ||
7861 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7862 return SDValue();
7863
7864 auto &DAG = DCI.DAG;
7865
7866 SDLoc DL(Op);
7867 SDValue LHS;
7868 SDValue RHS;
7869 if (Opc == ISD::SELECT) {
7870 LHS = Op->getOperand(1);
7871 RHS = Op->getOperand(2);
7872 } else {
7873 LHS = Op->getOperand(0);
7874 RHS = Op->getOperand(1);
7875 }
7876
7877 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7878 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7879
7880 // Special case: for shifts, the RHS always needs a zext.
7881 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7882 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7883 else
7884 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7885
7886 // setcc always returns i1/i1 vec, so no need to truncate after.
7887 if (Opc == ISD::SETCC) {
7888 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7889 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7890 }
7891
7892 // For other ops, we extend the operation's return type as well so we need to
7893 // truncate back to the original type.
7894 SDValue NewVal;
7895 if (Opc == ISD::SELECT)
7896 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7897 else
7898 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7899
7900 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7901}
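// For example, a uniform (add i16 a, b) becomes trunc (add i32 (anyext a), (anyext b));
// shift amounts are always zero-extended, and setcc needs no truncation since its
// result stays i1.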
7902
7903SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7904 SDValue Mag = Op.getOperand(0);
7905 EVT MagVT = Mag.getValueType();
7906
7907 if (MagVT.getVectorNumElements() > 2)
7908 return splitBinaryVectorOp(Op, DAG);
7909
7910 SDValue Sign = Op.getOperand(1);
7911 EVT SignVT = Sign.getValueType();
7912
7913 if (MagVT == SignVT)
7914 return Op;
7915
7916 // fcopysign v2f16:mag, v2f32:sign ->
7917 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7918
7919 SDLoc SL(Op);
7920 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7921 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7922
7923 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7924
7925 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7926}
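// For example, fcopysign(v2f16 mag, v2f32 sign) bitcasts the sign to v2i32,
// truncates it to v2i16, and bitcasts that to v2f16 so both operands have
// matching element widths.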
7927
7928// Custom lowering for vector multiplications and s_mul_u64.
7929SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7930 EVT VT = Op.getValueType();
7931
7932 // Split vector operands.
7933 if (VT.isVector())
7934 return splitBinaryVectorOp(Op, DAG);
7935
7936 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7937
7938 // There are four ways to lower s_mul_u64:
7939 //
7940 // 1. If all the operands are uniform, then we lower it as it is.
7941 //
7942 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
7943 // multiplications because there is not a vector equivalent of s_mul_u64.
7944 //
7945 // 3. If the cost model decides that it is more efficient to use vector
7946 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
7947 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7948 //
7949 // 4. If the cost model decides to use vector registers and both of the
7950 // operands are zero-extended/sign-extended from 32-bits, then we split the
7951 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7952 // possible to check if the operands are zero-extended or sign-extended in
7953 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7954 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7955 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7956 // If the cost model decides that we have to use vector registers, then
7957 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7958 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7959 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7960 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7961 // SIInstrInfo.cpp .
7962
7963 if (Op->isDivergent())
7964 return SDValue();
7965
7966 SDValue Op0 = Op.getOperand(0);
7967 SDValue Op1 = Op.getOperand(1);
7968 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7969 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7970 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7971 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7972 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7973 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7974 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7975 SDLoc SL(Op);
7976 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7977 return SDValue(
7978 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7979 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7980 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7981 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7982 return SDValue(
7983 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7984 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7985 return Op;
7986}
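// For example, a uniform (mul i64 a, b) where both operands are known to have
// their top 32 bits clear is emitted as S_MUL_U64_U32_PSEUDO so that it can
// still be split into 32-bit multiplies later if it ends up on vector registers.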
7987
7988SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7989 EVT VT = Op.getValueType();
7990 SDLoc SL(Op);
7991 SDValue LHS = Op.getOperand(0);
7992 SDValue RHS = Op.getOperand(1);
7993 bool isSigned = Op.getOpcode() == ISD::SMULO;
7994
7995 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7996 const APInt &C = RHSC->getAPIntValue();
7997 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7998 if (C.isPowerOf2()) {
7999 // smulo(x, signed_min) is the same as umulo(x, signed_min).
8000 bool UseArithShift = isSigned && !C.isMinSignedValue();
8001 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8002 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8003 SDValue Overflow =
8004 DAG.getSetCC(SL, MVT::i1,
8005 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8006 Result, ShiftAmt),
8007 LHS, ISD::SETNE);
8008 return DAG.getMergeValues({Result, Overflow}, SL);
8009 }
8010 }
8011
8012 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8013 SDValue Top =
8014 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8015
8016 SDValue Sign = isSigned
8017 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8018 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8019 SL, MVT::i32))
8020 : DAG.getConstant(0, SL, VT);
8021 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8022
8023 return DAG.getMergeValues({Result, Overflow}, SL);
8024}
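// For example, umulo(x, 8) becomes Result = x << 3 and
// Overflow = ((x << 3) >> 3) != x, avoiding the multiply-high of the general path.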
8025
8026SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8027 if (Op->isDivergent()) {
8028 // Select to V_MAD_[IU]64_[IU]32.
8029 return Op;
8030 }
8031 if (Subtarget->hasSMulHi()) {
8032 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8033 return SDValue();
8034 }
8035 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8036 // calculate the high part, so we might as well do the whole thing with
8037 // V_MAD_[IU]64_[IU]32.
8038 return Op;
8039}
8040
8041SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8042 if (!Subtarget->isTrapHandlerEnabled() ||
8043 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8044 return lowerTrapEndpgm(Op, DAG);
8045
8046 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8047 : lowerTrapHsaQueuePtr(Op, DAG);
8048}
8049
8050SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8051 SDLoc SL(Op);
8052 SDValue Chain = Op.getOperand(0);
8053 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8054}
8055
8056SDValue
8057SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8058 const SDLoc &DL, Align Alignment,
8059 ImplicitParameter Param) const {
8060 MachineFunction &MF = DAG.getMachineFunction();
8061 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8062 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8063 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8064 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8067}
8068
8069SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8070 SelectionDAG &DAG) const {
8071 SDLoc SL(Op);
8072 SDValue Chain = Op.getOperand(0);
8073
8074 SDValue QueuePtr;
8075 // For code object version 5, QueuePtr is passed through implicit kernarg.
8076 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8078 QueuePtr =
8079 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8080 } else {
8081 MachineFunction &MF = DAG.getMachineFunction();
8082 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8083 Register UserSGPR = Info->getQueuePtrUserSGPR();
8084
8085 if (UserSGPR == AMDGPU::NoRegister) {
8086 // We probably are in a function incorrectly marked with
8087 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8088 // trap, so just use a null pointer.
8089 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8090 } else {
8091 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8092 MVT::i64);
8093 }
8094 }
8095
8096 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8097 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8098
8099 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8100 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8101 ToReg.getValue(1)};
8102 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8103}
8104
8105SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8106 SDLoc SL(Op);
8107 SDValue Chain = Op.getOperand(0);
8108
8109 // We need to simulate the 's_trap 2' instruction on targets that run in
8110 // PRIV=1 (where it is treated as a nop).
8111 if (Subtarget->hasPrivEnabledTrap2NopBug())
8112 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8113
8114 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8115 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8116 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8117}
8118
8119SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8120 SDLoc SL(Op);
8121 SDValue Chain = Op.getOperand(0);
8122 MachineFunction &MF = DAG.getMachineFunction();
8123
8124 if (!Subtarget->isTrapHandlerEnabled() ||
8125 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8126 LLVMContext &Ctx = MF.getFunction().getContext();
8127 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8128 "debugtrap handler not supported",
8129 Op.getDebugLoc(), DS_Warning));
8130 return Chain;
8131 }
8132
8133 uint64_t TrapID =
8134 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8135 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8136 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8137}
8138
8139SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8140 SelectionDAG &DAG) const {
8141 if (Subtarget->hasApertureRegs()) {
8142 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8143 ? AMDGPU::SRC_SHARED_BASE
8144 : AMDGPU::SRC_PRIVATE_BASE;
8145 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8146 !Subtarget->hasGloballyAddressableScratch()) &&
8147 "Cannot use src_private_base with globally addressable scratch!");
8148 // Note: this feature (register) is broken. When used as a 32-bit operand,
8149 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8150 // bits.
8151 //
8152 // To work around the issue, emit a 64-bit copy from this register,
8153 // then extract the high bits. Note that this shouldn't even result in a
8154 // shift being emitted; it should simply become a pair of registers (e.g.):
8155 // s_mov_b64 s[6:7], src_shared_base
8156 // v_mov_b32_e32 v1, s7
8157 SDValue Copy =
8158 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8159 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8160 }
8161
8162 // For code object version 5, private_base and shared_base are passed through
8163 // implicit kernargs.
8164 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8168 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8169 }
8170
8171 MachineFunction &MF = DAG.getMachineFunction();
8172 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8173 Register UserSGPR = Info->getQueuePtrUserSGPR();
8174 if (UserSGPR == AMDGPU::NoRegister) {
8175 // We probably are in a function incorrectly marked with
8176 // amdgpu-no-queue-ptr. This is undefined.
8177 return DAG.getPOISON(MVT::i32);
8178 }
8179
8180 SDValue QueuePtr =
8181 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8182
8183 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8184 // private_segment_aperture_base_hi.
8185 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8186
8187 SDValue Ptr =
8188 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8189
8190 // TODO: Use custom target PseudoSourceValue.
8191 // TODO: We should use the value from the IR intrinsic call, but it might not
8192 // be available and how do we get it?
8193 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8194 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8195 commonAlignment(Align(64), StructOffset),
8198}
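// For example, without aperture registers and with a code object version below 5,
// the shared aperture base is loaded from amd_queue_t at offset 0x40 (0x44 for
// the private aperture) via the queue pointer.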
8199
8200/// Return true if the value is a known valid address, such that a null check is
8201/// not necessary.
8203 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8205 return true;
8206
8207 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8208 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8209
8210 // TODO: Search through arithmetic, handle arguments and loads
8211 // marked nonnull.
8212 return false;
8213}
8214
8215SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8216 SelectionDAG &DAG) const {
8217 SDLoc SL(Op);
8218
8219 const AMDGPUTargetMachine &TM =
8220 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8221
8222 unsigned DestAS, SrcAS;
8223 SDValue Src;
8224 bool IsNonNull = false;
8225 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8226 SrcAS = ASC->getSrcAddressSpace();
8227 Src = ASC->getOperand(0);
8228 DestAS = ASC->getDestAddressSpace();
8229 } else {
8230 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8231 Op.getConstantOperandVal(0) ==
8232 Intrinsic::amdgcn_addrspacecast_nonnull);
8233 Src = Op->getOperand(1);
8234 SrcAS = Op->getConstantOperandVal(2);
8235 DestAS = Op->getConstantOperandVal(3);
8236 IsNonNull = true;
8237 }
8238
8239 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8240
8241 // flat -> local/private
8242 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8243 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8244 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8245 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8246
8247 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8248 Subtarget->hasGloballyAddressableScratch()) {
8249 // flat -> private with globally addressable scratch: subtract
8250 // src_flat_scratch_base_lo.
8251 SDValue FlatScratchBaseLo(
8252 DAG.getMachineNode(
8253 AMDGPU::S_MOV_B32, SL, MVT::i32,
8254 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8255 0);
8256 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8257 }
8258
8259 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8260 return Ptr;
8261
8262 unsigned NullVal = TM.getNullPointerValue(DestAS);
8263 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8264 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8265
8266 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8267 SegmentNullPtr);
8268 }
8269 }
8270
8271 // local/private -> flat
8272 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8273 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8274 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8275 SDValue CvtPtr;
8276 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8277 Subtarget->hasGloballyAddressableScratch()) {
8278 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8279 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
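 // A quick check of the shift computed below: the TID field starts at bit
 // 57 - wavefrontSizeLog2 of the 64-bit address, so within the high dword the
 // shift is 57 - 32 - wavefrontSizeLog2, i.e. 20 (bit 52) for wave32 and
 // 19 (bit 51) for wave64, matching the formulas above.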
8280 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8281 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8282 ThreadID = DAG.getNode(
8283 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8284 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8285 AllOnes, ThreadID);
8286 if (Subtarget->isWave64())
8287 ThreadID = DAG.getNode(
8288 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8289 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8290 AllOnes, ThreadID);
8291 SDValue ShAmt = DAG.getShiftAmountConstant(
8292 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8293 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8294 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8295 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8296 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8297 // 64-bit hi:lo value.
8298 SDValue FlatScratchBase = {
8299 DAG.getMachineNode(
8300 AMDGPU::S_MOV_B64, SL, MVT::i64,
8301 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8302 0};
8303 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8304 } else {
8305 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8306 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8307 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8308 }
8309
8310 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8311 return CvtPtr;
8312
8313 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8314 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8315
8316 SDValue NonNull =
8317 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8318
8319 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8320 FlatNullPtr);
8321 }
8322 }
8323
8324 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8325 Op.getValueType() == MVT::i64) {
8326 const SIMachineFunctionInfo *Info =
8327 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8328 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8329 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8330 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8331 }
8332
8333 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8334 Src.getValueType() == MVT::i64)
8335 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8336
8337 // global <-> flat are no-ops and never emitted.
8338
8339 // Invalid casts are poison.
8340 return DAG.getPOISON(Op->getValueType(0));
8341}
8342
8343// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8344// the small vector and inserting them into the big vector. That is better than
8345// the default expansion of doing it via a stack slot. Even though the use of
8346// the stack slot would be optimized away afterwards, the stack slot itself
8347// remains.
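// For example, inserting a v2i16 subvector into a v4i16 vector at index 2
// takes the 16-bit path below: both vectors are bitcast to i32 pieces and the
// insert becomes a single 32-bit INSERT_VECTOR_ELT at index 1, with no stack
// slot involved.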
8348SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8349 SelectionDAG &DAG) const {
8350 SDValue Vec = Op.getOperand(0);
8351 SDValue Ins = Op.getOperand(1);
8352 SDValue Idx = Op.getOperand(2);
8353 EVT VecVT = Vec.getValueType();
8354 EVT InsVT = Ins.getValueType();
8355 EVT EltVT = VecVT.getVectorElementType();
8356 unsigned InsNumElts = InsVT.getVectorNumElements();
8357 unsigned IdxVal = Idx->getAsZExtVal();
8358 SDLoc SL(Op);
8359
8360 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8361 // Insert 32-bit registers at a time.
8362 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8363
8364 unsigned VecNumElts = VecVT.getVectorNumElements();
8365 EVT NewVecVT =
8366 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8367 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8368 : EVT::getVectorVT(*DAG.getContext(),
8369 MVT::i32, InsNumElts / 2);
8370
8371 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8372 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8373
8374 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8375 SDValue Elt;
8376 if (InsNumElts == 2) {
8377 Elt = Ins;
8378 } else {
8379 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8380 DAG.getConstant(I, SL, MVT::i32));
8381 }
8382 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8383 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8384 }
8385
8386 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8387 }
8388
8389 for (unsigned I = 0; I != InsNumElts; ++I) {
8390 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8391 DAG.getConstant(I, SL, MVT::i32));
8392 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8393 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8394 }
8395 return Vec;
8396}
8397
8398SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8399 SelectionDAG &DAG) const {
8400 SDValue Vec = Op.getOperand(0);
8401 SDValue InsVal = Op.getOperand(1);
8402 SDValue Idx = Op.getOperand(2);
8403 EVT VecVT = Vec.getValueType();
8404 EVT EltVT = VecVT.getVectorElementType();
8405 unsigned VecSize = VecVT.getSizeInBits();
8406 unsigned EltSize = EltVT.getSizeInBits();
8407 SDLoc SL(Op);
8408
8409 // Specially handle the case of v4i16 with static indexing.
8410 unsigned NumElts = VecVT.getVectorNumElements();
8411 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8412 if (NumElts == 4 && EltSize == 16 && KIdx) {
8413 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8414
8415 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8416 DAG.getConstant(0, SL, MVT::i32));
8417 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8418 DAG.getConstant(1, SL, MVT::i32));
8419
8420 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8421 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8422
8423 unsigned Idx = KIdx->getZExtValue();
8424 bool InsertLo = Idx < 2;
8425 SDValue InsHalf = DAG.getNode(
8426 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8427 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8428 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8429
8430 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8431
8432 SDValue Concat =
8433 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8434 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8435
8436 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8437 }
8438
8439 // Static indexing does not lower to stack access, and hence there is no need
8440 // for special custom lowering to avoid stack access.
8441 if (isa<ConstantSDNode>(Idx))
8442 return SDValue();
8443
8444 // Avoid stack access for dynamic indexing by custom lowering to
8445 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
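 // For example, for a v2i16 vector and a dynamic index Idx, the code below
 // forms ScaledIdx = Idx * 16, BFM = 0xffff << ScaledIdx, and then computes
 // (BFM & splat(val)) | (~BFM & vec), which maps onto v_bfi_b32.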
8446
8447 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8448
8449 MVT IntVT = MVT::getIntegerVT(VecSize);
8450
8451 // Convert vector index to bit-index and get the required bit mask.
8452 assert(isPowerOf2_32(EltSize));
8453 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8454 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8455 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8456 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8457 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8458
8459 // 1. Create a congruent vector with the target value in each element.
8460 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8461 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8462
8463 // 2. Mask off all other indices except the required index within (1).
8464 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8465
8466 // 3. Mask off the required index within the target vector.
8467 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8468 SDValue RHS =
8469 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8470
8471 // 4. Get (2) and (3) ORed into the target vector.
8472 SDValue BFI =
8473 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8474
8475 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8476}
8477
8478SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8479 SelectionDAG &DAG) const {
8480 SDLoc SL(Op);
8481
8482 EVT ResultVT = Op.getValueType();
8483 SDValue Vec = Op.getOperand(0);
8484 SDValue Idx = Op.getOperand(1);
8485 EVT VecVT = Vec.getValueType();
8486 unsigned VecSize = VecVT.getSizeInBits();
8487 EVT EltVT = VecVT.getVectorElementType();
8488
8489 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8490
8491 // Make sure we do any optimizations that will make it easier to fold
8492 // source modifiers before obscuring it with bit operations.
8493
8494 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8495 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8496 return Combined;
8497
8498 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8499 SDValue Lo, Hi;
8500 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8501
8502 if (VecSize == 128) {
8503 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8504 Lo = DAG.getBitcast(LoVT,
8505 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8506 DAG.getConstant(0, SL, MVT::i32)));
8507 Hi = DAG.getBitcast(HiVT,
8508 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8509 DAG.getConstant(1, SL, MVT::i32)));
8510 } else if (VecSize == 256) {
8511 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8512 SDValue Parts[4];
8513 for (unsigned P = 0; P < 4; ++P) {
8514 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8515 DAG.getConstant(P, SL, MVT::i32));
8516 }
8517
8518 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8519 Parts[0], Parts[1]));
8520 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8521 Parts[2], Parts[3]));
8522 } else {
8523 assert(VecSize == 512);
8524
8525 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8526 SDValue Parts[8];
8527 for (unsigned P = 0; P < 8; ++P) {
8528 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8529 DAG.getConstant(P, SL, MVT::i32));
8530 }
8531
8532 Lo = DAG.getBitcast(LoVT,
8533 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8534 Parts[0], Parts[1], Parts[2], Parts[3]));
8535 Hi = DAG.getBitcast(HiVT,
8536 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8537 Parts[4], Parts[5], Parts[6], Parts[7]));
8538 }
8539
8540 EVT IdxVT = Idx.getValueType();
8541 unsigned NElem = VecVT.getVectorNumElements();
8542 assert(isPowerOf2_32(NElem));
8543 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8544 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8545 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8546 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8547 }
8548
8549 assert(VecSize <= 64);
8550
8551 MVT IntVT = MVT::getIntegerVT(VecSize);
8552
8553 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8554 SDValue VecBC = peekThroughBitcasts(Vec);
8555 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8556 SDValue Src = VecBC.getOperand(0);
8557 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8558 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8559 }
8560
8561 unsigned EltSize = EltVT.getSizeInBits();
8562 assert(isPowerOf2_32(EltSize));
8563
8564 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8565
8566 // Convert vector index to bit-index (* EltSize)
8567 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8568
8569 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8570 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8571
8572 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8573 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8574 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8575 }
8576
8577 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8578}
8579
8580static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8581 assert(Elt % 2 == 0);
8582 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8583}
8584
8585static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8586 assert(Elt % 2 == 0);
8587 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8588 !(Mask[Elt + 1] & 1);
8589}
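// For example, with mask <4,5,2,3> the pair starting at element 0 reads
// source elements 4 and 5, so it is contiguous and even-aligned; with mask
// <3,2,7,6> the pair starting at element 0 reads elements 3 and 2, which is
// the odd-to-even case.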
8590
8591SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8592 SelectionDAG &DAG) const {
8593 SDLoc SL(Op);
8594 EVT ResultVT = Op.getValueType();
8595 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8596 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8597 const int NewSrcNumElts = 2;
8598 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8599 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8600
8601 // Break up the shuffle into registers sized pieces.
8602 //
8603 // We're trying to form sub-shuffles that the register allocation pipeline
8604 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8605 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8606 // pair of copies into a consecutive register copy, so use the ordinary
8607 // extract_vector_elt lowering unless we can use the shuffle.
8608 //
8609 // TODO: This is a bit of a hack, and we should probably always use
8610 // extract_subvector for the largest possible subvector we can (or at least
8611 // use it for PackVT aligned pieces). However, we have worse support for
8612 // combines on them and don't directly treat extract_subvector / insert_subvector
8613 // as legal. The DAG scheduler also ends up doing a worse job with the
8614 // extract_subvectors.
8615 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8616
8617 // vector_shuffle <0,1,6,7> lhs, rhs
8618 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8619 //
8620 // vector_shuffle <6,7,2,3> lhs, rhs
8621 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8622 //
8623 // vector_shuffle <6,7,0,1> lhs, rhs
8624 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8625
8626 // Avoid scalarizing when both halves are reading from consecutive elements.
8627
8628 // If we're treating 2 element shuffles as legal, also create odd-to-even
8629 // shuffles of neighboring pairs.
8630 //
8631 // vector_shuffle <3,2,7,6> lhs, rhs
8632 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8633 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8634
8635 SmallVector<SDValue, 16> Pieces;
8636 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8637 if (ShouldUseConsecutiveExtract &&
8638 elementPairIsContiguous(SVN->getMask(), I)) {
8639 const int Idx = SVN->getMaskElt(I);
8640 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8641 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8642 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8643 SVN->getOperand(VecIdx),
8644 DAG.getConstant(EltIdx, SL, MVT::i32));
8645 Pieces.push_back(SubVec);
8646 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8647 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8648 int Idx0 = SVN->getMaskElt(I);
8649 int Idx1 = SVN->getMaskElt(I + 1);
8650
8651 SDValue SrcOp0 = SVN->getOperand(0);
8652 SDValue SrcOp1 = SrcOp0;
8653 if (Idx0 >= SrcNumElts) {
8654 SrcOp0 = SVN->getOperand(1);
8655 Idx0 -= SrcNumElts;
8656 }
8657
8658 if (Idx1 >= SrcNumElts) {
8659 SrcOp1 = SVN->getOperand(1);
8660 Idx1 -= SrcNumElts;
8661 }
8662
8663 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8664 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8665
8666 // Extract nearest even aligned piece.
8667 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8668 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8669 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8670 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8671
8672 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8673 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8674
8675 SDValue Result0 = SubVec0;
8676 SDValue Result1 = SubVec0;
8677
8678 if (SubVec0 != SubVec1) {
8679 NewMaskIdx1 += NewSrcNumElts;
8680 Result1 = SubVec1;
8681 } else {
8682 Result1 = DAG.getPOISON(PackVT);
8683 }
8684
8685 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8686 {NewMaskIdx0, NewMaskIdx1});
8687 Pieces.push_back(Shuf);
8688 } else {
8689 const int Idx0 = SVN->getMaskElt(I);
8690 const int Idx1 = SVN->getMaskElt(I + 1);
8691 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8692 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8693 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8694 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8695
8696 SDValue Vec0 = SVN->getOperand(VecIdx0);
8697 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8698 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8699
8700 SDValue Vec1 = SVN->getOperand(VecIdx1);
8701 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8702 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8703 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8704 }
8705 }
8706
8707 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8708}
8709
8710SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8711 SelectionDAG &DAG) const {
8712 SDValue SVal = Op.getOperand(0);
8713 EVT ResultVT = Op.getValueType();
8714 EVT SValVT = SVal.getValueType();
8715 SDValue UndefVal = DAG.getPOISON(SValVT);
8716 SDLoc SL(Op);
8717
8718 SmallVector<SDValue, 8> VElts;
8719 VElts.push_back(SVal);
8720 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8721 VElts.push_back(UndefVal);
8722
8723 return DAG.getBuildVector(ResultVT, SL, VElts);
8724}
8725
8726SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8727 SelectionDAG &DAG) const {
8728 SDLoc SL(Op);
8729 EVT VT = Op.getValueType();
8730
8731 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8732 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8733
8734 SDValue Lo = Op.getOperand(0);
8735 SDValue Hi = Op.getOperand(1);
8736
8737 // Avoid adding defined bits with the zero_extend.
8738 if (Hi.isUndef()) {
8739 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8740 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8741 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8742 }
8743
8744 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8745 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8746
8747 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8748 DAG.getConstant(16, SL, MVT::i32));
8749 if (Lo.isUndef())
8750 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8751
8752 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8753 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8754
8755 SDValue Or =
8756 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8757 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8758 }
8759
8760 // Split into 2-element chunks.
8761 const unsigned NumParts = VT.getVectorNumElements() / 2;
8762 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8763 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8764
8765 SmallVector<SDValue> Casts;
8766 for (unsigned P = 0; P < NumParts; ++P) {
8767 SDValue Vec = DAG.getBuildVector(
8768 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8769 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8770 }
8771
8772 SDValue Blend =
8773 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8774 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8775}
8776
8777bool SITargetLowering::isOffsetFoldingLegal(
8778 const GlobalAddressSDNode *GA) const {
8779 // OSes that use ELF REL relocations (instead of RELA) can only store a
8780 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8781 // which can create arbitrary 64-bit addends. (This is only a problem for
8782 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8783 // the high 32 bits of the addend.)
8784 //
8785 // This should be kept in sync with how HasRelocationAddend is initialized in
8786 // the constructor of ELFAMDGPUAsmBackend.
8787 if (!Subtarget->isAmdHsaOS())
8788 return false;
8789
8790 // We can fold offsets for anything that doesn't require a GOT relocation.
8791 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8792 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8793 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8794 !shouldEmitGOTReloc(GA->getGlobal());
8795}
8796
8797static SDValue
8798buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8799 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8800 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8801 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8802 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8803 // lowered to the following code sequence:
8804 //
8805 // For constant address space:
8806 // s_getpc_b64 s[0:1]
8807 // s_add_u32 s0, s0, $symbol
8808 // s_addc_u32 s1, s1, 0
8809 //
8810 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8811 // a fixup or relocation is emitted to replace $symbol with a literal
8812 // constant, which is a pc-relative offset from the encoding of the $symbol
8813 // operand to the global variable.
8814 //
8815 // For global address space:
8816 // s_getpc_b64 s[0:1]
8817 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8818 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8819 //
8820 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8821 // fixups or relocations are emitted to replace $symbol@*@lo and
8822 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8823 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8824 // operand to the global variable.
8825 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8826 assert(GAFlags != SIInstrInfo::MO_NONE);
8827
8828 SDValue Ptr =
8829 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8830 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8831 }
8832
8833 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8834 SDValue PtrHi;
8835 if (GAFlags == SIInstrInfo::MO_NONE)
8836 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8837 else
8838 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8839 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8840}
8841
8842SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8843 SDValue Op,
8844 SelectionDAG &DAG) const {
8845 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8846 SDLoc DL(GSD);
8847 EVT PtrVT = Op.getValueType();
8848
8849 const GlobalValue *GV = GSD->getGlobal();
8850 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8851 shouldUseLDSConstAddress(GV)) ||
8852 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8853 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8854 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8855 GV->hasExternalLinkage()) {
8856 Type *Ty = GV->getValueType();
8857 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8858 // zero-sized type in other languages to declare dynamic shared
8859 // memory whose size is not known at compile time. These arrays are
8860 // allocated by the runtime and placed directly after the statically
8861 // allocated ones. They all share the same offset.
8862 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8863 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8864 // Adjust alignment for that dynamic shared memory array.
8865 Function &F = DAG.getMachineFunction().getFunction();
8866 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8867 MFI->setUsesDynamicLDS(true);
8868 return SDValue(
8869 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8870 }
8871 }
8872 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8873 }
8874
8875 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8876 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8877 SIInstrInfo::MO_ABS32_LO);
8878 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8879 }
8880
8881 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8882 if (Subtarget->has64BitLiterals()) {
8883 SDValue Addr = DAG.getTargetGlobalAddress(
8884 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8885 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8886 0);
8887 }
8888
8889 SDValue AddrLo = DAG.getTargetGlobalAddress(
8890 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8891 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8892
8893 SDValue AddrHi = DAG.getTargetGlobalAddress(
8894 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8895 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8896
8897 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8898 }
8899
8900 if (shouldEmitFixup(GV))
8901 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8902
8903 if (shouldEmitPCReloc(GV))
8904 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8905 SIInstrInfo::MO_REL32);
8906
8907 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8908 SIInstrInfo::MO_GOTPCREL32);
8909 PointerType *PtrTy =
8910 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8911 const DataLayout &DataLayout = DAG.getDataLayout();
8912 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8913 MachinePointerInfo PtrInfo =
8914 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8915
8916 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8917 MachineMemOperand::MODereferenceable |
8918 MachineMemOperand::MOInvariant);
8919}
8920
8921SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8922 const SDLoc &DL, SDValue V) const {
8923 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8924 // the destination register.
8925 //
8926 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8927 // so we will end up with redundant moves to m0.
8928 //
8929 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8930
8931 // A Null SDValue creates a glue result.
8932 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8933 V, Chain);
8934 return SDValue(M0, 0);
8935}
8936
8937SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8938 MVT VT,
8939 unsigned Offset) const {
8940 SDLoc SL(Op);
8941 SDValue Param = lowerKernargMemParameter(
8942 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8943 // The local size values will have the hi 16-bits as zero.
8944 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8945 DAG.getValueType(VT));
8946}
8947
8948static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8949 EVT VT) {
8950 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8951 DAG.getMachineFunction().getFunction(),
8952 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8953 return DAG.getPOISON(VT);
8954}
8955
8956static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8957 EVT VT) {
8958 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8959 DAG.getMachineFunction().getFunction(),
8960 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8961 return DAG.getPOISON(VT);
8962}
8963
8964static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8965 ArrayRef<SDValue> Elts) {
8966 assert(!Elts.empty());
8967 MVT Type;
8968 unsigned NumElts = Elts.size();
8969
8970 if (NumElts <= 12) {
8971 Type = MVT::getVectorVT(MVT::f32, NumElts);
8972 } else {
8973 assert(Elts.size() <= 16);
8974 Type = MVT::v16f32;
8975 NumElts = 16;
8976 }
8977
8978 SmallVector<SDValue, 16> VecElts(NumElts);
8979 for (unsigned i = 0; i < Elts.size(); ++i) {
8980 SDValue Elt = Elts[i];
8981 if (Elt.getValueType() != MVT::f32)
8982 Elt = DAG.getBitcast(MVT::f32, Elt);
8983 VecElts[i] = Elt;
8984 }
8985 for (unsigned i = Elts.size(); i < NumElts; ++i)
8986 VecElts[i] = DAG.getPOISON(MVT::f32);
8987
8988 if (NumElts == 1)
8989 return VecElts[0];
8990 return DAG.getBuildVector(Type, DL, VecElts);
8991}
8992
8993static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8994 SDValue Src, int ExtraElts) {
8995 EVT SrcVT = Src.getValueType();
8996
8997 SmallVector<SDValue, 8> Elts;
8998
8999 if (SrcVT.isVector())
9000 DAG.ExtractVectorElements(Src, Elts);
9001 else
9002 Elts.push_back(Src);
9003
9004 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9005 while (ExtraElts--)
9006 Elts.push_back(Undef);
9007
9008 return DAG.getBuildVector(CastVT, DL, Elts);
9009}
9010
9011// Re-construct the required return value for an image load intrinsic.
9012// This is more complicated due to the optional use of TexFailCtrl, which
9013// means the required return type is an aggregate.
9014static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9015 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9016 bool Unpacked, bool IsD16, int DMaskPop,
9017 int NumVDataDwords, bool IsAtomicPacked16Bit,
9018 const SDLoc &DL) {
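 // For example, when the intrinsic requested TexFailCtrl, the instruction
 // returns one extra dword: the code below peels the dmask-covered dwords off
 // the front of the result, converts them to the requested (possibly D16)
 // type, extracts the following dword as the texfail value, and merges
 // {data, texfail, chain}.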
9019 // Determine the required return type. This is the same regardless of
9020 // IsTexFail flag
9021 EVT ReqRetVT = ResultTypes[0];
9022 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9023 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9024 ? (ReqRetNumElts + 1) / 2
9025 : ReqRetNumElts;
9026
9027 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9028
9029 MVT DataDwordVT =
9030 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9031
9032 MVT MaskPopVT =
9033 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9034
9035 SDValue Data(Result, 0);
9036 SDValue TexFail;
9037
9038 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9039 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9040 if (MaskPopVT.isVector()) {
9041 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9042 SDValue(Result, 0), ZeroIdx);
9043 } else {
9044 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9045 SDValue(Result, 0), ZeroIdx);
9046 }
9047 }
9048
9049 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9050 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9051 NumDataDwords - MaskPopDwords);
9052
9053 if (IsD16)
9054 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9055
9056 EVT LegalReqRetVT = ReqRetVT;
9057 if (!ReqRetVT.isVector()) {
9058 if (!Data.getValueType().isInteger())
9059 Data = DAG.getNode(ISD::BITCAST, DL,
9060 Data.getValueType().changeTypeToInteger(), Data);
9061 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9062 } else {
9063 // We need to widen the return vector to a legal type
9064 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9065 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9066 LegalReqRetVT =
9067 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9068 ReqRetVT.getVectorNumElements() + 1);
9069 }
9070 }
9071 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9072
9073 if (IsTexFail) {
9074 TexFail =
9075 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9076 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9077
9078 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9079 }
9080
9081 if (Result->getNumValues() == 1)
9082 return Data;
9083
9084 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9085}
9086
9087static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9088 SDValue *LWE, bool &IsTexFail) {
9089 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9090
9091 uint64_t Value = TexFailCtrlConst->getZExtValue();
9092 if (Value) {
9093 IsTexFail = true;
9094 }
9095
9096 SDLoc DL(TexFailCtrlConst);
9097 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9098 Value &= ~(uint64_t)0x1;
9099 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9100 Value &= ~(uint64_t)0x2;
9101
9102 return Value == 0;
9103}
9104
9105static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9106 MVT PackVectorVT,
9107 SmallVectorImpl<SDValue> &PackedAddrs,
9108 unsigned DimIdx, unsigned EndIdx,
9109 unsigned NumGradients) {
9110 SDLoc DL(Op);
9111 for (unsigned I = DimIdx; I < EndIdx; I++) {
9112 SDValue Addr = Op.getOperand(I);
9113
9114 // Gradients are packed with undef for each coordinate.
9115 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9116 // 1D: undef,dx/dh; undef,dx/dv
9117 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9118 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9119 if (((I + 1) >= EndIdx) ||
9120 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9121 I == DimIdx + NumGradients - 1))) {
9122 if (Addr.getValueType() != MVT::i16)
9123 Addr = DAG.getBitcast(MVT::i16, Addr);
9124 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9125 } else {
9126 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9127 I++;
9128 }
9129 Addr = DAG.getBitcast(MVT::f32, Addr);
9130 PackedAddrs.push_back(Addr);
9131 }
9132}
9133
9134SDValue SITargetLowering::lowerImage(SDValue Op,
9135 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9136 SelectionDAG &DAG, bool WithChain) const {
9137 SDLoc DL(Op);
9138 MachineFunction &MF = DAG.getMachineFunction();
9139 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9140 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9141 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9142 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9143 unsigned IntrOpcode = Intr->BaseOpcode;
9144 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9145 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9146 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9147
9148 SmallVector<EVT, 3> ResultTypes(Op->values());
9149 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9150 bool IsD16 = false;
9151 bool IsG16 = false;
9152 bool IsA16 = false;
9153 SDValue VData;
9154 int NumVDataDwords = 0;
9155 bool AdjustRetType = false;
9156 bool IsAtomicPacked16Bit = false;
9157
9158 // Offset of intrinsic arguments
9159 const unsigned ArgOffset = WithChain ? 2 : 1;
9160
9161 unsigned DMask;
9162 unsigned DMaskLanes = 0;
9163
9164 if (BaseOpcode->Atomic) {
9165 VData = Op.getOperand(2);
9166
9167 IsAtomicPacked16Bit =
9168 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9169 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9170
9171 bool Is64Bit = VData.getValueSizeInBits() == 64;
9172 if (BaseOpcode->AtomicX2) {
9173 SDValue VData2 = Op.getOperand(3);
9174 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9175 {VData, VData2});
9176 if (Is64Bit)
9177 VData = DAG.getBitcast(MVT::v4i32, VData);
9178
9179 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9180 DMask = Is64Bit ? 0xf : 0x3;
9181 NumVDataDwords = Is64Bit ? 4 : 2;
9182 } else {
9183 DMask = Is64Bit ? 0x3 : 0x1;
9184 NumVDataDwords = Is64Bit ? 2 : 1;
9185 }
9186 } else {
9187 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9188 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9189
9190 if (BaseOpcode->Store) {
9191 VData = Op.getOperand(2);
9192
9193 MVT StoreVT = VData.getSimpleValueType();
9194 if (StoreVT.getScalarType() == MVT::f16) {
9195 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9196 return Op; // D16 is unsupported for this instruction
9197
9198 IsD16 = true;
9199 VData = handleD16VData(VData, DAG, true);
9200 }
9201
9202 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9203 } else if (!BaseOpcode->NoReturn) {
9204 // Work out the num dwords based on the dmask popcount and underlying type
9205 // and whether packing is supported.
9206 MVT LoadVT = ResultTypes[0].getSimpleVT();
9207 if (LoadVT.getScalarType() == MVT::f16) {
9208 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9209 return Op; // D16 is unsupported for this instruction
9210
9211 IsD16 = true;
9212 }
9213
9214 // Confirm that the return type is large enough for the dmask specified
9215 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9216 (!LoadVT.isVector() && DMaskLanes > 1))
9217 return Op;
9218
9219 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
9220 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9221 // instructions.
9222 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9223 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9224 NumVDataDwords = (DMaskLanes + 1) / 2;
9225 else
9226 NumVDataDwords = DMaskLanes;
9227
9228 AdjustRetType = true;
9229 }
9230 }
9231
9232 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9233 SmallVector<SDValue, 4> VAddrs;
9234
9235 // Check for 16 bit addresses or derivatives and pack if true.
9236 MVT VAddrVT =
9237 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9238 MVT VAddrScalarVT = VAddrVT.getScalarType();
9239 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9240 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9241
9242 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9243 VAddrScalarVT = VAddrVT.getScalarType();
9244 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9245 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9246
9247 // Push back extra arguments.
9248 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9249 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9250 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9251 // Special handling of bias when A16 is on. Bias is of type half but
9252 // occupies a full 32-bit slot.
9253 SDValue Bias = DAG.getBuildVector(
9254 MVT::v2f16, DL,
9255 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9256 VAddrs.push_back(Bias);
9257 } else {
9258 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9259 "Bias needs to be converted to 16 bit in A16 mode");
9260 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9261 }
9262 }
9263
9264 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9265 // 16 bit gradients are supported, but are tied to the A16 control
9266 // so both gradients and addresses must be 16 bit
9267 LLVM_DEBUG(
9268 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9269 "require 16 bit args for both gradients and addresses");
9270 return Op;
9271 }
9272
9273 if (IsA16) {
9274 if (!ST->hasA16()) {
9275 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9276 "support 16 bit addresses\n");
9277 return Op;
9278 }
9279 }
9280
9281 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9282 // set then we have to compress/pack operands (either address, gradient,
9283 // or both).
9284 // In the case where a16 and gradients are tied (no G16 support), we have
9285 // already verified that both IsA16 and IsG16 are true.
9286 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9287 // Activate g16
9288 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9289 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9290 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9291 }
9292
9293 // Add gradients (packed or unpacked)
9294 if (IsG16) {
9295 // Pack the gradients
9296 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9297 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9298 ArgOffset + Intr->GradientStart,
9299 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9300 } else {
9301 for (unsigned I = ArgOffset + Intr->GradientStart;
9302 I < ArgOffset + Intr->CoordStart; I++)
9303 VAddrs.push_back(Op.getOperand(I));
9304 }
9305
9306 // Add addresses (packed or unpacked)
9307 if (IsA16) {
9308 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9309 ArgOffset + Intr->CoordStart, VAddrEnd,
9310 0 /* No gradients */);
9311 } else {
9312 // Add uncompressed address
9313 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9314 VAddrs.push_back(Op.getOperand(I));
9315 }
9316
9317 // If the register allocator cannot place the address registers contiguously
9318 // without introducing moves, then using the non-sequential address encoding
9319 // is always preferable, since it saves VALU instructions and is usually a
9320 // wash in terms of code size or even better.
9321 //
9322 // However, we currently have no way of hinting to the register allocator that
9323 // MIMG addresses should be placed contiguously when it is possible to do so,
9324 // so force non-NSA for the common 2-address case as a heuristic.
9325 //
9326 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9327 // allocation when possible.
9328 //
9329 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9330 // set of the remaining addresses.
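 // For example, if NSAMaxSize were 5 and there are 7 address dwords, the
 // partial-NSA path below keeps the first NSAMaxSize - 1 = 4 addresses as
 // separate operands and packs the remaining 3 into one contiguous vector
 // (the take_front / drop_front pair further down).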
9331 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9332 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9333 const bool UseNSA = ST->hasNSAEncoding() &&
9334 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9335 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9336 const bool UsePartialNSA =
9337 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9338
9339 SDValue VAddr;
9340 if (UsePartialNSA) {
9341 VAddr = getBuildDwordsVector(DAG, DL,
9342 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9343 } else if (!UseNSA) {
9344 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9345 }
9346
9347 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9348 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9349 SDValue Unorm;
9350 if (!BaseOpcode->Sampler) {
9351 Unorm = True;
9352 } else {
9353 uint64_t UnormConst =
9354 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9355
9356 Unorm = UnormConst ? True : False;
9357 }
9358
9359 SDValue TFE;
9360 SDValue LWE;
9361 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9362 bool IsTexFail = false;
9363 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9364 return Op;
9365
9366 if (IsTexFail) {
9367 if (!DMaskLanes) {
9368 // Expecting to get an error flag since TFC is on - and dmask is 0
9369 // Force dmask to be at least 1 otherwise the instruction will fail
9370 DMask = 0x1;
9371 DMaskLanes = 1;
9372 NumVDataDwords = 1;
9373 }
9374 NumVDataDwords += 1;
9375 AdjustRetType = true;
9376 }
9377
9378 // Has something earlier tagged that the return type needs adjusting?
9379 // This happens if the instruction is a load or has set TexFailCtrl flags.
9380 if (AdjustRetType) {
9381 // NumVDataDwords reflects the true number of dwords required in the return
9382 // type
9383 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9384 // This is a no-op load. This can be eliminated
9385 SDValue Undef = DAG.getPOISON(Op.getValueType());
9386 if (isa<MemSDNode>(Op))
9387 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9388 return Undef;
9389 }
9390
9391 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9392 MVT::i32, NumVDataDwords)
9393 : MVT::i32;
9394
9395 ResultTypes[0] = NewVT;
9396 if (ResultTypes.size() == 3) {
9397 // Original result was aggregate type used for TexFailCtrl results
9398 // The actual instruction returns as a vector type which has now been
9399 // created. Remove the aggregate result.
9400 ResultTypes.erase(&ResultTypes[1]);
9401 }
9402 }
9403
9404 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9405 if (BaseOpcode->Atomic)
9406 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9407 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9408 AMDGPU::CPol::VOLATILE))
9409 return Op;
9410
9411 SmallVector<SDValue, 26> Ops;
9412 if (BaseOpcode->Store || BaseOpcode->Atomic)
9413 Ops.push_back(VData); // vdata
9414 if (UsePartialNSA) {
9415 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9416 Ops.push_back(VAddr);
9417 } else if (UseNSA)
9418 append_range(Ops, VAddrs);
9419 else
9420 Ops.push_back(VAddr);
9421 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9422 EVT RsrcVT = Rsrc.getValueType();
9423 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9424 return Op;
9425 Ops.push_back(Rsrc);
9426 if (BaseOpcode->Sampler) {
9427 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9428 if (Samp.getValueType() != MVT::v4i32)
9429 return Op;
9430 Ops.push_back(Samp);
9431 }
9432 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9433 if (IsGFX10Plus)
9434 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9435 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9436 Ops.push_back(Unorm);
9437 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9438 Ops.push_back(IsA16 && // r128, a16 for gfx9
9439 ST->hasFeature(AMDGPU::FeatureR128A16)
9440 ? True
9441 : False);
9442 if (IsGFX10Plus)
9443 Ops.push_back(IsA16 ? True : False);
9444
9445 if (!Subtarget->hasGFX90AInsts())
9446 Ops.push_back(TFE); // tfe
9447 else if (TFE->getAsZExtVal()) {
9448 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9449 MF.getFunction(),
9450 "TFE is not supported on this GPU", DL.getDebugLoc()));
9451 }
9452
9453 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9454 Ops.push_back(LWE); // lwe
9455 if (!IsGFX10Plus)
9456 Ops.push_back(DimInfo->DA ? True : False);
9457 if (BaseOpcode->HasD16)
9458 Ops.push_back(IsD16 ? True : False);
9459 if (isa<MemSDNode>(Op))
9460 Ops.push_back(Op.getOperand(0)); // chain
9461
9462 int NumVAddrDwords =
9463 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9464 int Opcode = -1;
9465
9466 if (IsGFX12Plus) {
9467 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9468 NumVDataDwords, NumVAddrDwords);
9469 } else if (IsGFX11Plus) {
9470 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9471 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9472 : AMDGPU::MIMGEncGfx11Default,
9473 NumVDataDwords, NumVAddrDwords);
9474 } else if (IsGFX10Plus) {
9475 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9476 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9477 : AMDGPU::MIMGEncGfx10Default,
9478 NumVDataDwords, NumVAddrDwords);
9479 } else {
9480 if (Subtarget->hasGFX90AInsts()) {
9481 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9482 NumVDataDwords, NumVAddrDwords);
9483 if (Opcode == -1) {
9484 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9485 MF.getFunction(),
9486 "requested image instruction is not supported on this GPU",
9487 DL.getDebugLoc()));
9488
9489 unsigned Idx = 0;
9490 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9491 for (EVT VT : OrigResultTypes) {
9492 if (VT == MVT::Other)
9493 RetValues[Idx++] = Op.getOperand(0); // Chain
9494 else
9495 RetValues[Idx++] = DAG.getPOISON(VT);
9496 }
9497
9498 return DAG.getMergeValues(RetValues, DL);
9499 }
9500 }
9501 if (Opcode == -1 &&
9502 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9503 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9504 NumVDataDwords, NumVAddrDwords);
9505 if (Opcode == -1)
9506 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9507 NumVDataDwords, NumVAddrDwords);
9508 }
9509 if (Opcode == -1)
9510 return Op;
9511
9512 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9513 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9514 MachineMemOperand *MemRef = MemOp->getMemOperand();
9515 DAG.setNodeMemRefs(NewNode, {MemRef});
9516 }
9517
9518 if (BaseOpcode->AtomicX2) {
9519 SmallVector<SDValue, 1> Elt;
9520 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9521 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9522 }
9523 if (BaseOpcode->NoReturn)
9524 return SDValue(NewNode, 0);
9525 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9526 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9527 NumVDataDwords, IsAtomicPacked16Bit, DL);
9528}
9529
9530SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9531 SDValue Offset, SDValue CachePolicy,
9532 SelectionDAG &DAG) const {
9533 MachineFunction &MF = DAG.getMachineFunction();
9534
9535 const DataLayout &DataLayout = DAG.getDataLayout();
9536 Align Alignment =
9537 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9538
9539 MachineMemOperand *MMO = MF.getMachineMemOperand(
9540 MachinePointerInfo(),
9541 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9542 MachineMemOperand::MOInvariant,
9543 VT.getStoreSize(), Alignment);
9544
9545 if (!Offset->isDivergent()) {
9546 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9547
9548 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9549 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9550 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9551 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9552 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9553 SDValue BufferLoad =
9555 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL, DAG.getVTList(MVT::i32), Ops, VT, MMO);
9556 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9557 }
9558
9559 // Widen vec3 load to vec4.
9560 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9561 !Subtarget->hasScalarDwordx3Loads()) {
9562 EVT WidenedVT =
9563 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9564 auto WidenedOp = DAG.getMemIntrinsicNode(
9565 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9566 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9567 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9568 DAG.getVectorIdxConstant(0, DL));
9569 return Subvector;
9570 }
9571
9572 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9573 DAG.getVTList(VT), Ops, VT, MMO);
9574 }
9575
9576 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9577 // assume that the buffer is unswizzled.
9578 SDValue Ops[] = {
9579 DAG.getEntryNode(), // Chain
9580 Rsrc, // rsrc
9581 DAG.getConstant(0, DL, MVT::i32), // vindex
9582 {}, // voffset
9583 {}, // soffset
9584 {}, // offset
9585 CachePolicy, // cachepolicy
9586 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9587 };
9588 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9589 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9590 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9591 }
9592
9593 SmallVector<SDValue, 4> Loads;
9594 unsigned NumLoads = 1;
9595 MVT LoadVT = VT.getSimpleVT();
9596 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9597 assert((LoadVT.getScalarType() == MVT::i32 ||
9598 LoadVT.getScalarType() == MVT::f32));
9599
9600 if (NumElts == 8 || NumElts == 16) {
9601 NumLoads = NumElts / 4;
9602 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9603 }
9604
9605 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9606
9607 // Use the alignment to ensure that the required offsets will fit into the
9608 // immediate offsets.
9609 setBufferOffsets(Offset, DAG, &Ops[3],
9610 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
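 // For example, a divergent-offset v8f32 load is split into NumLoads == 2
 // v4f32 buffer loads; the loop below issues them at immediate offsets
 // InstOffset + 0 and InstOffset + 16, which the Align(16 * NumLoads) above
 // keeps representable, and the results are concatenated afterwards.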
9611
9612 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9613 for (unsigned i = 0; i < NumLoads; ++i) {
9614 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9615 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9616 LoadVT, MMO, DAG));
9617 }
9618
9619 if (NumElts == 8 || NumElts == 16)
9620 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9621
9622 return Loads[0];
9623}
9624
9625SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9626 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
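 // That is, roughly (TTMP8 >> 25) & 0x1f, which the BFE_U32 below extracts
 // with offset 25 and width 5.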
9627 if (!Subtarget->hasArchitectedSGPRs())
9628 return {};
9629 SDLoc SL(Op);
9630 MVT VT = MVT::i32;
9631 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9632 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9633 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9634}
9635
9636SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9637 AMDGPU::Hwreg::Id HwReg,
9638 unsigned LowBit,
9639 unsigned Width) const {
9640 SDLoc SL(Op);
9641 using namespace AMDGPU::Hwreg;
9642 return {DAG.getMachineNode(
9643 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9644 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9645 SL, MVT::i32)),
9646 0};
9647}
9648
9649SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9650 unsigned Dim,
9651 const ArgDescriptor &Arg) const {
9652 SDLoc SL(Op);
9653 MachineFunction &MF = DAG.getMachineFunction();
9654 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9655 if (MaxID == 0)
9656 return DAG.getConstant(0, SL, MVT::i32);
9657
9658 // It's undefined behavior if a function marked with the amdgpu-no-*
9659 // attributes uses the corresponding intrinsic.
9660 if (!Arg)
9661 return DAG.getPOISON(Op->getValueType(0));
9662
9663 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9664 SDLoc(DAG.getEntryNode()), Arg);
9665
9666 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9667 // masking operations anyway.
9668 //
9669 // TODO: We could assert the top bit is 0 for the source copy.
9670 if (Arg.isMasked())
9671 return Val;
9672
9673 // Preserve the known bits after expansion to a copy.
9674 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9675 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9676 DAG.getValueType(SmallVT));
9677}
9678
9679SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9680 SelectionDAG &DAG) const {
9681 MachineFunction &MF = DAG.getMachineFunction();
9682 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9683
9684 EVT VT = Op.getValueType();
9685 SDLoc DL(Op);
9686 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9687
9688 // TODO: Should this propagate fast-math-flags?
9689
9690 switch (IntrinsicID) {
9691 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9692 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9693 return emitNonHSAIntrinsicError(DAG, DL, VT);
9694 return getPreloadedValue(DAG, *MFI, VT,
9695 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9696 }
9697 case Intrinsic::amdgcn_dispatch_ptr:
9698 case Intrinsic::amdgcn_queue_ptr: {
9699 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9700 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9701 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9702 DL.getDebugLoc()));
9703 return DAG.getPOISON(VT);
9704 }
9705
9706 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9707 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9708 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9709 return getPreloadedValue(DAG, *MFI, VT, RegID);
9710 }
9711 case Intrinsic::amdgcn_implicitarg_ptr: {
9712 if (MFI->isEntryFunction())
9713 return getImplicitArgPtr(DAG, DL);
9714 return getPreloadedValue(DAG, *MFI, VT,
9715 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9716 }
9717 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9718 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9719 // This only makes sense to call in a kernel, so just lower to null.
9720 return DAG.getConstant(0, DL, VT);
9721 }
9722
9723 return getPreloadedValue(DAG, *MFI, VT,
9724 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9725 }
9726 case Intrinsic::amdgcn_dispatch_id: {
9727 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9728 }
9729 case Intrinsic::amdgcn_rcp:
9730 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9731 case Intrinsic::amdgcn_rsq:
9732 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9733 case Intrinsic::amdgcn_rsq_legacy:
9734 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9735 return emitRemovedIntrinsicError(DAG, DL, VT);
9736 return SDValue();
9737 case Intrinsic::amdgcn_rcp_legacy:
9738 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9739 return emitRemovedIntrinsicError(DAG, DL, VT);
9740 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9741 case Intrinsic::amdgcn_rsq_clamp: {
9742 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9743 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9744
9745 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9746 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9747 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9748
9749 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9750 SDValue Tmp =
9751 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9752 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9753 DAG.getConstantFP(Min, DL, VT));
9754 }
9755 case Intrinsic::r600_read_ngroups_x:
9756 if (Subtarget->isAmdHsaOS())
9757 return emitNonHSAIntrinsicError(DAG, DL, VT);
9758
9759 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9760 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9761 false);
9762 case Intrinsic::r600_read_ngroups_y:
9763 if (Subtarget->isAmdHsaOS())
9764 return emitNonHSAIntrinsicError(DAG, DL, VT);
9765
9766 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9767 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9768 false);
9769 case Intrinsic::r600_read_ngroups_z:
9770 if (Subtarget->isAmdHsaOS())
9771 return emitNonHSAIntrinsicError(DAG, DL, VT);
9772
9773 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9774 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9775 false);
9776 case Intrinsic::r600_read_local_size_x:
9777 if (Subtarget->isAmdHsaOS())
9778 return emitNonHSAIntrinsicError(DAG, DL, VT);
9779
9780 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9781 SI::KernelInputOffsets::LOCAL_SIZE_X);
9782 case Intrinsic::r600_read_local_size_y:
9783 if (Subtarget->isAmdHsaOS())
9784 return emitNonHSAIntrinsicError(DAG, DL, VT);
9785
9786 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9787 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9788 case Intrinsic::r600_read_local_size_z:
9789 if (Subtarget->isAmdHsaOS())
9790 return emitNonHSAIntrinsicError(DAG, DL, VT);
9791
9792 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9793 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9794 case Intrinsic::amdgcn_workgroup_id_x:
9795 return lowerWorkGroupId(DAG, *MFI, VT,
9799 case Intrinsic::amdgcn_workgroup_id_y:
9800 return lowerWorkGroupId(DAG, *MFI, VT,
9804 case Intrinsic::amdgcn_workgroup_id_z:
9805 return lowerWorkGroupId(DAG, *MFI, VT,
9809 case Intrinsic::amdgcn_cluster_id_x:
9810 return Subtarget->hasClusters()
9811 ? getPreloadedValue(DAG, *MFI, VT,
9813 : DAG.getPOISON(VT);
9814 case Intrinsic::amdgcn_cluster_id_y:
9815 return Subtarget->hasClusters()
9816 ? getPreloadedValue(DAG, *MFI, VT,
9818 : DAG.getPOISON(VT);
9819 case Intrinsic::amdgcn_cluster_id_z:
9820 return Subtarget->hasClusters()
9821 ? getPreloadedValue(DAG, *MFI, VT,
9823 : DAG.getPOISON(VT);
9824 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(
9827 DAG, *MFI, VT,
9829 : DAG.getPOISON(VT);
9830 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9831 return Subtarget->hasClusters()
9832 ? getPreloadedValue(
9833 DAG, *MFI, VT,
9835 : DAG.getPOISON(VT);
9836 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9837 return Subtarget->hasClusters()
9838 ? getPreloadedValue(
9839 DAG, *MFI, VT,
9840 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
9841 : DAG.getPOISON(VT);
9842 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9843 return Subtarget->hasClusters()
9844 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9845 : SDValue();
9846 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(
9849 DAG, *MFI, VT,
9850 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
9851 : DAG.getPOISON(VT);
9852 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9853 return Subtarget->hasClusters()
9854 ? getPreloadedValue(
9855 DAG, *MFI, VT,
9856 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
9857 : DAG.getPOISON(VT);
9858 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9859 return Subtarget->hasClusters()
9860 ? getPreloadedValue(
9861 DAG, *MFI, VT,
9862 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
9863 : DAG.getPOISON(VT);
9864 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9865 return Subtarget->hasClusters()
9866 ? getPreloadedValue(
9867 DAG, *MFI, VT,
9868 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
9869 : DAG.getPOISON(VT);
9870 case Intrinsic::amdgcn_wave_id:
9871 return lowerWaveID(DAG, Op);
9872 case Intrinsic::amdgcn_lds_kernel_id: {
9873 if (MFI->isEntryFunction())
9874 return getLDSKernelId(DAG, DL);
9875 return getPreloadedValue(DAG, *MFI, VT,
9876 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9877 }
9878 case Intrinsic::amdgcn_workitem_id_x:
9879 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9880 case Intrinsic::amdgcn_workitem_id_y:
9881 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9882 case Intrinsic::amdgcn_workitem_id_z:
9883 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9884 case Intrinsic::amdgcn_wavefrontsize:
9885 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9886 SDLoc(Op), MVT::i32);
9887 case Intrinsic::amdgcn_s_buffer_load: {
9888 unsigned CPol = Op.getConstantOperandVal(3);
9889 // s_buffer_load, because of how it's optimized, can't be volatile
9890 // so reject ones with the volatile bit set.
9891 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9892 ? AMDGPU::CPol::ALL
9893 : AMDGPU::CPol::ALL_pregfx12))
9894 return Op;
9895 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9896 Op.getOperand(3), DAG);
9897 }
9898 case Intrinsic::amdgcn_fdiv_fast:
9899 return lowerFDIV_FAST(Op, DAG);
9900 case Intrinsic::amdgcn_sin:
9901 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9902
9903 case Intrinsic::amdgcn_cos:
9904 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9905
9906 case Intrinsic::amdgcn_mul_u24:
9907 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9908 Op.getOperand(2));
9909 case Intrinsic::amdgcn_mul_i24:
9910 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9911 Op.getOperand(2));
9912
9913 case Intrinsic::amdgcn_log_clamp: {
9914 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9915 return SDValue();
9916
9917 return emitRemovedIntrinsicError(DAG, DL, VT);
9918 }
9919 case Intrinsic::amdgcn_fract:
9920 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9921
9922 case Intrinsic::amdgcn_class:
9923 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9924 Op.getOperand(2));
9925 case Intrinsic::amdgcn_div_fmas:
9926 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9927 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9928
9929 case Intrinsic::amdgcn_div_fixup:
9930 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9931 Op.getOperand(2), Op.getOperand(3));
9932
9933 case Intrinsic::amdgcn_div_scale: {
9934 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9935
9936 // Translate to the operands expected by the machine instruction. The
9937 // first parameter must be the same as the first instruction.
9938 SDValue Numerator = Op.getOperand(1);
9939 SDValue Denominator = Op.getOperand(2);
9940
9941 // Note this order is opposite of the machine instruction's operations,
9942 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9943 // intrinsic has the numerator as the first operand to match a normal
9944 // division operation.
9945
9946 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9947
9948 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9949 Denominator, Numerator);
9950 }
9951 case Intrinsic::amdgcn_icmp: {
9952 // There is a Pat that handles this variant, so return it as-is.
9953 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9954 Op.getConstantOperandVal(2) == 0 &&
9955 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9956 return Op;
9957 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9958 }
9959 case Intrinsic::amdgcn_fcmp: {
9960 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9961 }
9962 case Intrinsic::amdgcn_ballot:
9963 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9964 case Intrinsic::amdgcn_fmed3:
9965 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9966 Op.getOperand(2), Op.getOperand(3));
9967 case Intrinsic::amdgcn_fdot2:
9968 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9969 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9970 case Intrinsic::amdgcn_fmul_legacy:
9971 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9972 Op.getOperand(2));
9973 case Intrinsic::amdgcn_sffbh:
9974 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9975 case Intrinsic::amdgcn_sbfe:
9976 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9977 Op.getOperand(2), Op.getOperand(3));
9978 case Intrinsic::amdgcn_ubfe:
9979 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9980 Op.getOperand(2), Op.getOperand(3));
9981 case Intrinsic::amdgcn_cvt_pkrtz:
9982 case Intrinsic::amdgcn_cvt_pknorm_i16:
9983 case Intrinsic::amdgcn_cvt_pknorm_u16:
9984 case Intrinsic::amdgcn_cvt_pk_i16:
9985 case Intrinsic::amdgcn_cvt_pk_u16: {
9986 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9987 EVT VT = Op.getValueType();
9988 unsigned Opcode;
9989
9990 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9991 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9992 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9993 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9994 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9995 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9996 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9997 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9998 else
9999 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10000
10001 if (isTypeLegal(VT))
10002 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10003
10004 SDValue Node =
10005 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10006 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10007 }
10008 case Intrinsic::amdgcn_fmad_ftz:
10009 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10010 Op.getOperand(2), Op.getOperand(3));
10011
10012 case Intrinsic::amdgcn_if_break:
10013 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10014 Op->getOperand(1), Op->getOperand(2)),
10015 0);
10016
10017 case Intrinsic::amdgcn_groupstaticsize: {
10018 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10019 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10020 return Op;
10021
10022 const Module *M = MF.getFunction().getParent();
10023 const GlobalValue *GV =
10024 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10025 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10026 SIInstrInfo::MO_ABS32_LO);
10027 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10028 }
10029 case Intrinsic::amdgcn_is_shared:
10030 case Intrinsic::amdgcn_is_private: {
10031 SDLoc SL(Op);
10032 SDValue SrcVec =
10033 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10034 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10035 DAG.getConstant(1, SL, MVT::i32));
10036
10037 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10038 ? AMDGPUAS::LOCAL_ADDRESS
10039 : AMDGPUAS::PRIVATE_ADDRESS;
10040 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10041 Subtarget->hasGloballyAddressableScratch()) {
10042 SDValue FlatScratchBaseHi(
10043 DAG.getMachineNode(
10044 AMDGPU::S_MOV_B32, DL, MVT::i32,
10045 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10046 0);
10047 // Test bits 63..58 against the aperture address.
10048 return DAG.getSetCC(
10049 SL, MVT::i1,
10050 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10051 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10052 }
10053
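// Otherwise, identify the address space by comparing the high 32 bits of the
// pointer against the segment aperture.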
10054 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10055 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10056 }
10057 case Intrinsic::amdgcn_perm:
10058 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10059 Op.getOperand(2), Op.getOperand(3));
10060 case Intrinsic::amdgcn_reloc_constant: {
10061 Module *M = MF.getFunction().getParent();
10062 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10063 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10064 auto *RelocSymbol = cast<GlobalVariable>(
10065 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10066 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10067 SIInstrInfo::MO_ABS32_LO);
10068 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10069 }
10070 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10071 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10072 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10073 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10074 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10075 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10076 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10077 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10078 if (Op.getOperand(4).getValueType() == MVT::i32)
10079 return SDValue();
10080
10081 SDLoc SL(Op);
10082 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10083 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10084 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10085 Op.getOperand(3), IndexKeyi32);
10086 }
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10088 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10089 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10090 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10091 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10092 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10093 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10094 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10095 if (Op.getOperand(4).getValueType() == MVT::i64)
10096 return SDValue();
10097
10098 SDLoc SL(Op);
10099 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10100 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10101 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10102 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10103 Op.getOperand(6)});
10104 }
10105 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10106 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10107 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10108 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10109 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10110 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10111 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10112 ? MVT::i64
10113 : MVT::i32;
10114 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10115 return SDValue();
10116
10117 SDLoc SL(Op);
10118 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10119 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10120 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10121 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10122 IndexKey, Op.getOperand(7),
10123 Op.getOperand(8)}); // No clamp operand
10124 }
10125 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10126 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10127 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10128 if (Op.getOperand(6).getValueType() == MVT::i32)
10129 return SDValue();
10130
10131 SDLoc SL(Op);
10132 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10133 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10134 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10135 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10136 IndexKeyi32, Op.getOperand(7)});
10137 }
10138 case Intrinsic::amdgcn_addrspacecast_nonnull:
10139 return lowerADDRSPACECAST(Op, DAG);
10140 case Intrinsic::amdgcn_readlane:
10141 case Intrinsic::amdgcn_readfirstlane:
10142 case Intrinsic::amdgcn_writelane:
10143 case Intrinsic::amdgcn_permlane16:
10144 case Intrinsic::amdgcn_permlanex16:
10145 case Intrinsic::amdgcn_permlane64:
10146 case Intrinsic::amdgcn_set_inactive:
10147 case Intrinsic::amdgcn_set_inactive_chain_arg:
10148 case Intrinsic::amdgcn_mov_dpp8:
10149 case Intrinsic::amdgcn_update_dpp:
10150 return lowerLaneOp(*this, Op.getNode(), DAG);
10151 case Intrinsic::amdgcn_dead: {
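// llvm.amdgcn.dead produces no meaningful value; return poison for each of
// the node's result types.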
10152 SmallVector<SDValue, 8> Poisons;
10153 for (const EVT ValTy : Op.getNode()->values())
10154 Poisons.push_back(DAG.getPOISON(ValTy));
10155 return DAG.getMergeValues(Poisons, SDLoc(Op));
10156 }
10157 default:
10158 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10159 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10160 return lowerImage(Op, ImageDimIntr, DAG, false);
10161
10162 return Op;
10163 }
10164}
10165
10166// On targets not supporting constant in soffset field, turn zero to
10167// SGPR_NULL to avoid generating an extra s_mov with zero.
10168 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10169 const GCNSubtarget *Subtarget) {
10170 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10171 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10172 return SOffset;
10173}
10174
10175SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10176 SelectionDAG &DAG,
10177 unsigned NewOpcode) const {
10178 SDLoc DL(Op);
10179
10180 SDValue VData = Op.getOperand(2);
10181 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10182 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10183 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10184 SDValue Ops[] = {
10185 Op.getOperand(0), // Chain
10186 VData, // vdata
10187 Rsrc, // rsrc
10188 DAG.getConstant(0, DL, MVT::i32), // vindex
10189 VOffset, // voffset
10190 SOffset, // soffset
10191 Offset, // offset
10192 Op.getOperand(6), // cachepolicy
10193 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10194 };
10195
10196 auto *M = cast<MemSDNode>(Op);
10197
10198 EVT MemVT = VData.getValueType();
10199 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10200 M->getMemOperand());
10201}
10202
10203SDValue
10204SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10205 unsigned NewOpcode) const {
10206 SDLoc DL(Op);
10207
10208 SDValue VData = Op.getOperand(2);
10209 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10210 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10211 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10212 SDValue Ops[] = {
10213 Op.getOperand(0), // Chain
10214 VData, // vdata
10215 Rsrc, // rsrc
10216 Op.getOperand(4), // vindex
10217 VOffset, // voffset
10218 SOffset, // soffset
10219 Offset, // offset
10220 Op.getOperand(7), // cachepolicy
10221 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10222 };
10223
10224 auto *M = cast<MemSDNode>(Op);
10225
10226 EVT MemVT = VData.getValueType();
10227 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10228 M->getMemOperand());
10229}
10230
10231SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10232 SelectionDAG &DAG) const {
10233 unsigned IntrID = Op.getConstantOperandVal(1);
10234 SDLoc DL(Op);
10235
10236 switch (IntrID) {
10237 case Intrinsic::amdgcn_ds_ordered_add:
10238 case Intrinsic::amdgcn_ds_ordered_swap: {
10239 MemSDNode *M = cast<MemSDNode>(Op);
10240 SDValue Chain = M->getOperand(0);
10241 SDValue M0 = M->getOperand(2);
10242 SDValue Value = M->getOperand(3);
10243 unsigned IndexOperand = M->getConstantOperandVal(7);
10244 unsigned WaveRelease = M->getConstantOperandVal(8);
10245 unsigned WaveDone = M->getConstantOperandVal(9);
10246
10247 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10248 IndexOperand &= ~0x3f;
10249 unsigned CountDw = 0;
10250
10251 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10252 CountDw = (IndexOperand >> 24) & 0xf;
10253 IndexOperand &= ~(0xf << 24);
10254
10255 if (CountDw < 1 || CountDw > 4) {
10256 const Function &Fn = DAG.getMachineFunction().getFunction();
10257 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10258 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10259 DL.getDebugLoc()));
10260 CountDw = 1;
10261 }
10262 }
10263
10264 if (IndexOperand) {
10265 const Function &Fn = DAG.getMachineFunction().getFunction();
10266 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10267 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10268 }
10269
10270 if (WaveDone && !WaveRelease) {
10271 // TODO: Move this to IR verifier
10272 const Function &Fn = DAG.getMachineFunction().getFunction();
10273 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10274 Fn, "ds_ordered_count: wave_done requires wave_release",
10275 DL.getDebugLoc()));
10276 }
10277
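// Pack the DS_ORDERED_COUNT offset field: offset0 (low byte) holds the
// ordered count index scaled to a byte offset, and offset1 (high byte) packs
// wave_release, wave_done, the shader type (pre-GFX11), the instruction kind,
// and the dword count minus one (GFX10+).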
10278 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10279 unsigned ShaderType =
10280 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10281 unsigned Offset0 = OrderedCountIndex << 2;
10282 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10283
10284 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10285 Offset1 |= (CountDw - 1) << 6;
10286
10287 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10288 Offset1 |= ShaderType << 2;
10289
10290 unsigned Offset = Offset0 | (Offset1 << 8);
10291
10292 SDValue Ops[] = {
10293 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10294 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10295 };
10296 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10297 M->getVTList(), Ops, M->getMemoryVT(),
10298 M->getMemOperand());
10299 }
10300 case Intrinsic::amdgcn_raw_buffer_load:
10301 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10302 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10303 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10304 case Intrinsic::amdgcn_raw_buffer_load_format:
10305 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10306 const bool IsFormat =
10307 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10308 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10309
10310 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10311 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10312 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10313 SDValue Ops[] = {
10314 Op.getOperand(0), // Chain
10315 Rsrc, // rsrc
10316 DAG.getConstant(0, DL, MVT::i32), // vindex
10317 VOffset, // voffset
10318 SOffset, // soffset
10319 Offset, // offset
10320 Op.getOperand(5), // cachepolicy, swizzled buffer
10321 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10322 };
10323
10324 auto *M = cast<MemSDNode>(Op);
10325 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10326 }
10327 case Intrinsic::amdgcn_struct_buffer_load:
10328 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10329 case Intrinsic::amdgcn_struct_buffer_load_format:
10330 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10331 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10332 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10333 const bool IsFormat =
10334 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10335 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10336
10337 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10338 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10339 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10340 SDValue Ops[] = {
10341 Op.getOperand(0), // Chain
10342 Rsrc, // rsrc
10343 Op.getOperand(3), // vindex
10344 VOffset, // voffset
10345 SOffset, // soffset
10346 Offset, // offset
10347 Op.getOperand(6), // cachepolicy, swizzled buffer
10348 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10349 };
10350
10351 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10352 }
10353 case Intrinsic::amdgcn_raw_tbuffer_load:
10354 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10355 MemSDNode *M = cast<MemSDNode>(Op);
10356 EVT LoadVT = Op.getValueType();
10357 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10358 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10359 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10360
10361 SDValue Ops[] = {
10362 Op.getOperand(0), // Chain
10363 Rsrc, // rsrc
10364 DAG.getConstant(0, DL, MVT::i32), // vindex
10365 VOffset, // voffset
10366 SOffset, // soffset
10367 Offset, // offset
10368 Op.getOperand(5), // format
10369 Op.getOperand(6), // cachepolicy, swizzled buffer
10370 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10371 };
10372
10373 if (LoadVT.getScalarType() == MVT::f16)
10374 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10375 Ops);
10376 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10377 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10378 DAG);
10379 }
10380 case Intrinsic::amdgcn_struct_tbuffer_load:
10381 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10382 MemSDNode *M = cast<MemSDNode>(Op);
10383 EVT LoadVT = Op.getValueType();
10384 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10385 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10386 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10387
10388 SDValue Ops[] = {
10389 Op.getOperand(0), // Chain
10390 Rsrc, // rsrc
10391 Op.getOperand(3), // vindex
10392 VOffset, // voffset
10393 SOffset, // soffset
10394 Offset, // offset
10395 Op.getOperand(6), // format
10396 Op.getOperand(7), // cachepolicy, swizzled buffer
10397 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10398 };
10399
10400 if (LoadVT.getScalarType() == MVT::f16)
10401 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10402 Ops);
10403 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10404 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10405 DAG);
10406 }
10407 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10408 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10409 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10410 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10412 return lowerStructBufferAtomicIntrin(Op, DAG,
10413 AMDGPUISD::BUFFER_ATOMIC_FADD);
10414 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10415 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10416 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10417 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10418 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10419 return lowerStructBufferAtomicIntrin(Op, DAG,
10420 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10421 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10422 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10423 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10424 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10425 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10426 return lowerStructBufferAtomicIntrin(Op, DAG,
10427 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10428 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10430 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10431 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10433 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10434 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10436 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10437 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10438 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10439 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10440 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10442 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10443 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10445 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10446 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10448 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10449 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10452 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10455 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10458 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10461 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10463 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10464 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10465 return lowerRawBufferAtomicIntrin(Op, DAG,
10466 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10467 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10468 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10469 return lowerStructBufferAtomicIntrin(Op, DAG,
10470 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10471 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10473 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10474 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10475 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10476 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10477 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10479 return lowerStructBufferAtomicIntrin(Op, DAG,
10480 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10481 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10483 return lowerStructBufferAtomicIntrin(Op, DAG,
10484 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10485 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10487 return lowerStructBufferAtomicIntrin(Op, DAG,
10488 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10489 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10491 return lowerStructBufferAtomicIntrin(Op, DAG,
10492 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10493 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10495 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10496 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10497 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10498 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10499 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10501 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10502 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10504 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10505 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10506 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10507 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10508 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10509 return lowerStructBufferAtomicIntrin(Op, DAG,
10510 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10511
10512 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10513 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10514 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10515 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10516 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10517 SDValue Ops[] = {
10518 Op.getOperand(0), // Chain
10519 Op.getOperand(2), // src
10520 Op.getOperand(3), // cmp
10521 Rsrc, // rsrc
10522 DAG.getConstant(0, DL, MVT::i32), // vindex
10523 VOffset, // voffset
10524 SOffset, // soffset
10525 Offset, // offset
10526 Op.getOperand(7), // cachepolicy
10527 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10528 };
10529 EVT VT = Op.getValueType();
10530 auto *M = cast<MemSDNode>(Op);
10531
10532 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10533 Op->getVTList(), Ops, VT,
10534 M->getMemOperand());
10535 }
10536 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10537 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10538 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10539 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10540 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10541 SDValue Ops[] = {
10542 Op.getOperand(0), // Chain
10543 Op.getOperand(2), // src
10544 Op.getOperand(3), // cmp
10545 Rsrc, // rsrc
10546 Op.getOperand(5), // vindex
10547 VOffset, // voffset
10548 SOffset, // soffset
10549 Offset, // offset
10550 Op.getOperand(8), // cachepolicy
10551 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10552 };
10553 EVT VT = Op.getValueType();
10554 auto *M = cast<MemSDNode>(Op);
10555
10556 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10557 Op->getVTList(), Ops, VT,
10558 M->getMemOperand());
10559 }
10560 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10561 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10562 MemSDNode *M = cast<MemSDNode>(Op);
10563 SDValue NodePtr = M->getOperand(2);
10564 SDValue RayExtent = M->getOperand(3);
10565 SDValue InstanceMask = M->getOperand(4);
10566 SDValue RayOrigin = M->getOperand(5);
10567 SDValue RayDir = M->getOperand(6);
10568 SDValue Offsets = M->getOperand(7);
10569 SDValue TDescr = M->getOperand(8);
10570
10571 assert(NodePtr.getValueType() == MVT::i64);
10572 assert(RayDir.getValueType() == MVT::v3f32);
10573
10574 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10575 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10576 return SDValue();
10577 }
10578
10579 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10580 const unsigned NumVDataDwords = 10;
10581 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10582 int Opcode = AMDGPU::getMIMGOpcode(
10583 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10584 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10585 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10586 assert(Opcode != -1);
10587
10588 SmallVector<SDValue, 7> Ops;
10589 Ops.push_back(NodePtr);
10590 Ops.push_back(DAG.getBuildVector(
10591 MVT::v2i32, DL,
10592 {DAG.getBitcast(MVT::i32, RayExtent),
10593 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10594 Ops.push_back(RayOrigin);
10595 Ops.push_back(RayDir);
10596 Ops.push_back(Offsets);
10597 Ops.push_back(TDescr);
10598 Ops.push_back(M->getChain());
10599
10600 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10601 MachineMemOperand *MemRef = M->getMemOperand();
10602 DAG.setNodeMemRefs(NewNode, {MemRef});
10603 return SDValue(NewNode, 0);
10604 }
10605 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10606 MemSDNode *M = cast<MemSDNode>(Op);
10607 SDValue NodePtr = M->getOperand(2);
10608 SDValue RayExtent = M->getOperand(3);
10609 SDValue RayOrigin = M->getOperand(4);
10610 SDValue RayDir = M->getOperand(5);
10611 SDValue RayInvDir = M->getOperand(6);
10612 SDValue TDescr = M->getOperand(7);
10613
10614 assert(NodePtr.getValueType() == MVT::i32 ||
10615 NodePtr.getValueType() == MVT::i64);
10616 assert(RayDir.getValueType() == MVT::v3f16 ||
10617 RayDir.getValueType() == MVT::v3f32);
10618
10619 if (!Subtarget->hasGFX10_AEncoding()) {
10620 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10621 return SDValue();
10622 }
10623
10624 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10625 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10626 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10627 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10628 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10629 const unsigned NumVDataDwords = 4;
10630 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10631 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10632 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10633 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10634 IsGFX12Plus;
10635 const unsigned BaseOpcodes[2][2] = {
10636 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10637 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10638 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10639 int Opcode;
10640 if (UseNSA) {
10641 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10642 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10643 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10644 : AMDGPU::MIMGEncGfx10NSA,
10645 NumVDataDwords, NumVAddrDwords);
10646 } else {
10647 assert(!IsGFX12Plus);
10648 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10649 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10650 : AMDGPU::MIMGEncGfx10Default,
10651 NumVDataDwords, NumVAddrDwords);
10652 }
10653 assert(Opcode != -1);
10654
10655 SmallVector<SDValue, 16> Ops;
10656
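// packLanes appends the three lanes of a vector operand as dwords: f32 lanes
// are bitcast straight to i32, while f16 lanes are packed pairwise into v2f16
// values and bitcast to i32.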
10657 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10658 SmallVector<SDValue, 3> Lanes;
10659 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10660 if (Lanes[0].getValueSizeInBits() == 32) {
10661 for (unsigned I = 0; I < 3; ++I)
10662 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10663 } else {
10664 if (IsAligned) {
10665 Ops.push_back(DAG.getBitcast(
10666 MVT::i32,
10667 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10668 Ops.push_back(Lanes[2]);
10669 } else {
10670 SDValue Elt0 = Ops.pop_back_val();
10671 Ops.push_back(DAG.getBitcast(
10672 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10673 Ops.push_back(DAG.getBitcast(
10674 MVT::i32,
10675 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10676 }
10677 }
10678 };
10679
10680 if (UseNSA && IsGFX11Plus) {
10681 Ops.push_back(NodePtr);
10682 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10683 Ops.push_back(RayOrigin);
10684 if (IsA16) {
10685 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10686 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10687 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10688 for (unsigned I = 0; I < 3; ++I) {
10689 MergedLanes.push_back(DAG.getBitcast(
10690 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10691 {DirLanes[I], InvDirLanes[I]})));
10692 }
10693 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10694 } else {
10695 Ops.push_back(RayDir);
10696 Ops.push_back(RayInvDir);
10697 }
10698 } else {
10699 if (Is64)
10700 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10701 2);
10702 else
10703 Ops.push_back(NodePtr);
10704
10705 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10706 packLanes(RayOrigin, true);
10707 packLanes(RayDir, true);
10708 packLanes(RayInvDir, false);
10709 }
10710
10711 if (!UseNSA) {
10712 // Build a single vector containing all the operands so far prepared.
10713 if (NumVAddrDwords > 12) {
10714 SDValue Undef = DAG.getPOISON(MVT::i32);
10715 Ops.append(16 - Ops.size(), Undef);
10716 }
10717 assert(Ops.size() >= 8 && Ops.size() <= 12);
10718 SDValue MergedOps =
10719 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10720 Ops.clear();
10721 Ops.push_back(MergedOps);
10722 }
10723
10724 Ops.push_back(TDescr);
10725 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10726 Ops.push_back(M->getChain());
10727
10728 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10729 MachineMemOperand *MemRef = M->getMemOperand();
10730 DAG.setNodeMemRefs(NewNode, {MemRef});
10731 return SDValue(NewNode, 0);
10732 }
10733 case Intrinsic::amdgcn_global_atomic_fmin_num:
10734 case Intrinsic::amdgcn_global_atomic_fmax_num:
10735 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10736 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
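// These are plain floating-point min/max atomics; lower them to the generic
// ISD::ATOMIC_LOAD_FMIN/FMAX nodes on the same memory operand.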
10737 MemSDNode *M = cast<MemSDNode>(Op);
10738 SDValue Ops[] = {
10739 M->getOperand(0), // Chain
10740 M->getOperand(2), // Ptr
10741 M->getOperand(3) // Value
10742 };
10743 unsigned Opcode = 0;
10744 switch (IntrID) {
10745 case Intrinsic::amdgcn_global_atomic_fmin_num:
10746 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10747 Opcode = ISD::ATOMIC_LOAD_FMIN;
10748 break;
10749 }
10750 case Intrinsic::amdgcn_global_atomic_fmax_num:
10751 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10752 Opcode = ISD::ATOMIC_LOAD_FMAX;
10753 break;
10754 }
10755 default:
10756 llvm_unreachable("unhandled atomic opcode");
10757 }
10758 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10759 Ops, M->getMemOperand());
10760 }
10761 case Intrinsic::amdgcn_s_get_barrier_state:
10762 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10763 SDValue Chain = Op->getOperand(0);
10764 SmallVector<SDValue, 2> Ops;
10765 unsigned Opc;
10766
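// The named-barrier form encodes the barrier ID in bits 4..9 of its operand;
// extract it before using it as an immediate or copying it into M0.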
10767 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10768 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10769 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10770 BarID = (BarID >> 4) & 0x3F;
10771 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10772 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10773 Ops.push_back(K);
10774 Ops.push_back(Chain);
10775 } else {
10776 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10777 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10778 SDValue M0Val;
10779 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10780 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10781 M0Val = SDValue(
10782 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10783 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10784 0);
10785 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10786 } else
10787 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10788 }
10789
10790 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10791 return SDValue(NewMI, 0);
10792 }
10793 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10794 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10795 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10796 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10797 SDValue Chain = Op->getOperand(0);
10798 SDValue Ptr = Op->getOperand(2);
10799 EVT VT = Op->getValueType(0);
10800 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10801 Chain, Ptr, MII->getMemOperand());
10802 }
10803 default:
10804
10805 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10806 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10807 return lowerImage(Op, ImageDimIntr, DAG, true);
10808
10809 return SDValue();
10810 }
10811}
10812
10813// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10814// dwordx4 if on SI and handle TFE loads.
10815SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10816 SDVTList VTList,
10817 ArrayRef<SDValue> Ops, EVT MemVT,
10818 MachineMemOperand *MMO,
10819 SelectionDAG &DAG) const {
10820 LLVMContext &C = *DAG.getContext();
10821 MachineFunction &MF = DAG.getMachineFunction();
10822 EVT VT = VTList.VTs[0];
10823
10824 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10825 bool IsTFE = VTList.NumVTs == 3;
10826 if (IsTFE) {
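// A TFE load returns the requested data dwords plus one trailing status
// dword. Load everything as a single i32 vector, then split off the status
// word and bitcast the data portion back to the original type.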
10827 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10828 unsigned NumOpDWords = NumValueDWords + 1;
10829 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10830 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10831 MachineMemOperand *OpDWordsMMO =
10832 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10833 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10834 OpDWordsVT, OpDWordsMMO, DAG);
10835 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10836 DAG.getVectorIdxConstant(NumValueDWords, DL));
10837 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10838 SDValue ValueDWords =
10839 NumValueDWords == 1
10840 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10841 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10842 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10843 ZeroIdx);
10844 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10845 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10846 }
10847
10848 if (!Subtarget->hasDwordx3LoadStores() &&
10849 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10850 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10851 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10852 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10853 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10854 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10855 WidenedMemVT, WidenedMMO);
10856 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10857 DAG.getVectorIdxConstant(0, DL));
10858 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10859 }
10860
10861 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10862}
10863
10864SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10865 bool ImageStore) const {
10866 EVT StoreVT = VData.getValueType();
10867
10868 // No change for f16 and legal vector D16 types.
10869 if (!StoreVT.isVector())
10870 return VData;
10871
10872 SDLoc DL(VData);
10873 unsigned NumElements = StoreVT.getVectorNumElements();
10874
10875 if (Subtarget->hasUnpackedD16VMem()) {
10876 // We need to unpack the packed data to store.
10877 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10878 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10879
10880 EVT EquivStoreVT =
10881 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10882 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10883 return DAG.UnrollVectorOp(ZExt.getNode());
10884 }
10885
10886 // The sq block of gfx8.1 does not estimate register use correctly for d16
10887 // image store instructions. The data operand is computed as if it were not a
10888 // d16 image instruction.
10889 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10890 // Bitcast to i16
10891 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10892 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10893
10894 // Decompose into scalars
10895 SmallVector<SDValue, 4> Elts;
10896 DAG.ExtractVectorElements(IntVData, Elts);
10897
10898 // Group pairs of i16 into v2i16 and bitcast to i32
10899 SmallVector<SDValue, 4> PackedElts;
10900 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10901 SDValue Pair =
10902 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10903 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10904 PackedElts.push_back(IntPair);
10905 }
10906 if ((NumElements % 2) == 1) {
10907 // Handle v3i16
10908 unsigned I = Elts.size() / 2;
10909 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10910 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10911 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10912 PackedElts.push_back(IntPair);
10913 }
10914
10915 // Pad using UNDEF
10916 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10917
10918 // Build final vector
10919 EVT VecVT =
10920 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10921 return DAG.getBuildVector(VecVT, DL, PackedElts);
10922 }
10923
10924 if (NumElements == 3) {
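// Widen an odd-sized (v3) value to four elements by zero-extending its
// integer representation and bitcasting back to a vector.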
10925 EVT IntStoreVT =
10926 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10927 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10928
10929 EVT WidenedStoreVT = EVT::getVectorVT(
10930 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10931 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10932 WidenedStoreVT.getStoreSizeInBits());
10933 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10934 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10935 }
10936
10937 assert(isTypeLegal(StoreVT));
10938 return VData;
10939}
10940
10941SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10942 SelectionDAG &DAG) const {
10943 SDLoc DL(Op);
10944 SDValue Chain = Op.getOperand(0);
10945 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10946 MachineFunction &MF = DAG.getMachineFunction();
10947
10948 switch (IntrinsicID) {
10949 case Intrinsic::amdgcn_exp_compr: {
10950 if (!Subtarget->hasCompressedExport()) {
10951 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10952 DAG.getMachineFunction().getFunction(),
10953 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10954 }
10955 SDValue Src0 = Op.getOperand(4);
10956 SDValue Src1 = Op.getOperand(5);
10957 // Hack around illegal type on SI by directly selecting it.
10958 if (isTypeLegal(Src0.getValueType()))
10959 return SDValue();
10960
10961 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10962 SDValue Undef = DAG.getPOISON(MVT::f32);
10963 const SDValue Ops[] = {
10964 Op.getOperand(2), // tgt
10965 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10966 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10967 Undef, // src2
10968 Undef, // src3
10969 Op.getOperand(7), // vm
10970 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10971 Op.getOperand(3), // en
10972 Op.getOperand(0) // Chain
10973 };
10974
10975 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10976 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10977 }
10978
10979 case Intrinsic::amdgcn_struct_tbuffer_store:
10980 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10981 SDValue VData = Op.getOperand(2);
10982 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10983 if (IsD16)
10984 VData = handleD16VData(VData, DAG);
10985 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10986 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10987 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10988 SDValue Ops[] = {
10989 Chain,
10990 VData, // vdata
10991 Rsrc, // rsrc
10992 Op.getOperand(4), // vindex
10993 VOffset, // voffset
10994 SOffset, // soffset
10995 Offset, // offset
10996 Op.getOperand(7), // format
10997 Op.getOperand(8), // cachepolicy, swizzled buffer
10998 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10999 };
11000 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11001 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11002 MemSDNode *M = cast<MemSDNode>(Op);
11003 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11004 M->getMemoryVT(), M->getMemOperand());
11005 }
11006
11007 case Intrinsic::amdgcn_raw_tbuffer_store:
11008 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11009 SDValue VData = Op.getOperand(2);
11010 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11011 if (IsD16)
11012 VData = handleD16VData(VData, DAG);
11013 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11014 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11015 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11016 SDValue Ops[] = {
11017 Chain,
11018 VData, // vdata
11019 Rsrc, // rsrc
11020 DAG.getConstant(0, DL, MVT::i32), // vindex
11021 VOffset, // voffset
11022 SOffset, // soffset
11023 Offset, // offset
11024 Op.getOperand(6), // format
11025 Op.getOperand(7), // cachepolicy, swizzled buffer
11026 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11027 };
11028 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11029 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11030 MemSDNode *M = cast<MemSDNode>(Op);
11031 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11032 M->getMemoryVT(), M->getMemOperand());
11033 }
11034
11035 case Intrinsic::amdgcn_raw_buffer_store:
11036 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11037 case Intrinsic::amdgcn_raw_buffer_store_format:
11038 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11039 const bool IsFormat =
11040 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11041 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11042
11043 SDValue VData = Op.getOperand(2);
11044 EVT VDataVT = VData.getValueType();
11045 EVT EltType = VDataVT.getScalarType();
11046 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11047 if (IsD16) {
11048 VData = handleD16VData(VData, DAG);
11049 VDataVT = VData.getValueType();
11050 }
11051
11052 if (!isTypeLegal(VDataVT)) {
11053 VData =
11054 DAG.getNode(ISD::BITCAST, DL,
11055 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11056 }
11057
11058 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11059 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11060 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11061 SDValue Ops[] = {
11062 Chain,
11063 VData,
11064 Rsrc,
11065 DAG.getConstant(0, DL, MVT::i32), // vindex
11066 VOffset, // voffset
11067 SOffset, // soffset
11068 Offset, // offset
11069 Op.getOperand(6), // cachepolicy, swizzled buffer
11070 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11071 };
11072 unsigned Opc =
11073 !IsFormat ? AMDGPUISD::BUFFER_STORE
11074 : AMDGPUISD::BUFFER_STORE_FORMAT;
11075 MemSDNode *M = cast<MemSDNode>(Op);
11076
11077 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11078 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11079 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11080
11081 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11082 M->getMemoryVT(), M->getMemOperand());
11083 }
11084
11085 case Intrinsic::amdgcn_struct_buffer_store:
11086 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11087 case Intrinsic::amdgcn_struct_buffer_store_format:
11088 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11089 const bool IsFormat =
11090 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11091 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11092
11093 SDValue VData = Op.getOperand(2);
11094 EVT VDataVT = VData.getValueType();
11095 EVT EltType = VDataVT.getScalarType();
11096 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11097
11098 if (IsD16) {
11099 VData = handleD16VData(VData, DAG);
11100 VDataVT = VData.getValueType();
11101 }
11102
11103 if (!isTypeLegal(VDataVT)) {
11104 VData =
11105 DAG.getNode(ISD::BITCAST, DL,
11106 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11107 }
11108
11109 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11110 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11111 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11112 SDValue Ops[] = {
11113 Chain,
11114 VData,
11115 Rsrc,
11116 Op.getOperand(4), // vindex
11117 VOffset, // voffset
11118 SOffset, // soffset
11119 Offset, // offset
11120 Op.getOperand(7), // cachepolicy, swizzled buffer
11121 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11122 };
11123 unsigned Opc =
11124 !IsFormat ? AMDGPUISD::BUFFER_STORE
11125 : AMDGPUISD::BUFFER_STORE_FORMAT;
11126 MemSDNode *M = cast<MemSDNode>(Op);
11127
11128 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11129 EVT VDataType = VData.getValueType().getScalarType();
11130 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11131 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11132
11133 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11134 M->getMemoryVT(), M->getMemOperand());
11135 }
11136 case Intrinsic::amdgcn_raw_buffer_load_lds:
11137 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11138 case Intrinsic::amdgcn_struct_buffer_load_lds:
11139 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11140 if (!Subtarget->hasVMemToLDSLoad())
11141 return SDValue();
11142 unsigned Opc;
11143 bool HasVIndex =
11144 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11145 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11146 unsigned OpOffset = HasVIndex ? 1 : 0;
11147 SDValue VOffset = Op.getOperand(5 + OpOffset);
11148 bool HasVOffset = !isNullConstant(VOffset);
11149 unsigned Size = Op->getConstantOperandVal(4);
11150
11151 switch (Size) {
11152 default:
11153 return SDValue();
11154 case 1:
11155 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11156 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11157 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11158 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11159 break;
11160 case 2:
11161 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11162 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11163 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11164 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11165 break;
11166 case 4:
11167 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11168 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11169 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11170 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11171 break;
11172 case 12:
11173 if (!Subtarget->hasLDSLoadB96_B128())
11174 return SDValue();
11175 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11176 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11177 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11178 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11179 break;
11180 case 16:
11181 if (!Subtarget->hasLDSLoadB96_B128())
11182 return SDValue();
11183 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11184 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11185 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11186 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11187 break;
11188 }
11189
11190 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11191
11192 SmallVector<SDValue, 8> Ops;
11193
11194 if (HasVIndex && HasVOffset)
11195 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11196 {Op.getOperand(5), // VIndex
11197 VOffset}));
11198 else if (HasVIndex)
11199 Ops.push_back(Op.getOperand(5));
11200 else if (HasVOffset)
11201 Ops.push_back(VOffset);
11202
11203 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11204 Ops.push_back(Rsrc);
11205 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11206 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11207 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11208 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
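// The aux operand packs both the cache policy and the swizzle flag; split it
// into the separate cpol and swz operands the instruction expects.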
11209 Ops.push_back(DAG.getTargetConstant(
11210 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11211 DL, MVT::i8)); // cpol
11212 Ops.push_back(DAG.getTargetConstant(
11213 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11214 ? 1
11215 : 0,
11216 DL, MVT::i8)); // swz
11217 Ops.push_back(M0Val.getValue(0)); // Chain
11218 Ops.push_back(M0Val.getValue(1)); // Glue
11219
11220 auto *M = cast<MemSDNode>(Op);
11221 MachineMemOperand *LoadMMO = M->getMemOperand();
11222 // Don't set the offset value here because the pointer points to the base of
11223 // the buffer.
11224 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11225
11226 MachinePointerInfo StorePtrI = LoadPtrI;
11227 LoadPtrI.V = PoisonValue::get(
11228 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11229 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
11230 StorePtrI.V = nullptr;
11231 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11232 auto F = LoadMMO->getFlags() &
11233 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11234 LoadMMO =
11235 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11236 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11237
11238 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11239 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11240 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11241
11242 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11243 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11244
11245 return SDValue(Load, 0);
11246 }
11247 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11248 // for "trust me" that the remaining cases are global pointers until
11249 // such time as we can put two mem operands on an intrinsic.
11250 case Intrinsic::amdgcn_load_to_lds:
11251 case Intrinsic::amdgcn_global_load_lds: {
11252 if (!Subtarget->hasVMemToLDSLoad())
11253 return SDValue();
11254
11255 unsigned Opc;
11256 unsigned Size = Op->getConstantOperandVal(4);
11257 switch (Size) {
11258 default:
11259 return SDValue();
11260 case 1:
11261 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11262 break;
11263 case 2:
11264 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11265 break;
11266 case 4:
11267 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11268 break;
11269 case 12:
11270 if (!Subtarget->hasLDSLoadB96_B128())
11271 return SDValue();
11272 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11273 break;
11274 case 16:
11275 if (!Subtarget->hasLDSLoadB96_B128())
11276 return SDValue();
11277 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11278 break;
11279 }
11280
11281 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11282
11283 SmallVector<SDValue, 6> Ops;
11284
11285 SDValue Addr = Op.getOperand(2); // Global ptr
11286 SDValue VOffset;
11287 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11288 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11289 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11290 SDValue LHS = Addr.getOperand(0);
11291 SDValue RHS = Addr.getOperand(1);
11292
11293 if (LHS->isDivergent())
11294 std::swap(LHS, RHS);
11295
11296 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11297 RHS.getOperand(0).getValueType() == MVT::i32) {
11298 // add (i64 sgpr), (zero_extend (i32 vgpr))
11299 Addr = LHS;
11300 VOffset = RHS.getOperand(0);
11301 }
11302 }
11303
11304 Ops.push_back(Addr);
11305 if (!Addr->isDivergent()) {
11306 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11307 if (!VOffset)
11308 VOffset =
11309 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11310 DAG.getTargetConstant(0, DL, MVT::i32)),
11311 0);
11312 Ops.push_back(VOffset);
11313 }
11314
11315 Ops.push_back(Op.getOperand(5)); // Offset
11316 Ops.push_back(Op.getOperand(6)); // CPol
11317 Ops.push_back(M0Val.getValue(0)); // Chain
11318 Ops.push_back(M0Val.getValue(1)); // Glue
11319
11320 auto *M = cast<MemSDNode>(Op);
11321 MachineMemOperand *LoadMMO = M->getMemOperand();
11322 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11323 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11324 MachinePointerInfo StorePtrI = LoadPtrI;
11325 LoadPtrI.V = PoisonValue::get(
11326 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11327 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11328 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11329 auto F = LoadMMO->getFlags() &
11330 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11331 LoadMMO =
11332 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11333 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11334 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11335 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11336 LoadMMO->getAAInfo());
11337
11338 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11339 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11340
11341 return SDValue(Load, 0);
11342 }
11343 case Intrinsic::amdgcn_end_cf:
11344 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11345 Op->getOperand(2), Chain),
11346 0);
11347 case Intrinsic::amdgcn_s_barrier_init:
11348 case Intrinsic::amdgcn_s_barrier_signal_var: {
11349 // these two intrinsics have two operands: barrier pointer and member count
11350 SDValue Chain = Op->getOperand(0);
11351 SmallVector<SDValue, 2> Ops;
11352 SDValue BarOp = Op->getOperand(2);
11353 SDValue CntOp = Op->getOperand(3);
11354 SDValue M0Val;
11355 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11356 ? AMDGPU::S_BARRIER_INIT_M0
11357 : AMDGPU::S_BARRIER_SIGNAL_M0;
11358 // extract the BarrierID from bits 4-9 of BarOp
11359 SDValue BarID;
11360 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11361 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11362 BarID =
11363 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11364 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11365 0);
11366 // Member count should be put into M0[ShAmt:+6]
11367 // Barrier ID should be put into M0[5:0]
11368 M0Val =
11369 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11370 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11371 0);
11372 constexpr unsigned ShAmt = 16;
11373 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, M0Val,
11374 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11375
11376 M0Val = SDValue(
11377 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11378
11379 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11380
11381 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11382 return SDValue(NewMI, 0);
11383 }
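// Worked example of the M0 encoding built above (illustrative values only):
// with a barrier operand of 0x1A0 and a member count of 32, the barrier ID is
// (0x1A0 >> 4) & 0x3F = 0x1A, the member count contributes 32 << 16 = 0x200000,
// and M0 ends up as 0x20001A, i.e. the count sits in M0[21:16] and the ID in
// M0[5:0].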
11384 case Intrinsic::amdgcn_s_barrier_join: {
11385 // this intrinsic has one operand: the barrier pointer
11386 SDValue Chain = Op->getOperand(0);
11387 SmallVector<SDValue, 2> Ops;
11388 SDValue BarOp = Op->getOperand(2);
11389 unsigned Opc;
11390
11391 if (isa<ConstantSDNode>(BarOp)) {
11392 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11393 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11394
11395 // extract the BarrierID from bits 4-9 of the immediate
11396 unsigned BarID = (BarVal >> 4) & 0x3F;
11397 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11398 Ops.push_back(K);
11399 Ops.push_back(Chain);
11400 } else {
11401 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11402
11403 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11404 SDValue M0Val;
11405 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11406 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11407 M0Val =
11408 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11409 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11410 0);
11411 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11412 }
11413
11414 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11415 return SDValue(NewMI, 0);
11416 }
11417 case Intrinsic::amdgcn_s_prefetch_data: {
11418 // For non-global address space preserve the chain and remove the call.
11419 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11420 return Op.getOperand(0);
11421 return Op;
11422 }
11423 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11424 SDValue Ops[] = {
11425 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11426 Op.getOperand(3), // offset
11427 Op.getOperand(4), // length
11428 };
11429
11430 MemSDNode *M = cast<MemSDNode>(Op);
11431 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11432 Op->getVTList(), Ops, M->getMemoryVT(),
11433 M->getMemOperand());
11434 }
11435 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11436 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11437 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11438 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11439 SDValue Chain = Op->getOperand(0);
11440 SDValue Ptr = Op->getOperand(2);
11441 SDValue Val = Op->getOperand(3);
11442 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11443 Ptr, MII->getMemOperand());
11444 }
11445 default: {
11446 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11447 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11448 return lowerImage(Op, ImageDimIntr, DAG, true);
11449
11450 return Op;
11451 }
11452 }
11453}
11454
11455// Return whether the operation has the NoUnsignedWrap property.
11456static bool isNoUnsignedWrap(SDValue Addr) {
11457 return (Addr.getOpcode() == ISD::ADD &&
11458 Addr->getFlags().hasNoUnsignedWrap()) ||
11459 Addr->getOpcode() == ISD::OR;
11460}
11461
11463 EVT PtrVT) const {
11464 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11465}
11466
11468 EVT PtrVT) const {
11469 return true;
11470}
11471
11472// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11473// offset (the offset that is included in bounds checking and swizzling, to be
11474// split between the instruction's voffset and immoffset fields) and soffset
11475// (the offset that is excluded from bounds checking and swizzling, to go in
11476// the instruction's soffset field). This function takes the first kind of
11477// offset and figures out how to split it between voffset and immoffset.
11478std::pair<SDValue, SDValue>
11479SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11480 SDLoc DL(Offset);
11481 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11482 SDValue N0 = Offset;
11483 ConstantSDNode *C1 = nullptr;
11484
11485 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11486 N0 = SDValue();
11487 else if (DAG.isBaseWithConstantOffset(N0)) {
11488 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11489 // being added, so we can only safely match a 32-bit addition with no
11490 // unsigned overflow.
11491 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11492 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11493 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11494 N0 = N0.getOperand(0);
11495 }
11496 }
11497
11498 if (C1) {
11499 unsigned ImmOffset = C1->getZExtValue();
11500 // If the immediate value is too big for the immoffset field, put only bits
11501 // that would normally fit in the immoffset field. The remaining value that
11502 // is copied/added for the voffset field is a large power of 2, and it
11503 // stands a better chance of being CSEd with the copy/add for another
11504 // similar load/store.
11505 // However, do not do that rounding down if the overflow part would be a
11506 // negative number, as it appears to be illegal to have a negative offset in
11507 // the vgpr, even if adding the immediate offset makes it positive.
11508 unsigned Overflow = ImmOffset & ~MaxImm;
11509 ImmOffset -= Overflow;
11510 if ((int32_t)Overflow < 0) {
11511 Overflow += ImmOffset;
11512 ImmOffset = 0;
11513 }
11514 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11515 if (Overflow) {
11516 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11517 if (!N0)
11518 N0 = OverflowVal;
11519 else {
11520 SDValue Ops[] = {N0, OverflowVal};
11521 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11522 }
11523 }
11524 }
11525 if (!N0)
11526 N0 = DAG.getConstant(0, DL, MVT::i32);
11527 if (!C1)
11528 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11529 return {N0, SDValue(C1, 0)};
11530}
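// A small worked example of the split above, assuming a 12-bit MUBUF immediate
// field (i.e. MaxImm == 4095): a combined constant offset of 8200 is split into
// Overflow = 8200 & ~4095 = 8192, which is added to the voffset, and
// ImmOffset = 8, which goes in the instruction's immediate field. A combined
// offset of 4095 or less needs no voffset contribution at all.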
11531
11532// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11533// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11534// pointed to by Offsets.
11535void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11536 SelectionDAG &DAG, SDValue *Offsets,
11537 Align Alignment) const {
11538 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11539 SDLoc DL(CombinedOffset);
11540 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11541 uint32_t Imm = C->getZExtValue();
11542 uint32_t SOffset, ImmOffset;
11543 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11544 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11545 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11546 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11547 return;
11548 }
11549 }
11550 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11551 SDValue N0 = CombinedOffset.getOperand(0);
11552 SDValue N1 = CombinedOffset.getOperand(1);
11553 uint32_t SOffset, ImmOffset;
11554 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11555 if (Offset >= 0 &&
11556 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11557 Offsets[0] = N0;
11558 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11559 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11560 return;
11561 }
11562 }
11563
11564 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11565 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11566 : DAG.getConstant(0, DL, MVT::i32);
11567
11568 Offsets[0] = CombinedOffset;
11569 Offsets[1] = SOffsetZero;
11570 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11571}
11572
11573SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11574 SelectionDAG &DAG) const {
11575 if (!MaybePointer.getValueType().isScalarInteger())
11576 return MaybePointer;
11577
11578 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11579 return Rsrc;
11580}
11581
11582// Wrap a global or flat pointer into a buffer intrinsic using the flags
11583// specified in the intrinsic.
11584SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11585 SelectionDAG &DAG) const {
11586 SDLoc Loc(Op);
11587
11588 SDValue Pointer = Op->getOperand(1);
11589 SDValue Stride = Op->getOperand(2);
11590 SDValue NumRecords = Op->getOperand(3);
11591 SDValue Flags = Op->getOperand(4);
11592
11593 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11594 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11595 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11596 std::optional<uint32_t> ConstStride = std::nullopt;
11597 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11598 ConstStride = ConstNode->getZExtValue();
11599
11600 SDValue NewHighHalf = Masked;
11601 if (!ConstStride || *ConstStride != 0) {
11602 SDValue ShiftedStride;
11603 if (ConstStride) {
11604 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11605 } else {
11606 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11607 ShiftedStride =
11608 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11609 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11610 }
11611 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11612 }
11613
11614 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11615 NewHighHalf, NumRecords, Flags);
11616 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11617 return RsrcPtr;
11618}
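// Sketch of the descriptor produced above, assuming a 48-bit base pointer:
// word0 = base[31:0], word1 = base[47:32] | (Stride << 16), word2 = NumRecords,
// word3 = Flags. For example, a base of 0x1000 with Stride = 16 yields
// word1 = 0x00100000 before the four words are bitcast to the i128 resource.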
11619
11620// Handle 8 bit and 16 bit buffer loads
11621SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11622 EVT LoadVT, SDLoc DL,
11623 ArrayRef<SDValue> Ops,
11624 MachineMemOperand *MMO,
11625 bool IsTFE) const {
11626 EVT IntVT = LoadVT.changeTypeToInteger();
11627
11628 if (IsTFE) {
11629 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11630 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11631 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11632 MachineFunction &MF = DAG.getMachineFunction();
11633 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11634 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11635 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11636 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11637 DAG.getConstant(1, DL, MVT::i32));
11638 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11639 DAG.getConstant(0, DL, MVT::i32));
11640 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11641 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11642 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11643 }
11644
11645 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11646 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11647 : AMDGPUISD::BUFFER_LOAD_USHORT;
11648
11649 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11650 SDValue BufferLoad =
11651 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11652 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11653 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11654
11655 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11656}
11657
11658// Handle 8 bit and 16 bit buffer stores
11659SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11660 EVT VDataType, SDLoc DL,
11661 SDValue Ops[],
11662 MemSDNode *M) const {
11663 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11664 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11665
11666 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11667 Ops[1] = BufferStoreExt;
11668 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11669 : AMDGPUISD::BUFFER_STORE_SHORT;
11670 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11671 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11672 M->getMemOperand());
11673}
11674
11675static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11676 SDValue Op, const SDLoc &SL, EVT VT) {
11677 if (VT.bitsLT(Op.getValueType()))
11678 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11679
11680 switch (ExtType) {
11681 case ISD::SEXTLOAD:
11682 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11683 case ISD::ZEXTLOAD:
11684 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11685 case ISD::EXTLOAD:
11686 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11687 case ISD::NON_EXTLOAD:
11688 return Op;
11689 }
11690
11691 llvm_unreachable("invalid ext type");
11692}
11693
11694// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11695// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11696SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11697 DAGCombinerInfo &DCI) const {
11698 SelectionDAG &DAG = DCI.DAG;
11699 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11700 return SDValue();
11701
11702 // FIXME: Constant loads should all be marked invariant.
11703 unsigned AS = Ld->getAddressSpace();
11704 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11705 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11706 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11707 return SDValue();
11708
11709 // Don't do this early, since it may interfere with adjacent load merging for
11710 // illegal types. We can avoid losing alignment information for exotic types
11711 // pre-legalize.
11712 EVT MemVT = Ld->getMemoryVT();
11713 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11714 MemVT.getSizeInBits() >= 32)
11715 return SDValue();
11716
11717 SDLoc SL(Ld);
11718
11719 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11720 "unexpected vector extload");
11721
11722 // TODO: Drop only high part of range.
11723 SDValue Ptr = Ld->getBasePtr();
11724 SDValue NewLoad = DAG.getLoad(
11725 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11726 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11727 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11728 nullptr); // Drop ranges
11729
11730 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11731 if (MemVT.isFloatingPoint()) {
11732 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11733 "unexpected fp extload");
11734 TruncVT = MemVT.changeTypeToInteger();
11735 }
11736
11737 SDValue Cvt = NewLoad;
11738 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11739 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11740 DAG.getValueType(TruncVT));
11741 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11742 Ld->getExtensionType() == ISD::EXTLOAD) {
11743 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11744 } else {
11745 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
11746 }
11747
11748 EVT VT = Ld->getValueType(0);
11749 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11750
11751 DCI.AddToWorklist(Cvt.getNode());
11752
11753 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11754 // the appropriate extension from the 32-bit load.
11755 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11756 DCI.AddToWorklist(Cvt.getNode());
11757
11758 // Handle conversion back to floating point if necessary.
11759 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11760
11761 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11762}
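// Illustrative effect of the widening above (assuming a uniform, invariant,
// 4-byte aligned source): an i8 sign-extending scalar load from the constant
// address space is replaced by a 32-bit load whose result is narrowed with
// sign_extend_inreg to i8, so it can typically be selected to a single
// s_load_dword followed by an s_sext_i32_i8 instead of a per-lane byte load.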
11763
11764static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11765 const SIMachineFunctionInfo &Info) {
11766 // TODO: Should check if the address can definitely not access stack.
11767 if (Info.isEntryFunction())
11768 return Info.getUserSGPRInfo().hasFlatScratchInit();
11769 return true;
11770}
11771
11772SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11773 SDLoc DL(Op);
11774 LoadSDNode *Load = cast<LoadSDNode>(Op);
11775 ISD::LoadExtType ExtType = Load->getExtensionType();
11776 EVT MemVT = Load->getMemoryVT();
11777 MachineMemOperand *MMO = Load->getMemOperand();
11778
11779 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11780 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11781 return SDValue();
11782
11783 // FIXME: Copied from PPC
11784 // First, load into 32 bits, then truncate to 1 bit.
11785
11786 SDValue Chain = Load->getChain();
11787 SDValue BasePtr = Load->getBasePtr();
11788
11789 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11790
11791 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11792 RealMemVT, MMO);
11793
11794 if (!MemVT.isVector()) {
11795 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11796 NewLD.getValue(1)};
11797
11798 return DAG.getMergeValues(Ops, DL);
11799 }
11800
11801 SmallVector<SDValue, 3> Elts;
11802 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11803 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11804 DAG.getConstant(I, DL, MVT::i32));
11805
11806 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11807 }
11808
11809 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11810
11811 return DAG.getMergeValues(Ops, DL);
11812 }
11813
11814 if (!MemVT.isVector())
11815 return SDValue();
11816
11817 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11818 "Custom lowering for non-i32 vectors hasn't been implemented.");
11819
11820 Align Alignment = Load->getAlign();
11821 unsigned AS = Load->getAddressSpace();
11822 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11823 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11824 return SplitVectorLoad(Op, DAG);
11825 }
11826
11827 MachineFunction &MF = DAG.getMachineFunction();
11828 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11829 // If there is a possibility that a flat instruction accesses scratch memory
11830 // then we need to use the same legalization rules we use for private.
11831 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11832 !Subtarget->hasMultiDwordFlatScratchAddressing())
11833 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11834 ? AMDGPUAS::PRIVATE_ADDRESS
11835 : AMDGPUAS::GLOBAL_ADDRESS;
11836
11837 unsigned NumElements = MemVT.getVectorNumElements();
11838
11839 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11840 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11841 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11842 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11843 isMemOpHasNoClobberedMemOperand(Load))) {
11844 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11845 Alignment >= Align(4) && NumElements < 32) {
11846 if (MemVT.isPow2VectorType() ||
11847 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11848 return SDValue();
11849 return WidenOrSplitVectorLoad(Op, DAG);
11850 }
11851 // Non-uniform loads will be selected to MUBUF instructions, so they
11852 // have the same legalization requirements as global and private
11853 // loads.
11854 //
11855 }
11856 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11857 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11858 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11859 if (NumElements > 4)
11860 return SplitVectorLoad(Op, DAG);
11861 // v3 loads not supported on SI.
11862 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11863 return WidenOrSplitVectorLoad(Op, DAG);
11864
11865 // v3 and v4 loads are supported for private and global memory.
11866 return SDValue();
11867 }
11868 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11869 // Depending on the setting of the private_element_size field in the
11870 // resource descriptor, we can only make private accesses up to a certain
11871 // size.
11872 switch (Subtarget->getMaxPrivateElementSize()) {
11873 case 4: {
11874 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11875 return DAG.getMergeValues({Op0, Op1}, DL);
11876 }
11877 case 8:
11878 if (NumElements > 2)
11879 return SplitVectorLoad(Op, DAG);
11880 return SDValue();
11881 case 16:
11882 // Same as global/flat
11883 if (NumElements > 4)
11884 return SplitVectorLoad(Op, DAG);
11885 // v3 loads not supported on SI.
11886 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11887 return WidenOrSplitVectorLoad(Op, DAG);
11888
11889 return SDValue();
11890 default:
11891 llvm_unreachable("unsupported private_element_size");
11892 }
11893 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11894 unsigned Fast = 0;
11895 auto Flags = Load->getMemOperand()->getFlags();
11896 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11897 Load->getAlign(), Flags, &Fast) &&
11898 Fast > 1)
11899 return SDValue();
11900
11901 if (MemVT.isVector())
11902 return SplitVectorLoad(Op, DAG);
11903 }
11904
11905 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11906 MemVT, *Load->getMemOperand())) {
11907 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11908 return DAG.getMergeValues({Op0, Op1}, DL);
11909 }
11910
11911 return SDValue();
11912}
11913
11914SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11915 EVT VT = Op.getValueType();
11916 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11917 VT.getSizeInBits() == 512)
11918 return splitTernaryVectorOp(Op, DAG);
11919
11920 assert(VT.getSizeInBits() == 64);
11921
11922 SDLoc DL(Op);
11923 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11924
11925 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11926 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11927
11928 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11929 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11930
11931 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11932 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11933
11934 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11935
11936 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11937 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11938
11939 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11940
11941 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11942 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11943}
11944
11945// Catch division cases where we can use shortcuts with rcp and rsq
11946// instructions.
11947SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11948 SelectionDAG &DAG) const {
11949 SDLoc SL(Op);
11950 SDValue LHS = Op.getOperand(0);
11951 SDValue RHS = Op.getOperand(1);
11952 EVT VT = Op.getValueType();
11953 const SDNodeFlags Flags = Op->getFlags();
11954
11955 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11956
11957 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11958 // Without !fpmath accuracy information, we can't do more because we don't
11959 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11960 // f16 is always accurate enough
11961 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11962 return SDValue();
11963
11964 if (CLHS->isExactlyValue(1.0)) {
11965 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11966 // the CI documentation they have a worst-case error of 1 ulp.
11967 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11968 // use it as long as we aren't trying to use denormals.
11969 //
11970 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11971
11972 // 1.0 / sqrt(x) -> rsq(x)
11973
11974 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11975 // error seems really high at 2^29 ULP.
11976 // 1.0 / x -> rcp(x)
11977 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11978 }
11979
11980 // Same as for 1.0, but expand the sign out of the constant.
11981 if (CLHS->isExactlyValue(-1.0)) {
11982 // -1.0 / x -> rcp (fneg x)
11983 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11984 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11985 }
11986 }
11987
11988 // For f16 and bf16 require afn or arcp.
11989 // For f32 require afn.
11990 if (!AllowInaccurateRcp &&
11991 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11992 return SDValue();
11993
11994 // Turn into multiply by the reciprocal.
11995 // x / y -> x * (1.0 / y)
11996 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11997 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11998}
11999
12000SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12001 SelectionDAG &DAG) const {
12002 SDLoc SL(Op);
12003 SDValue X = Op.getOperand(0);
12004 SDValue Y = Op.getOperand(1);
12005 EVT VT = Op.getValueType();
12006 const SDNodeFlags Flags = Op->getFlags();
12007
12008 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12009 if (!AllowInaccurateDiv)
12010 return SDValue();
12011
12012 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12013 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12014
12015 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12016 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12017
12018 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12019 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12020 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12021 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12022 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12023 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12024}
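// The sequence above is the standard Newton-Raphson refinement of rcp(y): with
// r = RCP(y), each step computes r' = r + r * (1 - y * r), written here as
// fma(fma(-y, r, 1), r, r). Two such steps refine the reciprocal, the quotient
// is formed as q = x * r, and a final fma(fma(-y, q, x), r, q) corrects q.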
12025
12026static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12027 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12028 SDNodeFlags Flags) {
12029 if (GlueChain->getNumValues() <= 1) {
12030 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12031 }
12032
12033 assert(GlueChain->getNumValues() == 3);
12034
12035 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12036 switch (Opcode) {
12037 default:
12038 llvm_unreachable("no chain equivalent for opcode");
12039 case ISD::FMUL:
12040 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12041 break;
12042 }
12043
12044 return DAG.getNode(Opcode, SL, VTList,
12045 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12046 Flags);
12047}
12048
12049static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12050 EVT VT, SDValue A, SDValue B, SDValue C,
12051 SDValue GlueChain, SDNodeFlags Flags) {
12052 if (GlueChain->getNumValues() <= 1) {
12053 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12054 }
12055
12056 assert(GlueChain->getNumValues() == 3);
12057
12058 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12059 switch (Opcode) {
12060 default:
12061 llvm_unreachable("no chain equivalent for opcode");
12062 case ISD::FMA:
12063 Opcode = AMDGPUISD::FMA_W_CHAIN;
12064 break;
12065 }
12066
12067 return DAG.getNode(Opcode, SL, VTList,
12068 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12069 Flags);
12070}
12071
12072SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12073 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12074 return FastLowered;
12075
12076 SDLoc SL(Op);
12077 EVT VT = Op.getValueType();
12078 SDValue LHS = Op.getOperand(0);
12079 SDValue RHS = Op.getOperand(1);
12080
12081 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12082 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12083
12084 if (VT == MVT::bf16) {
12085 SDValue ExtDiv =
12086 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12087 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12088 DAG.getTargetConstant(0, SL, MVT::i32));
12089 }
12090
12091 assert(VT == MVT::f16);
12092
12093 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12094 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12095 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12096 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12097 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12098 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12099 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12100 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12101 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12102 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12103 // q16.u = opx(V_CVT_F16_F32, q32.u);
12104 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12105
12106 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12107 unsigned FMADOpCode =
12108 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12109 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12110 SDValue Rcp =
12111 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12112 SDValue Quot =
12113 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12114 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12115 Op->getFlags());
12116 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12117 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12118 Op->getFlags());
12119 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12120 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12121 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12122 DAG.getConstant(0xff800000, SL, MVT::i32));
12123 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12124 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12125 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12126 DAG.getTargetConstant(0, SL, MVT::i32));
12127 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12128 Op->getFlags());
12129}
12130
12131// Faster 2.5 ULP division that does not support denormals.
12132SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12133 SDNodeFlags Flags = Op->getFlags();
12134 SDLoc SL(Op);
12135 SDValue LHS = Op.getOperand(1);
12136 SDValue RHS = Op.getOperand(2);
12137
12138 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12139
12140 const APFloat K0Val(0x1p+96f);
12141 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12142
12143 const APFloat K1Val(0x1p-32f);
12144 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12145
12146 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12147
12148 EVT SetCCVT =
12149 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12150
12151 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12152
12153 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12154
12155 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12156
12157 // rcp does not support denormals.
12158 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12159
12160 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12161
12162 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12163}
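// Worked form of the scaling above: when |y| > 2^96 the denominator is
// pre-scaled by 2^-32 so that its reciprocal stays well inside the normal f32
// range, and the identity x/y == s * (x * rcp(y * s)), with s = 2^-32 (or 1.0
// when no scaling is needed), restores the final result.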
12164
12165// Returns immediate value for setting the F32 denorm mode when using the
12166// S_DENORM_MODE instruction.
12167static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12168 const SIMachineFunctionInfo *Info,
12169 const GCNSubtarget *ST) {
12170 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12171 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12172 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12173 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12174}
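// Illustrative encoding, assuming the usual MODE register layout where bits
// [1:0] hold the FP32 denorm setting and bits [3:2] the FP64/FP16 setting:
// SPDenormMode = 3 combined with a double-precision default of 3 yields an
// S_DENORM_MODE immediate of 0xF.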
12175
12176SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12177 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12178 return FastLowered;
12179
12180 // The selection matcher assumes anything with a chain selecting to a
12181 // mayRaiseFPException machine instruction. Since we're introducing a chain
12182 // here, we need to explicitly report nofpexcept for the regular fdiv
12183 // lowering.
12184 SDNodeFlags Flags = Op->getFlags();
12185 Flags.setNoFPExcept(true);
12186
12187 SDLoc SL(Op);
12188 SDValue LHS = Op.getOperand(0);
12189 SDValue RHS = Op.getOperand(1);
12190
12191 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12192
12193 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12194
12195 SDValue DenominatorScaled =
12196 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12197 SDValue NumeratorScaled =
12198 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12199
12200 // Denominator is scaled to not be denormal, so using rcp is ok.
12201 SDValue ApproxRcp =
12202 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12203 SDValue NegDivScale0 =
12204 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12205
12206 using namespace AMDGPU::Hwreg;
12207 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12208 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12209
12210 const MachineFunction &MF = DAG.getMachineFunction();
12211 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12212 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12213
12214 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12215 const bool HasDynamicDenormals =
12216 (DenormMode.Input == DenormalMode::Dynamic) ||
12217 (DenormMode.Output == DenormalMode::Dynamic);
12218
12219 SDValue SavedDenormMode;
12220
12221 if (!PreservesDenormals) {
12222 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12223 // lowering. The chain dependence is insufficient, and we need glue. We do
12224 // not need the glue variants in a strictfp function.
12225
12226 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12227
12228 SDValue Glue = DAG.getEntryNode();
12229 if (HasDynamicDenormals) {
12230 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12231 DAG.getVTList(MVT::i32, MVT::Glue),
12232 {BitField, Glue});
12233 SavedDenormMode = SDValue(GetReg, 0);
12234
12235 Glue = DAG.getMergeValues(
12236 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12237 }
12238
12239 SDNode *EnableDenorm;
12240 if (Subtarget->hasDenormModeInst()) {
12241 const SDValue EnableDenormValue =
12242 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12243
12244 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12245 EnableDenormValue)
12246 .getNode();
12247 } else {
12248 const SDValue EnableDenormValue =
12249 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12250 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12251 {EnableDenormValue, BitField, Glue});
12252 }
12253
12254 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12255 SDValue(EnableDenorm, 1)};
12256
12257 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12258 }
12259
12260 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12261 ApproxRcp, One, NegDivScale0, Flags);
12262
12263 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12264 ApproxRcp, Fma0, Flags);
12265
12266 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12267 Fma1, Flags);
12268
12269 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12270 NumeratorScaled, Mul, Flags);
12271
12272 SDValue Fma3 =
12273 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12274
12275 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12276 NumeratorScaled, Fma3, Flags);
12277
12278 if (!PreservesDenormals) {
12279 SDNode *DisableDenorm;
12280 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12281 const SDValue DisableDenormValue = getSPDenormModeValue(
12282 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12283
12284 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12285 DisableDenorm =
12286 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12287 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12288 .getNode();
12289 } else {
12290 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12291 const SDValue DisableDenormValue =
12292 HasDynamicDenormals
12293 ? SavedDenormMode
12294 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12295
12296 DisableDenorm = DAG.getMachineNode(
12297 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12298 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12299 }
12300
12301 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12302 SDValue(DisableDenorm, 0), DAG.getRoot());
12303 DAG.setRoot(OutputChain);
12304 }
12305
12306 SDValue Scale = NumeratorScaled.getValue(1);
12307 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12308 {Fma4, Fma1, Fma3, Scale}, Flags);
12309
12310 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12311}
12312
12313SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12314 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12315 return FastLowered;
12316
12317 SDLoc SL(Op);
12318 SDValue X = Op.getOperand(0);
12319 SDValue Y = Op.getOperand(1);
12320
12321 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12322
12323 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12324
12325 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12326
12327 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12328
12329 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12330
12331 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12332
12333 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12334
12335 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12336
12337 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12338
12339 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12340 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12341
12342 SDValue Fma4 =
12343 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12344
12345 SDValue Scale;
12346
12347 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12348 // Workaround a hardware bug on SI where the condition output from div_scale
12349 // is not usable.
12350
12351 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12352
12353 // Figure out which scale to use for div_fmas.
12354 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12355 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12356 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12357 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12358
12359 SDValue NumHi =
12360 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12361 SDValue DenHi =
12362 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12363
12364 SDValue Scale0Hi =
12365 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12366 SDValue Scale1Hi =
12367 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12368
12369 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12370 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12371 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12372 } else {
12373 Scale = DivScale1.getValue(1);
12374 }
12375
12376 SDValue Fmas =
12377 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12378
12379 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12380}
12381
12382SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12383 EVT VT = Op.getValueType();
12384
12385 if (VT == MVT::f32)
12386 return LowerFDIV32(Op, DAG);
12387
12388 if (VT == MVT::f64)
12389 return LowerFDIV64(Op, DAG);
12390
12391 if (VT == MVT::f16 || VT == MVT::bf16)
12392 return LowerFDIV16(Op, DAG);
12393
12394 llvm_unreachable("Unexpected type for fdiv");
12395}
12396
12397SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12398 SDLoc dl(Op);
12399 SDValue Val = Op.getOperand(0);
12400 EVT VT = Val.getValueType();
12401 EVT ResultExpVT = Op->getValueType(1);
12402 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12403
12404 SDValue Mant = DAG.getNode(
12405 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12406 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12407
12408 SDValue Exp = DAG.getNode(
12409 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12410 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12411
12412 if (Subtarget->hasFractBug()) {
12413 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12414 SDValue Inf =
12415 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12416
12417 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12418 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12419 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12420 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12421 }
12422
12423 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12424 return DAG.getMergeValues({Mant, CastExp}, dl);
12425}
12426
12427SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12428 SDLoc DL(Op);
12429 StoreSDNode *Store = cast<StoreSDNode>(Op);
12430 EVT VT = Store->getMemoryVT();
12431
12432 if (VT == MVT::i1) {
12433 return DAG.getTruncStore(
12434 Store->getChain(), DL,
12435 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12436 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12437 }
12438
12439 assert(VT.isVector() &&
12440 Store->getValue().getValueType().getScalarType() == MVT::i32);
12441
12442 unsigned AS = Store->getAddressSpace();
12443 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12444 Store->getAlign().value() < VT.getStoreSize() &&
12445 VT.getSizeInBits() > 32) {
12446 return SplitVectorStore(Op, DAG);
12447 }
12448
12449 MachineFunction &MF = DAG.getMachineFunction();
12450 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12451 // If there is a possibility that a flat instruction accesses scratch memory
12452 // then we need to use the same legalization rules we use for private.
12453 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12454 !Subtarget->hasMultiDwordFlatScratchAddressing())
12455 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12456 ? AMDGPUAS::PRIVATE_ADDRESS
12457 : AMDGPUAS::GLOBAL_ADDRESS;
12458
12459 unsigned NumElements = VT.getVectorNumElements();
12460 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12461 if (NumElements > 4)
12462 return SplitVectorStore(Op, DAG);
12463 // v3 stores not supported on SI.
12464 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12465 return SplitVectorStore(Op, DAG);
12466
12467 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12468 VT, *Store->getMemOperand()))
12469 return expandUnalignedStore(Store, DAG);
12470
12471 return SDValue();
12472 }
12473 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12474 switch (Subtarget->getMaxPrivateElementSize()) {
12475 case 4:
12476 return scalarizeVectorStore(Store, DAG);
12477 case 8:
12478 if (NumElements > 2)
12479 return SplitVectorStore(Op, DAG);
12480 return SDValue();
12481 case 16:
12482 if (NumElements > 4 ||
12483 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12484 return SplitVectorStore(Op, DAG);
12485 return SDValue();
12486 default:
12487 llvm_unreachable("unsupported private_element_size");
12488 }
12489 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12490 unsigned Fast = 0;
12491 auto Flags = Store->getMemOperand()->getFlags();
12492 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12493 Store->getAlign(), Flags, &Fast) &&
12494 Fast > 1)
12495 return SDValue();
12496
12497 if (VT.isVector())
12498 return SplitVectorStore(Op, DAG);
12499
12500 return expandUnalignedStore(Store, DAG);
12501 }
12502
12503 // Probably an invalid store. If so we'll end up emitting a selection error.
12504 return SDValue();
12505}
12506
12507// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12508SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12509 SDLoc SL(Op);
12510 assert(!Subtarget->has16BitInsts());
12511 SDNodeFlags Flags = Op->getFlags();
12512 SDValue Ext =
12513 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12514
12515 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12516 SDValue Sqrt =
12517 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12518
12519 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12520 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12521}
12522
12523SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12524 SDLoc DL(Op);
12525 SDNodeFlags Flags = Op->getFlags();
12526 MVT VT = Op.getValueType().getSimpleVT();
12527 const SDValue X = Op.getOperand(0);
12528
12529 if (allowApproxFunc(DAG, Flags)) {
12530 // Instruction is 1ulp but ignores denormals.
12531 return DAG.getNode(
12533 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12534 }
12535
12536 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12537 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12538
12539 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12540
12541 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12542
12543 SDValue SqrtX =
12544 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12545
12546 SDValue SqrtS;
12547 if (needsDenormHandlingF32(DAG, X, Flags)) {
12548 SDValue SqrtID =
12549 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12550 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12551
12552 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12553 SDValue SqrtSNextDownInt =
12554 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12555 DAG.getAllOnesConstant(DL, MVT::i32));
12556 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12557
12558 SDValue NegSqrtSNextDown =
12559 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12560
12561 SDValue SqrtVP =
12562 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12563
12564 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12565 DAG.getConstant(1, DL, MVT::i32));
12566 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12567
12568 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12569 SDValue SqrtVS =
12570 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12571
12572 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12573 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12574
12575 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12576 Flags);
12577
12578 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12579 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12580 Flags);
12581 } else {
12582 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12583
12584 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12585
12586 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12587 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12588 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12589
12590 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12591 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12592 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12593
12594 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12595 SDValue SqrtD =
12596 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12597 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12598 }
12599
12600 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12601
12602 SDValue ScaledDown =
12603 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12604
12605 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12606 SDValue IsZeroOrInf =
12607 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12608 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12609
12610 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12611}
12612
12613SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12614 // For double type, the SQRT and RSQ instructions don't have required
12615 // precision, we apply Goldschmidt's algorithm to improve the result:
12616 //
12617 // y0 = rsq(x)
12618 // g0 = x * y0
12619 // h0 = 0.5 * y0
12620 //
12621 // r0 = 0.5 - h0 * g0
12622 // g1 = g0 * r0 + g0
12623 // h1 = h0 * r0 + h0
12624 //
12625 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12626 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12627 // h2 = h1 * r1 + h1
12628 //
12629 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12630 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12631 //
12632 // sqrt(x) = g3
12633
12634 SDNodeFlags Flags = Op->getFlags();
12635
12636 SDLoc DL(Op);
12637
12638 SDValue X = Op.getOperand(0);
12639 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12640
12641 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12642
12643 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12644
12645 // Scale up input if it is too small.
12646 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12647 SDValue ScaleUp =
12648 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12649 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12650
12651 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12652
12653 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12654
12655 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12656 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12657
12658 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12659 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12660
12661 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12662
12663 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12664
12665 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12666 SDValue SqrtD0 =
12667 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12668
12669 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12670
12671 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12672 SDValue SqrtD1 =
12673 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12674
12675 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12676
12677 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12678 SDValue ScaleDown =
12679 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12680 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12681
12682 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12683 // with finite only or nsz because rsq(+/-0) = +/-inf
12684
12685 // TODO: Check for DAZ and expand to subnormals
12686 SDValue IsZeroOrInf =
12687 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12688 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12689
12690 // If x is +INF, +0, or -0, use its original value
12691 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12692 Flags);
12693}
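// A minimal scalar model of the Goldschmidt iteration above (illustrative
// only; a library rsqrt stands in for the hardware RSQ seed):
//   double y0 = 1.0 / std::sqrt(x);          // rsq(x)
//   double g = x * y0, h = 0.5 * y0;         // g0, h0
//   double r = 0.5 - h * g;                  // r0
//   g = g * r + g; h = h * r + h;            // g1, h1
//   double d = x - g * g; g = d * h + g;     // g2
//   d = x - g * g; g = d * h + g;            // g3 ~= sqrt(x)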
12694
12695SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12696 SDLoc DL(Op);
12697 EVT VT = Op.getValueType();
12698 SDValue Arg = Op.getOperand(0);
12699 SDValue TrigVal;
12700
12701 // Propagate fast-math flags so that the multiply we introduce can be folded
12702 // if Arg is already the result of a multiply by constant.
12703 auto Flags = Op->getFlags();
12704
12705 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12706
12707 if (Subtarget->hasTrigReducedRange()) {
12708 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12709 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12710 } else {
12711 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12712 }
12713
12714 switch (Op.getOpcode()) {
12715 case ISD::FCOS:
12716 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12717 case ISD::FSIN:
12718 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12719 default:
12720 llvm_unreachable("Wrong trig opcode");
12721 }
12722}
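// Worked form of the reduction above: the hardware SIN/COS units take their
// operand in revolutions rather than radians, so the argument is first
// multiplied by 1/(2*pi); e.g. Arg = pi becomes 0.5. On subtargets where
// hasTrigReducedRange() is set, an explicit FRACT additionally reduces the
// operand into [0, 1) before SIN_HW/COS_HW.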
12723
12724SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12725 SelectionDAG &DAG) const {
12726 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12727 assert(AtomicNode->isCompareAndSwap());
12728 unsigned AS = AtomicNode->getAddressSpace();
12729
12730 // No custom lowering required for local address space
12731 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12732 return Op;
12733
12734 // Non-local address space requires custom lowering for atomic compare
12735 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12736 SDLoc DL(Op);
12737 SDValue ChainIn = Op.getOperand(0);
12738 SDValue Addr = Op.getOperand(1);
12739 SDValue Old = Op.getOperand(2);
12740 SDValue New = Op.getOperand(3);
12741 EVT VT = Op.getValueType();
12742 MVT SimpleVT = VT.getSimpleVT();
12743 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12744
12745 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12746 SDValue Ops[] = {ChainIn, Addr, NewOld};
12747
12748 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12749 Op->getVTList(), Ops, VT,
12750 AtomicNode->getMemOperand());
12751}
12752
12753//===----------------------------------------------------------------------===//
12754// Custom DAG optimizations
12755//===----------------------------------------------------------------------===//
12756
12757SDValue
12758SITargetLowering::performUCharToFloatCombine(SDNode *N,
12759 DAGCombinerInfo &DCI) const {
12760 EVT VT = N->getValueType(0);
12761 EVT ScalarVT = VT.getScalarType();
12762 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12763 return SDValue();
12764
12765 SelectionDAG &DAG = DCI.DAG;
12766 SDLoc DL(N);
12767
12768 SDValue Src = N->getOperand(0);
12769 EVT SrcVT = Src.getValueType();
12770
12771 // TODO: We could try to match extracting the higher bytes, which would be
12772 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12773 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12774 // about in practice.
12775 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12776 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12777 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12778 DCI.AddToWorklist(Cvt.getNode());
12779
12780 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12781 if (ScalarVT != MVT::f32) {
12782 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12783 DAG.getTargetConstant(0, DL, MVT::i32));
12784 }
12785 return Cvt;
12786 }
12787 }
12788
12789 return SDValue();
12790}
12791
12792SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12793 DAGCombinerInfo &DCI) const {
12794 SDValue MagnitudeOp = N->getOperand(0);
12795 SDValue SignOp = N->getOperand(1);
12796
12797 // The generic combine for fcopysign + fp cast is too conservative with
12798 // vectors, and also gets confused by the splitting we will perform here, so
12799 // peek through FP casts.
12800 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12801 SignOp.getOpcode() == ISD::FP_ROUND)
12802 SignOp = SignOp.getOperand(0);
12803
12804 SelectionDAG &DAG = DCI.DAG;
12805 SDLoc DL(N);
12806 EVT SignVT = SignOp.getValueType();
12807
12808 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12809 // lower half with a copy.
12810 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12811 EVT MagVT = MagnitudeOp.getValueType();
12812
12813 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12814
12815 if (MagVT.getScalarType() == MVT::f64) {
12816 EVT F32VT = MagVT.isVector()
12817 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12818 : MVT::v2f32;
12819
12820 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12821
12822    SmallVector<SDValue, 8> NewElts;
12823    for (unsigned I = 0; I != NumElts; ++I) {
12824 SDValue MagLo =
12825 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12826 DAG.getConstant(2 * I, DL, MVT::i32));
12827 SDValue MagHi =
12828 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12829 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12830
12831 SDValue SignOpElt =
12832 MagVT.isVector()
12833              ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12834                            SignOp, DAG.getConstant(I, DL, MVT::i32))
12835 : SignOp;
12836
12837 SDValue HiOp =
12838 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12839
12840 SDValue Vector =
12841 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12842
12843 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12844 NewElts.push_back(NewElt);
12845 }
12846
12847 if (NewElts.size() == 1)
12848 return NewElts[0];
12849
12850 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12851 }
12852
12853 if (SignVT.getScalarType() != MVT::f64)
12854 return SDValue();
12855
12856  // Reduce the width of the sign operand; we only need the highest bit.
12857 //
12858 // fcopysign f64:x, f64:y ->
12859 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12860 // TODO: In some cases it might make sense to go all the way to f16.
12861
12862 EVT F32VT = MagVT.isVector()
12863 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12864 : MVT::v2f32;
12865
12866 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12867
12868 SmallVector<SDValue, 8> F32Signs;
12869 for (unsigned I = 0; I != NumElts; ++I) {
12870 // Take sign from odd elements of cast vector
12871 SDValue SignAsF32 =
12872 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12873 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12874 F32Signs.push_back(SignAsF32);
12875 }
12876
12877 SDValue NewSign =
12878 NumElts == 1
12879 ? F32Signs.back()
12880          : DAG.getNode(ISD::BUILD_VECTOR, DL,
12881                        EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12882 F32Signs);
12883
12884 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12885 NewSign);
12886}
12887
12888// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12889// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12890// bits
12891
12892// This is a variant of
12893// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12894//
12895// The normal DAG combiner will do this, but only if the add has one use since
12896// that would increase the number of instructions.
12897//
12898// This prevents us from seeing a constant offset that can be folded into a
12899// memory instruction's addressing mode. If we know the resulting add offset of
12900// a pointer can be folded into an addressing offset, we can replace the pointer
12901// operand with the add of new constant offset. This eliminates one of the uses,
12902// and may allow the remaining use to also be simplified.
12903//
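// Illustrative example (operand values assumed): if %p feeds a memory
// instruction whose addressing mode accepts the resulting immediate,
//   (shl (add %p, 8), 2) --> (add (shl %p, 2), 32)
// lets the +32 be folded into that instruction's offset field even though
// the add has other users.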
12904SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12905 EVT MemVT,
12906 DAGCombinerInfo &DCI) const {
12907 SDValue N0 = N->getOperand(0);
12908 SDValue N1 = N->getOperand(1);
12909
12910 // We only do this to handle cases where it's profitable when there are
12911 // multiple uses of the add, so defer to the standard combine.
12912 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
12913 return SDValue();
12914
12915 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12916 if (!CN1)
12917 return SDValue();
12918
12919 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12920 if (!CAdd)
12921 return SDValue();
12922
12923 SelectionDAG &DAG = DCI.DAG;
12924
12925 if (N0->getOpcode() == ISD::OR &&
12926 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12927 return SDValue();
12928
12929 // If the resulting offset is too large, we can't fold it into the
12930 // addressing mode offset.
12931 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12932 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12933
12934 AddrMode AM;
12935 AM.HasBaseReg = true;
12936 AM.BaseOffs = Offset.getSExtValue();
12937 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12938 return SDValue();
12939
12940 SDLoc SL(N);
12941 EVT VT = N->getValueType(0);
12942
12943 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12944 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12945
12946 SDNodeFlags Flags;
12947 Flags.setNoUnsignedWrap(
12948 N->getFlags().hasNoUnsignedWrap() &&
12949 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12950
12951 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
12952 // be sure that the new left operand is a proper base pointer.
12953 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12954}
12955
12956/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12957/// is offset by the chain and intrinsic ID. Theoretically we would also need to
12958/// check the specific intrinsic, but they all place the pointer operand first.
12959static unsigned getBasePtrIndex(const MemSDNode *N) {
12960 switch (N->getOpcode()) {
12961 case ISD::STORE:
12962  case ISD::INTRINSIC_W_CHAIN:
12963  case ISD::INTRINSIC_VOID:
12964    return 2;
12965 default:
12966 return 1;
12967 }
12968}
12969
12970SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12971 DAGCombinerInfo &DCI) const {
12972 SelectionDAG &DAG = DCI.DAG;
12973
12974 unsigned PtrIdx = getBasePtrIndex(N);
12975 SDValue Ptr = N->getOperand(PtrIdx);
12976
12977 // TODO: We could also do this for multiplies.
12978 if (Ptr.getOpcode() == ISD::SHL) {
12979 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12980 N->getMemoryVT(), DCI);
12981 if (NewPtr) {
12982 SmallVector<SDValue, 8> NewOps(N->ops());
12983
12984 NewOps[PtrIdx] = NewPtr;
12985 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12986 }
12987 }
12988
12989 return SDValue();
12990}
12991
12992static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12993 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12994 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12995 (Opc == ISD::XOR && Val == 0);
12996}
12997
12998// Break up a 64-bit bitwise operation with a constant into two 32-bit
12999// and/or/xor operations. This will typically happen anyway for a VALU 64-bit
13000// and, and it exposes other 32-bit integer combine opportunities since most
13001// 64-bit operations are decomposed this way. TODO: We won't want this for
13002// SALU, especially if the constant is an inline immediate.
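// Illustrative example (constant assumed): (and i64 %x, 0x0000ffff00000000)
// splits into (and lo32(%x), 0x00000000) and (and hi32(%x), 0x0000ffff); the
// all-zero low half then folds to the constant 0, leaving a single 32-bit AND.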
13003SDValue SITargetLowering::splitBinaryBitConstantOp(
13004 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13005 const ConstantSDNode *CRHS) const {
13006 uint64_t Val = CRHS->getZExtValue();
13007 uint32_t ValLo = Lo_32(Val);
13008 uint32_t ValHi = Hi_32(Val);
13009 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13010
13011 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13012       bitOpWithConstantIsReducible(Opc, ValHi)) ||
13013      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13014 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13015 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13016 !CRHS->user_begin()->isDivergent())
13017 return SDValue();
13018
13019 // If we need to materialize a 64-bit immediate, it will be split up later
13020 // anyway. Avoid creating the harder to understand 64-bit immediate
13021 // materialization.
13022 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13023 }
13024
13025 return SDValue();
13026}
13027
13028bool llvm::isBoolSGPR(SDValue V) {
13029  if (V.getValueType() != MVT::i1)
13030 return false;
13031 switch (V.getOpcode()) {
13032 default:
13033 break;
13034 case ISD::SETCC:
13035 case ISD::IS_FPCLASS:
13036  case AMDGPUISD::FP_CLASS:
13037    return true;
13038 case ISD::AND:
13039 case ISD::OR:
13040 case ISD::XOR:
13041 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13042 case ISD::SADDO:
13043 case ISD::UADDO:
13044 case ISD::SSUBO:
13045 case ISD::USUBO:
13046 case ISD::SMULO:
13047 case ISD::UMULO:
13048 return V.getResNo() == 1;
13049  case ISD::INTRINSIC_WO_CHAIN: {
13050    unsigned IntrinsicID = V.getConstantOperandVal(0);
13051 switch (IntrinsicID) {
13052 case Intrinsic::amdgcn_is_shared:
13053 case Intrinsic::amdgcn_is_private:
13054 return true;
13055 default:
13056 return false;
13057 }
13058
13059 return false;
13060 }
13061 }
13062 return false;
13063}
13064
13065// If a constant has all zeroes or all ones within each byte, return it.
13066// Otherwise return 0.
13067static uint32_t getConstantPermuteMask(uint32_t C) {
13068  // 0xff for any zero byte in the mask
13069 uint32_t ZeroByteMask = 0;
13070 if (!(C & 0x000000ff))
13071 ZeroByteMask |= 0x000000ff;
13072 if (!(C & 0x0000ff00))
13073 ZeroByteMask |= 0x0000ff00;
13074 if (!(C & 0x00ff0000))
13075 ZeroByteMask |= 0x00ff0000;
13076 if (!(C & 0xff000000))
13077 ZeroByteMask |= 0xff000000;
13078 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13079 if ((NonZeroByteMask & C) != NonZeroByteMask)
13080 return 0; // Partial bytes selected.
13081 return C;
13082}
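// For example (illustrative values): getConstantPermuteMask(0x00ff00ff)
// returns 0x00ff00ff since every byte is all-zeros or all-ones, while
// getConstantPermuteMask(0x00f000ff) returns 0 because byte 2 is only
// partially selected.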
13083
13084// Check if a node selects whole bytes from its operand 0 starting at a byte
13085// boundary while masking the rest. Returns the select mask as used by
13086// v_perm_b32, or ~0u if the node does not match.
13087// Note byte select encoding:
13088// value 0-3 selects corresponding source byte;
13089// value 0xc selects zero;
13090// value 0xff selects 0xff.
13091static uint32_t getPermuteMask(SDValue V) {
13092  assert(V.getValueSizeInBits() == 32);
13093
13094 if (V.getNumOperands() != 2)
13095 return ~0;
13096
13097 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13098 if (!N1)
13099 return ~0;
13100
13101 uint32_t C = N1->getZExtValue();
13102
13103 switch (V.getOpcode()) {
13104 default:
13105 break;
13106 case ISD::AND:
13107 if (uint32_t ConstMask = getConstantPermuteMask(C))
13108 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13109 break;
13110
13111 case ISD::OR:
13112 if (uint32_t ConstMask = getConstantPermuteMask(C))
13113 return (0x03020100 & ~ConstMask) | ConstMask;
13114 break;
13115
13116 case ISD::SHL:
13117 if (C % 8)
13118 return ~0;
13119
13120 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13121
13122 case ISD::SRL:
13123 if (C % 8)
13124 return ~0;
13125
13126 return uint32_t(0x0c0c0c0c03020100ull >> C);
13127 }
13128
13129 return ~0;
13130}
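// For example (illustrative values): (shl x, 8) yields the selector
// 0x0201000c (byte 0 becomes zero, bytes 1-3 take source bytes 0-2), and
// (and x, 0x00ff00ff) yields 0x0c020c00 (bytes 1 and 3 are zeroed).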
13131
13132SDValue SITargetLowering::performAndCombine(SDNode *N,
13133 DAGCombinerInfo &DCI) const {
13134 if (DCI.isBeforeLegalize())
13135 return SDValue();
13136
13137 SelectionDAG &DAG = DCI.DAG;
13138 EVT VT = N->getValueType(0);
13139 SDValue LHS = N->getOperand(0);
13140 SDValue RHS = N->getOperand(1);
13141
13142 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13143 if (VT == MVT::i64 && CRHS) {
13144 if (SDValue Split =
13145 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13146 return Split;
13147 }
13148
13149 if (CRHS && VT == MVT::i32) {
13150 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13151 // nb = number of trailing zeroes in mask
13152 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13153 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
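    // Illustrative example (constants assumed): (and (srl x, 4), 0xff0) has
    // Mask = 0xff0 (8 bits, nb = 4), so Offset = 4 + 4 = 8 and the combine
    // emits (shl (bfe x, 8, 8), 4), selecting bits 8..15 of x and shifting
    // them back into bit positions 4..11.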
13154 uint64_t Mask = CRHS->getZExtValue();
13155 unsigned Bits = llvm::popcount(Mask);
13156 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13157 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13158 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13159 unsigned Shift = CShift->getZExtValue();
13160 unsigned NB = CRHS->getAPIntValue().countr_zero();
13161 unsigned Offset = NB + Shift;
13162 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13163 SDLoc SL(N);
13164 SDValue BFE =
13165 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13166 DAG.getConstant(Offset, SL, MVT::i32),
13167 DAG.getConstant(Bits, SL, MVT::i32));
13168 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13169 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13170 DAG.getValueType(NarrowVT));
13171 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13172 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13173 return Shl;
13174 }
13175 }
13176 }
13177
13178 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13179 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13180 isa<ConstantSDNode>(LHS.getOperand(2))) {
13181 uint32_t Sel = getConstantPermuteMask(Mask);
13182 if (!Sel)
13183 return SDValue();
13184
13185 // Select 0xc for all zero bytes
13186 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13187 SDLoc DL(N);
13188 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13189 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13190 }
13191 }
13192
13193 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13194 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13195 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13196 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13197 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13198
13199 SDValue X = LHS.getOperand(0);
13200 SDValue Y = RHS.getOperand(0);
13201 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13202 !isTypeLegal(X.getValueType()))
13203 return SDValue();
13204
13205 if (LCC == ISD::SETO) {
13206 if (X != LHS.getOperand(1))
13207 return SDValue();
13208
13209 if (RCC == ISD::SETUNE) {
13210 const ConstantFPSDNode *C1 =
13211 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13212 if (!C1 || !C1->isInfinity() || C1->isNegative())
13213 return SDValue();
13214
13215        const uint32_t Mask = SIInstrFlags::N_NORMAL |
13216                              SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
13217                              SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
13218                              SIInstrFlags::P_SUBNORMAL;
13219
13220        static_assert(
13221            ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13222                SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13223             0x3ff) == Mask,
13224            "mask not equal");
13225
13226 SDLoc DL(N);
13227 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13228 DAG.getConstant(Mask, DL, MVT::i32));
13229 }
13230 }
13231 }
13232
13233 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13234 std::swap(LHS, RHS);
13235
13236 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13237 RHS.hasOneUse()) {
13238 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13239    // and (fcmp seto), (fp_class x, mask)
13240    //   -> fp_class x, mask & ~(p_nan | n_nan)
13241    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13242 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13243 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13244 (RHS.getOperand(0) == LHS.getOperand(0) &&
13245 LHS.getOperand(0) == LHS.getOperand(1))) {
13246 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13247 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13248 : Mask->getZExtValue() & OrdMask;
13249
13250 SDLoc DL(N);
13251 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13252 DAG.getConstant(NewMask, DL, MVT::i32));
13253 }
13254 }
13255
13256 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13257 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13258 // and x, (sext cc from i1) => select cc, x, 0
13259 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13260 std::swap(LHS, RHS);
13261 if (isBoolSGPR(RHS.getOperand(0)))
13262 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13263 DAG.getConstant(0, SDLoc(N), MVT::i32));
13264 }
13265
13266 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13267 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13268 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13269 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13270 uint32_t LHSMask = getPermuteMask(LHS);
13271 uint32_t RHSMask = getPermuteMask(RHS);
13272 if (LHSMask != ~0u && RHSMask != ~0u) {
13273 // Canonicalize the expression in an attempt to have fewer unique masks
13274 // and therefore fewer registers used to hold the masks.
13275 if (LHSMask > RHSMask) {
13276 std::swap(LHSMask, RHSMask);
13277 std::swap(LHS, RHS);
13278 }
13279
13280      // Select 0xc for each lane actually used from the source operand. In the
13281      // mask, zero bytes are 0xc, 0xff bytes are 0xff, and lanes are 0-3.
13282 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13283 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13284
13285      // Check if we need to combine values from two sources within a byte.
13286 if (!(LHSUsedLanes & RHSUsedLanes) &&
13287 // If we select high and lower word keep it for SDWA.
13288 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13289 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13290        // Each byte in each mask is either a selector value 0-3, or has higher
13291        // bits set in either of the masks: 0xff for the constant 0xff or 0x0c
13292        // for zero. If 0x0c appears in either mask, the result byte must be 0x0c.
13293        // Otherwise the mask that is not 0xff wins. ANDing both masks gives the
13294        // correct result, except that zero bytes must be fixed up to exactly 0x0c.
13295 uint32_t Mask = LHSMask & RHSMask;
13296 for (unsigned I = 0; I < 32; I += 8) {
13297 uint32_t ByteSel = 0xff << I;
13298 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13299 Mask &= (0x0c << I) & 0xffffffff;
13300 }
13301
13302 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13303 // or 0x0c.
13304 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13305 SDLoc DL(N);
13306
13307 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13308 RHS.getOperand(0),
13309 DAG.getConstant(Sel, DL, MVT::i32));
13310 }
13311 }
13312 }
13313
13314 return SDValue();
13315}
13316
13317// A key component of v_perm is a mapping between byte position of the src
13318// operands, and the byte position of the dest. To provide such, we need: 1. the
13319// node that provides x byte of the dest of the OR, and 2. the byte of the node
13320// used to provide that x byte. calculateByteProvider finds which node provides
13321// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13322// and finds an ultimate src and byte position. For example, the supported
13323// LoadCombine pattern for vector loads is as follows
13324// t1
13325// or
13326// / \
13327// t2 t3
13328// zext shl
13329// | | \
13330// t4 t5 16
13331// or anyext
13332// / \ |
13333// t6 t7 t8
13334// srl shl or
13335// / | / \ / \
13336// t9 t10 t11 t12 t13 t14
13337// trunc* 8 trunc* 8 and and
13338// | | / | | \
13339// t15 t16 t17 t18 t19 t20
13340// trunc* 255 srl -256
13341// | / \
13342// t15 t15 16
13343//
13344// *In this example, the truncs are from i32->i16
13345//
13346// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13347// respectively. calculateSrcByte would find (given node) -> ultimate src &
13348// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13349// After finding the mapping, we can combine the tree into vperm t15, t16,
13350// 0x05000407
13351
13352// Find the source and byte position from a node.
13353// \p DestByte is the byte position of the dest of the or that the src
13354// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13355// byte of the or's dest. \p Depth tracks how many recursive iterations we have
13356// performed.
13357static const std::optional<ByteProvider<SDValue>>
13358calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13359 unsigned Depth = 0) {
13360 // We may need to recursively traverse a series of SRLs
13361 if (Depth >= 6)
13362 return std::nullopt;
13363
13364 if (Op.getValueSizeInBits() < 8)
13365 return std::nullopt;
13366
13367 if (Op.getValueType().isVector())
13368 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13369
13370 switch (Op->getOpcode()) {
13371 case ISD::TRUNCATE: {
13372 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13373 }
13374
13375 case ISD::SIGN_EXTEND:
13376 case ISD::ZERO_EXTEND:
13377  case ISD::SIGN_EXTEND_INREG: {
13378    SDValue NarrowOp = Op->getOperand(0);
13379 auto NarrowVT = NarrowOp.getValueType();
13380 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13381 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13382 NarrowVT = VTSign->getVT();
13383 }
13384 if (!NarrowVT.isByteSized())
13385 return std::nullopt;
13386 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13387
13388 if (SrcIndex >= NarrowByteWidth)
13389 return std::nullopt;
13390 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13391 }
13392
13393 case ISD::SRA:
13394 case ISD::SRL: {
13395 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13396 if (!ShiftOp)
13397 return std::nullopt;
13398
13399 uint64_t BitShift = ShiftOp->getZExtValue();
13400
13401 if (BitShift % 8 != 0)
13402 return std::nullopt;
13403
13404 SrcIndex += BitShift / 8;
13405
13406 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13407 }
13408
13409 default: {
13410 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13411 }
13412 }
13413 llvm_unreachable("fully handled switch");
13414}
13415
13416// For a byte position in the result of an Or, traverse the tree and find the
13417// node (and the byte of the node) which ultimately provides this {Or,
13418// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13419// the byte position of the Op that corresponds with the originally requested
13420// byte of the Or \p Depth tracks how many recursive iterations we have
13421// byte of the Or. \p Depth tracks how many recursive iterations we have
13422static const std::optional<ByteProvider<SDValue>>
13423calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13424 unsigned StartingIndex = 0) {
13425 // Finding Src tree of RHS of or typically requires at least 1 additional
13426 // depth
13427 if (Depth > 6)
13428 return std::nullopt;
13429
13430 unsigned BitWidth = Op.getScalarValueSizeInBits();
13431 if (BitWidth % 8 != 0)
13432 return std::nullopt;
13433 if (Index > BitWidth / 8 - 1)
13434 return std::nullopt;
13435
13436 bool IsVec = Op.getValueType().isVector();
13437 switch (Op.getOpcode()) {
13438 case ISD::OR: {
13439 if (IsVec)
13440 return std::nullopt;
13441
13442 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13443 StartingIndex);
13444 if (!RHS)
13445 return std::nullopt;
13446 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13447 StartingIndex);
13448 if (!LHS)
13449 return std::nullopt;
13450 // A well formed Or will have two ByteProviders for each byte, one of which
13451 // is constant zero
13452 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13453 return std::nullopt;
13454 if (!LHS || LHS->isConstantZero())
13455 return RHS;
13456 if (!RHS || RHS->isConstantZero())
13457 return LHS;
13458 return std::nullopt;
13459 }
13460
13461 case ISD::AND: {
13462 if (IsVec)
13463 return std::nullopt;
13464
13465 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13466 if (!BitMaskOp)
13467 return std::nullopt;
13468
13469 uint32_t BitMask = BitMaskOp->getZExtValue();
13470 // Bits we expect for our StartingIndex
13471 uint32_t IndexMask = 0xFF << (Index * 8);
13472
13473 if ((IndexMask & BitMask) != IndexMask) {
13474 // If the result of the and partially provides the byte, then it
13475 // is not well formatted
13476 if (IndexMask & BitMask)
13477 return std::nullopt;
13478      return ByteProvider<SDValue>::getConstantZero();
13479    }
13480
13481 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13482 }
13483
13484 case ISD::FSHR: {
13485 if (IsVec)
13486 return std::nullopt;
13487
13488 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13489 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13490 if (!ShiftOp || Op.getValueType().isVector())
13491 return std::nullopt;
13492
13493 uint64_t BitsProvided = Op.getValueSizeInBits();
13494 if (BitsProvided % 8 != 0)
13495 return std::nullopt;
13496
13497 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13498 if (BitShift % 8)
13499 return std::nullopt;
13500
13501 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13502 uint64_t ByteShift = BitShift / 8;
13503
13504 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13505 uint64_t BytesProvided = BitsProvided / 8;
13506 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13507 NewIndex %= BytesProvided;
13508 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13509 }
13510
13511 case ISD::SRA:
13512 case ISD::SRL: {
13513 if (IsVec)
13514 return std::nullopt;
13515
13516 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13517 if (!ShiftOp)
13518 return std::nullopt;
13519
13520 uint64_t BitShift = ShiftOp->getZExtValue();
13521 if (BitShift % 8)
13522 return std::nullopt;
13523
13524 auto BitsProvided = Op.getScalarValueSizeInBits();
13525 if (BitsProvided % 8 != 0)
13526 return std::nullopt;
13527
13528 uint64_t BytesProvided = BitsProvided / 8;
13529 uint64_t ByteShift = BitShift / 8;
13530 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13531 // If the byte we are trying to provide (as tracked by index) falls in this
13532 // range, then the SRL provides the byte. The byte of interest of the src of
13533 // the SRL is Index + ByteShift
13534 return BytesProvided - ByteShift > Index
13535 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13536 Index + ByteShift)
13537               : ByteProvider<SDValue>::getConstantZero();
13538  }
13539
13540 case ISD::SHL: {
13541 if (IsVec)
13542 return std::nullopt;
13543
13544 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13545 if (!ShiftOp)
13546 return std::nullopt;
13547
13548 uint64_t BitShift = ShiftOp->getZExtValue();
13549 if (BitShift % 8 != 0)
13550 return std::nullopt;
13551 uint64_t ByteShift = BitShift / 8;
13552
13553 // If we are shifting by an amount greater than (or equal to)
13554 // the index we are trying to provide, then it provides 0s. If not,
13555    // then these bytes are not definitively 0s, and the corresponding byte
13556 // of interest is Index - ByteShift of the src
13557 return Index < ByteShift
13558               ? ByteProvider<SDValue>::getConstantZero()
13559               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13560 Depth + 1, StartingIndex);
13561 }
13562 case ISD::ANY_EXTEND:
13563 case ISD::SIGN_EXTEND:
13564 case ISD::ZERO_EXTEND:
13565  case ISD::SIGN_EXTEND_INREG:
13566  case ISD::AssertZext:
13567 case ISD::AssertSext: {
13568 if (IsVec)
13569 return std::nullopt;
13570
13571 SDValue NarrowOp = Op->getOperand(0);
13572 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13573 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13574 Op->getOpcode() == ISD::AssertZext ||
13575 Op->getOpcode() == ISD::AssertSext) {
13576 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13577 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13578 }
13579 if (NarrowBitWidth % 8 != 0)
13580 return std::nullopt;
13581 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13582
13583 if (Index >= NarrowByteWidth)
13584 return Op.getOpcode() == ISD::ZERO_EXTEND
13585 ? std::optional<ByteProvider<SDValue>>(
13586                       ByteProvider<SDValue>::getConstantZero())
13587                 : std::nullopt;
13588 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13589 }
13590
13591 case ISD::TRUNCATE: {
13592 if (IsVec)
13593 return std::nullopt;
13594
13595 uint64_t NarrowByteWidth = BitWidth / 8;
13596
13597 if (NarrowByteWidth >= Index) {
13598 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13599 StartingIndex);
13600 }
13601
13602 return std::nullopt;
13603 }
13604
13605 case ISD::CopyFromReg: {
13606 if (BitWidth / 8 > Index)
13607 return calculateSrcByte(Op, StartingIndex, Index);
13608
13609 return std::nullopt;
13610 }
13611
13612 case ISD::LOAD: {
13613 auto *L = cast<LoadSDNode>(Op.getNode());
13614
13615 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13616 if (NarrowBitWidth % 8 != 0)
13617 return std::nullopt;
13618 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13619
13620    // If the width of the load does not reach the byte we are trying to provide
13621 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13622 // question
13623 if (Index >= NarrowByteWidth) {
13624 return L->getExtensionType() == ISD::ZEXTLOAD
13625 ? std::optional<ByteProvider<SDValue>>(
13626                       ByteProvider<SDValue>::getConstantZero())
13627                 : std::nullopt;
13628 }
13629
13630 if (NarrowByteWidth > Index) {
13631 return calculateSrcByte(Op, StartingIndex, Index);
13632 }
13633
13634 return std::nullopt;
13635 }
13636
13637 case ISD::BSWAP: {
13638 if (IsVec)
13639 return std::nullopt;
13640
13641 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13642 Depth + 1, StartingIndex);
13643 }
13644
13645  case ISD::EXTRACT_VECTOR_ELT: {
13646    auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13647 if (!IdxOp)
13648 return std::nullopt;
13649 auto VecIdx = IdxOp->getZExtValue();
13650 auto ScalarSize = Op.getScalarValueSizeInBits();
13651 if (ScalarSize < 32)
13652 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13653 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13654 StartingIndex, Index);
13655 }
13656
13657 case AMDGPUISD::PERM: {
13658 if (IsVec)
13659 return std::nullopt;
13660
13661 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13662 if (!PermMask)
13663 return std::nullopt;
13664
13665 auto IdxMask =
13666 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13667 if (IdxMask > 0x07 && IdxMask != 0x0c)
13668 return std::nullopt;
13669
13670 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13671 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13672
13673 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13674                           : ByteProvider<SDValue>(
13675                                 ByteProvider<SDValue>::getConstantZero());
13676  }
13677
13678 default: {
13679 return std::nullopt;
13680 }
13681 }
13682
13683 llvm_unreachable("fully handled switch");
13684}
13685
13686// Returns true if the Operand is a scalar that is extended or loaded from a
13687static bool isExtendedFrom16Bits(SDValue &Operand) {
13688
13689 switch (Operand.getOpcode()) {
13690 case ISD::ANY_EXTEND:
13691 case ISD::SIGN_EXTEND:
13692 case ISD::ZERO_EXTEND: {
13693 auto OpVT = Operand.getOperand(0).getValueType();
13694 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13695 }
13696 case ISD::LOAD: {
13697 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13698 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13699 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13700 ExtType == ISD::EXTLOAD) {
13701 auto MemVT = L->getMemoryVT();
13702 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13703 }
13704 return L->getMemoryVT().getSizeInBits() == 16;
13705 }
13706 default:
13707 return false;
13708 }
13709}
13710
13711// Returns true if the mask selects consecutive bytes, and the first byte
13712// begins at an even (16-bit aligned) byte offset from the 0th byte
13713static bool addresses16Bits(int Mask) {
13714 int Low8 = Mask & 0xff;
13715 int Hi8 = (Mask & 0xff00) >> 8;
13716
13717 assert(Low8 < 8 && Hi8 < 8);
13718 // Are the bytes contiguous in the order of increasing addresses.
13719 bool IsConsecutive = (Hi8 - Low8 == 1);
13720  // Is the first byte at a location that is aligned for 16 bit instructions.
13721 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13722 // In this case, we still need code to extract the 16 bit operand, so it
13723 // is better to use i8 v_perm
13724 bool Is16Aligned = !(Low8 % 2);
13725
13726 return IsConsecutive && Is16Aligned;
13727}
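// For example (illustrative masks): addresses16Bits(0x0100) is true (bytes 0
// and 1 of the source, 16-bit aligned), while addresses16Bits(0x0201) is
// false because the pair starts at the odd byte 1.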
13728
13729// Do not lower into v_perm if the operands are actually 16 bit
13730// and the selected bits (based on PermMask) correspond with two
13731// easily addressable 16 bit operands.
13732static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13733                                SDValue &OtherOp) {
13734 int Low16 = PermMask & 0xffff;
13735 int Hi16 = (PermMask & 0xffff0000) >> 16;
13736
13737 auto TempOp = peekThroughBitcasts(Op);
13738 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13739
13740 auto OpIs16Bit =
13741 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13742 if (!OpIs16Bit)
13743 return true;
13744
13745 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13746 isExtendedFrom16Bits(TempOtherOp);
13747 if (!OtherOpIs16Bit)
13748 return true;
13749
13750 // Do we cleanly address both
13751 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13752}
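// For example (assuming both operands really are 16-bit values): a PermMask
// of 0x05040100 pairs bytes {0,1} of one source with bytes {0,1} of the
// other, so both halves are cleanly addressable and this returns false, while
// 0x05030100 pairs non-consecutive bytes in the high half and returns true.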
13753
13754static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13755                                  unsigned DWordOffset) {
13756 SDValue Ret;
13757
13758 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13759 // ByteProvider must be at least 8 bits
13760 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13761
13762 if (TypeSize <= 32)
13763 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13764
13765 if (Src.getValueType().isVector()) {
13766 auto ScalarTySize = Src.getScalarValueSizeInBits();
13767 auto ScalarTy = Src.getValueType().getScalarType();
13768 if (ScalarTySize == 32) {
13769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13770 DAG.getConstant(DWordOffset, SL, MVT::i32));
13771 }
13772 if (ScalarTySize > 32) {
13773 Ret = DAG.getNode(
13774 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13775 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13776 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13777 if (ShiftVal)
13778 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13779 DAG.getConstant(ShiftVal, SL, MVT::i32));
13780 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13781 }
13782
13783 assert(ScalarTySize < 32);
13784 auto NumElements = TypeSize / ScalarTySize;
13785 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13786 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13787 auto NumElementsIn32 = 32 / ScalarTySize;
13788 auto NumAvailElements = DWordOffset < Trunc32Elements
13789 ? NumElementsIn32
13790 : NumElements - NormalizedTrunc;
13791
13792    SmallVector<SDValue, 4> VecSrcs;
13793    DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13794 NumAvailElements);
13795
13796 Ret = DAG.getBuildVector(
13797 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13798 VecSrcs);
13799 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13800 }
13801
13802 /// Scalar Type
13803 auto ShiftVal = 32 * DWordOffset;
13804 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13805 DAG.getConstant(ShiftVal, SL, MVT::i32));
13806 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13807}
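// For illustration (types assumed): for an i64 scalar Src, DWordOffset == 1
// shifts right by 32 and truncates, returning the high 32 bits as an i32; for
// a v4i32 Src it simply extracts element DWordOffset.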
13808
13809static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13810  SelectionDAG &DAG = DCI.DAG;
13811 [[maybe_unused]] EVT VT = N->getValueType(0);
13812  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13813
13814 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13815 assert(VT == MVT::i32);
13816 for (int i = 0; i < 4; i++) {
13817 // Find the ByteProvider that provides the ith byte of the result of OR
13818 std::optional<ByteProvider<SDValue>> P =
13819 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13820 // TODO support constantZero
13821 if (!P || P->isConstantZero())
13822 return SDValue();
13823
13824 PermNodes.push_back(*P);
13825 }
13826 if (PermNodes.size() != 4)
13827 return SDValue();
13828
13829 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13830 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13831 uint64_t PermMask = 0x00000000;
13832 for (size_t i = 0; i < PermNodes.size(); i++) {
13833 auto PermOp = PermNodes[i];
13834 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13835 // by sizeof(Src2) = 4
13836 int SrcByteAdjust = 4;
13837
13838 // If the Src uses a byte from a different DWORD, then it corresponds
13839    // with a different source
13840 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13841 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13842 if (SecondSrc)
13843 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13844 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13845 return SDValue();
13846
13847 // Set the index of the second distinct Src node
13848 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13849 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13850 SrcByteAdjust = 0;
13851 }
13852 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13853    assert(!DAG.getDataLayout().isBigEndian());
13854    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13855 }
13856 SDLoc DL(N);
13857 SDValue Op = *PermNodes[FirstSrc.first].Src;
13858 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13859 assert(Op.getValueSizeInBits() == 32);
13860
13861 // Check that we are not just extracting the bytes in order from an op
13862 if (!SecondSrc) {
13863 int Low16 = PermMask & 0xffff;
13864 int Hi16 = (PermMask & 0xffff0000) >> 16;
13865
13866 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13867 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13868
13869 // The perm op would really just produce Op. So combine into Op
13870 if (WellFormedLow && WellFormedHi)
13871 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13872 }
13873
13874 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13875
13876 if (SecondSrc) {
13877 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13878 assert(OtherOp.getValueSizeInBits() == 32);
13879 }
13880
13881 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13882
13883 assert(Op.getValueType().isByteSized() &&
13884 OtherOp.getValueType().isByteSized());
13885
13886 // If the ultimate src is less than 32 bits, then we will only be
13887 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13888 // CalculateByteProvider would not have returned Op as source if we
13889 // used a byte that is outside its ValueType. Thus, we are free to
13890    // ANY_EXTEND as the extended bits are don't-cares.
13891 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13892 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13893
13894 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13895 DAG.getConstant(PermMask, DL, MVT::i32));
13896 }
13897 return SDValue();
13898}
13899
13900SDValue SITargetLowering::performOrCombine(SDNode *N,
13901 DAGCombinerInfo &DCI) const {
13902 SelectionDAG &DAG = DCI.DAG;
13903 SDValue LHS = N->getOperand(0);
13904 SDValue RHS = N->getOperand(1);
13905
13906 EVT VT = N->getValueType(0);
13907 if (VT == MVT::i1) {
13908 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13909 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13910 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13911 SDValue Src = LHS.getOperand(0);
13912 if (Src != RHS.getOperand(0))
13913 return SDValue();
13914
13915 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13916 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13917 if (!CLHS || !CRHS)
13918 return SDValue();
13919
13920 // Only 10 bits are used.
13921 static const uint32_t MaxMask = 0x3ff;
13922
13923 uint32_t NewMask =
13924 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13925 SDLoc DL(N);
13926 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13927 DAG.getConstant(NewMask, DL, MVT::i32));
13928 }
13929
13930 return SDValue();
13931 }
13932
13933 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13934  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13935      LHS.getOpcode() == AMDGPUISD::PERM &&
13936 isa<ConstantSDNode>(LHS.getOperand(2))) {
13937 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13938 if (!Sel)
13939 return SDValue();
13940
13941 Sel |= LHS.getConstantOperandVal(2);
13942 SDLoc DL(N);
13943 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13944 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13945 }
13946
13947 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13948 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13949 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13950 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13951
13952 // If all the uses of an or need to extract the individual elements, do not
13953 // attempt to lower into v_perm
13954 auto usesCombinedOperand = [](SDNode *OrUse) {
13955 // If we have any non-vectorized use, then it is a candidate for v_perm
13956 if (OrUse->getOpcode() != ISD::BITCAST ||
13957 !OrUse->getValueType(0).isVector())
13958 return true;
13959
13960 // If we have any non-vectorized use, then it is a candidate for v_perm
13961 for (auto *VUser : OrUse->users()) {
13962 if (!VUser->getValueType(0).isVector())
13963 return true;
13964
13965 // If the use of a vector is a store, then combining via a v_perm
13966 // is beneficial.
13967 // TODO -- whitelist more uses
13968 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13969 if (VUser->getOpcode() == VectorwiseOp)
13970 return true;
13971 }
13972 return false;
13973 };
13974
13975 if (!any_of(N->users(), usesCombinedOperand))
13976 return SDValue();
13977
13978 uint32_t LHSMask = getPermuteMask(LHS);
13979 uint32_t RHSMask = getPermuteMask(RHS);
13980
13981 if (LHSMask != ~0u && RHSMask != ~0u) {
13982 // Canonicalize the expression in an attempt to have fewer unique masks
13983 // and therefore fewer registers used to hold the masks.
13984 if (LHSMask > RHSMask) {
13985 std::swap(LHSMask, RHSMask);
13986 std::swap(LHS, RHS);
13987 }
13988
13989      // Select 0xc for each lane actually used from the source operand. In the
13990      // mask, zero bytes are 0xc, 0xff bytes are 0xff, and lanes are 0-3.
13991 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13992 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13993
13994      // Check if we need to combine values from two sources within a byte.
13995 if (!(LHSUsedLanes & RHSUsedLanes) &&
13996 // If we select high and lower word keep it for SDWA.
13997 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13998 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13999 // Kill zero bytes selected by other mask. Zero value is 0xc.
14000 LHSMask &= ~RHSUsedLanes;
14001 RHSMask &= ~LHSUsedLanes;
14002 // Add 4 to each active LHS lane
14003 LHSMask |= LHSUsedLanes & 0x04040404;
14004 // Combine masks
14005 uint32_t Sel = LHSMask | RHSMask;
14006 SDLoc DL(N);
14007
14008 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14009 RHS.getOperand(0),
14010 DAG.getConstant(Sel, DL, MVT::i32));
14011 }
14012 }
14013 if (LHSMask == ~0u || RHSMask == ~0u) {
14014 if (SDValue Perm = matchPERM(N, DCI))
14015 return Perm;
14016 }
14017 }
14018
14019 // Detect identity v2i32 OR and replace with identity source node.
14020 // Specifically an Or that has operands constructed from the same source node
14021 // via extract_vector_elt and build_vector. I.E.
14022 // v2i32 or(
14023 // v2i32 build_vector(
14024 // i32 extract_elt(%IdentitySrc, 0),
14025 // i32 0
14026 // ),
14027 // v2i32 build_vector(
14028 // i32 0,
14029 // i32 extract_elt(%IdentitySrc, 1)
14030 // ) )
14031 // =>
14032 // v2i32 %IdentitySrc
14033
14034 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14035 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14036
14037 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14038 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14039
14040 // Test for and normalise build vectors.
14041 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14042
14043 // Get the extract_vector_element operands.
14044 SDValue LEVE = LHS->getOperand(0);
14045 SDValue REVE = RHS->getOperand(1);
14046
14047 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14048          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14049        // Check that different elements from the same vector are
14050 // extracted.
14051 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14052 LEVE->getOperand(1) != REVE->getOperand(1)) {
14053 SDValue IdentitySrc = LEVE.getOperand(0);
14054 return IdentitySrc;
14055 }
14056 }
14057 }
14058 }
14059
14060 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14061 return SDValue();
14062
14063 // TODO: This could be a generic combine with a predicate for extracting the
14064 // high half of an integer being free.
14065
14066 // (or i64:x, (zero_extend i32:y)) ->
14067 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14068 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14069 RHS.getOpcode() != ISD::ZERO_EXTEND)
14070 std::swap(LHS, RHS);
14071
14072 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14073 SDValue ExtSrc = RHS.getOperand(0);
14074 EVT SrcVT = ExtSrc.getValueType();
14075 if (SrcVT == MVT::i32) {
14076 SDLoc SL(N);
14077 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14078 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14079
14080 DCI.AddToWorklist(LowOr.getNode());
14081 DCI.AddToWorklist(HiBits.getNode());
14082
14083 SDValue Vec =
14084 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14085 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14086 }
14087 }
14088
14089 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14090 if (CRHS) {
14091 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14092 N->getOperand(0), CRHS))
14093 return Split;
14094 }
14095
14096 return SDValue();
14097}
14098
14099SDValue SITargetLowering::performXorCombine(SDNode *N,
14100 DAGCombinerInfo &DCI) const {
14101 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14102 return RV;
14103
14104 SDValue LHS = N->getOperand(0);
14105 SDValue RHS = N->getOperand(1);
14106
14107 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14108 SelectionDAG &DAG = DCI.DAG;
14109
14110 EVT VT = N->getValueType(0);
14111 if (CRHS && VT == MVT::i64) {
14112 if (SDValue Split =
14113 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14114 return Split;
14115 }
14116
14117 // v2i32 (xor (vselect cc, x, y), K) ->
14118  //   (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14119 // replaced with source modifiers when the select is lowered to CNDMASK.
14120 unsigned Opc = LHS.getOpcode();
14121 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14122 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14123 CRHS && CRHS->getAPIntValue().isSignMask()) {
14124 SDValue CC = LHS->getOperand(0);
14125 SDValue TRUE = LHS->getOperand(1);
14126 SDValue FALSE = LHS->getOperand(2);
14127 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14128 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14129 SDValue XSelect =
14130 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14131 return XSelect;
14132 }
14133
14134 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14135 // fneg-like xors into 64-bit select.
14136 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14137 // This looks like an fneg, try to fold as a source modifier.
14138 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14139        shouldFoldFNegIntoSrc(N, LHS)) {
14140      // xor (select c, a, b), 0x80000000 ->
14141 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14142 SDLoc DL(N);
14143 SDValue CastLHS =
14144 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14145 SDValue CastRHS =
14146 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14147 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14148 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14149 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14150 LHS->getOperand(0), FNegLHS, FNegRHS);
14151 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14152 }
14153 }
14154
14155 return SDValue();
14156}
14157
14158SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14159 DAGCombinerInfo &DCI) const {
14160 if (!Subtarget->has16BitInsts() ||
14161 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14162 return SDValue();
14163
14164 EVT VT = N->getValueType(0);
14165 if (VT != MVT::i32)
14166 return SDValue();
14167
14168 SDValue Src = N->getOperand(0);
14169 if (Src.getValueType() != MVT::i16)
14170 return SDValue();
14171
14172 return SDValue();
14173}
14174
14175SDValue
14176SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14177 DAGCombinerInfo &DCI) const {
14178 SDValue Src = N->getOperand(0);
14179 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14180
14181 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14182 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14183 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14184 VTSign->getVT() == MVT::i8) ||
14185 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14186 VTSign->getVT() == MVT::i16))) {
14187 assert(Subtarget->hasScalarSubwordLoads() &&
14188 "s_buffer_load_{u8, i8} are supported "
14189 "in GFX12 (or newer) architectures.");
14190 EVT VT = Src.getValueType();
14191 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14192                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
14193                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
14194    SDLoc DL(N);
14195 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14196 SDValue Ops[] = {
14197 Src.getOperand(0), // source register
14198 Src.getOperand(1), // offset
14199 Src.getOperand(2) // cachePolicy
14200 };
14201 auto *M = cast<MemSDNode>(Src);
14202 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14203 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14204 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14205 return LoadVal;
14206 }
14207 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14208 VTSign->getVT() == MVT::i8) ||
14209 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14210 VTSign->getVT() == MVT::i16)) &&
14211 Src.hasOneUse()) {
14212 auto *M = cast<MemSDNode>(Src);
14213 SDValue Ops[] = {Src.getOperand(0), // Chain
14214 Src.getOperand(1), // rsrc
14215 Src.getOperand(2), // vindex
14216 Src.getOperand(3), // voffset
14217 Src.getOperand(4), // soffset
14218 Src.getOperand(5), // offset
14219 Src.getOperand(6), Src.getOperand(7)};
14220 // replace with BUFFER_LOAD_BYTE/SHORT
14221 SDVTList ResList =
14222 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14223 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14224                       ? AMDGPUISD::BUFFER_LOAD_BYTE
14225                       : AMDGPUISD::BUFFER_LOAD_SHORT;
14226    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14227 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14228 return DCI.DAG.getMergeValues(
14229 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14230 }
14231 return SDValue();
14232}
14233
14234SDValue SITargetLowering::performClassCombine(SDNode *N,
14235 DAGCombinerInfo &DCI) const {
14236 SelectionDAG &DAG = DCI.DAG;
14237 SDValue Mask = N->getOperand(1);
14238
14239 // fp_class x, 0 -> false
14240 if (isNullConstant(Mask))
14241 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14242
14243 if (N->getOperand(0).isUndef())
14244 return DAG.getUNDEF(MVT::i1);
14245
14246 return SDValue();
14247}
14248
14249SDValue SITargetLowering::performRcpCombine(SDNode *N,
14250 DAGCombinerInfo &DCI) const {
14251 EVT VT = N->getValueType(0);
14252 SDValue N0 = N->getOperand(0);
14253
14254 if (N0.isUndef()) {
14255 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14256 SDLoc(N), VT);
14257 }
14258
14259 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14260 N0.getOpcode() == ISD::SINT_TO_FP)) {
14261 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14262 N->getFlags());
14263 }
14264
14265 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14266 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14267 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14268 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14269 N->getFlags());
14270 }
14271
14272  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14273}
14274
14275bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14276                                       unsigned MaxDepth) const {
14277 unsigned Opcode = Op.getOpcode();
14278 if (Opcode == ISD::FCANONICALIZE)
14279 return true;
14280
14281 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14282 const auto &F = CFP->getValueAPF();
14283 if (F.isNaN() && F.isSignaling())
14284 return false;
14285 if (!F.isDenormal())
14286 return true;
14287
14288 DenormalMode Mode =
14289 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14290 return Mode == DenormalMode::getIEEE();
14291 }
14292
14293 // If source is a result of another standard FP operation it is already in
14294 // canonical form.
14295 if (MaxDepth == 0)
14296 return false;
14297
14298 switch (Opcode) {
14299 // These will flush denorms if required.
14300 case ISD::FADD:
14301 case ISD::FSUB:
14302 case ISD::FMUL:
14303 case ISD::FCEIL:
14304 case ISD::FFLOOR:
14305 case ISD::FMA:
14306 case ISD::FMAD:
14307 case ISD::FSQRT:
14308 case ISD::FDIV:
14309 case ISD::FREM:
14310 case ISD::FP_ROUND:
14311 case ISD::FP_EXTEND:
14312 case ISD::FP16_TO_FP:
14313 case ISD::FP_TO_FP16:
14314 case ISD::BF16_TO_FP:
14315 case ISD::FP_TO_BF16:
14316 case ISD::FLDEXP:
14317  case AMDGPUISD::FMUL_LEGACY:
14318  case AMDGPUISD::FMAD_FTZ:
14319  case AMDGPUISD::RCP:
14320 case AMDGPUISD::RSQ:
14321  case AMDGPUISD::RSQ_CLAMP:
14322  case AMDGPUISD::RCP_LEGACY:
14323  case AMDGPUISD::RCP_IFLAG:
14324  case AMDGPUISD::LOG:
14325 case AMDGPUISD::EXP:
14326  case AMDGPUISD::DIV_SCALE:
14327  case AMDGPUISD::DIV_FMAS:
14328  case AMDGPUISD::DIV_FIXUP:
14329  case AMDGPUISD::FRACT:
14330  case AMDGPUISD::CVT_PKRTZ_F16_F32:
14331  case AMDGPUISD::CVT_F32_UBYTE0:
14332  case AMDGPUISD::CVT_F32_UBYTE1:
14333  case AMDGPUISD::CVT_F32_UBYTE2:
14334  case AMDGPUISD::CVT_F32_UBYTE3:
14335  case AMDGPUISD::FP_TO_FP16:
14336 case AMDGPUISD::SIN_HW:
14337 case AMDGPUISD::COS_HW:
14338 return true;
14339
14340 // It can/will be lowered or combined as a bit operation.
14341 // Need to check their input recursively to handle.
14342 case ISD::FNEG:
14343 case ISD::FABS:
14344 case ISD::FCOPYSIGN:
14345 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14346
14347 case ISD::AND:
14348 if (Op.getValueType() == MVT::i32) {
14349 // Be careful as we only know it is a bitcast floating point type. It
14350 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14351 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14352 // is valid to optimize for all types.
14353 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14354 if (RHS->getZExtValue() == 0xffff0000) {
14355 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14356 }
14357 }
14358 }
14359 break;
14360
14361 case ISD::FSIN:
14362 case ISD::FCOS:
14363 case ISD::FSINCOS:
14364 return Op.getValueType().getScalarType() != MVT::f16;
14365
14366 case ISD::FMINNUM:
14367 case ISD::FMAXNUM:
14368 case ISD::FMINNUM_IEEE:
14369 case ISD::FMAXNUM_IEEE:
14370 case ISD::FMINIMUM:
14371 case ISD::FMAXIMUM:
14372 case ISD::FMINIMUMNUM:
14373 case ISD::FMAXIMUMNUM:
14374 case AMDGPUISD::CLAMP:
14375 case AMDGPUISD::FMED3:
14376 case AMDGPUISD::FMAX3:
14377 case AMDGPUISD::FMIN3:
14378  case AMDGPUISD::FMAXIMUM3:
14379  case AMDGPUISD::FMINIMUM3: {
14380    // FIXME: Shouldn't treat the generic operations differently based on these.
14381    // However, we aren't really required to flush the result from
14382    // minnum/maxnum.
14383
14384 // snans will be quieted, so we only need to worry about denormals.
14385 if (Subtarget->supportsMinMaxDenormModes() ||
14386 // FIXME: denormalsEnabledForType is broken for dynamic
14387 denormalsEnabledForType(DAG, Op.getValueType()))
14388 return true;
14389
14390 // Flushing may be required.
14391 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14392    // targets we need to check their inputs recursively.
14393
14394 // FIXME: Does this apply with clamp? It's implemented with max.
14395 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14396 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14397 return false;
14398 }
14399
14400 return true;
14401 }
14402 case ISD::SELECT: {
14403 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14404 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14405 }
14406 case ISD::BUILD_VECTOR: {
14407 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14408 SDValue SrcOp = Op.getOperand(i);
14409 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14410 return false;
14411 }
14412
14413 return true;
14414 }
14415  case ISD::EXTRACT_VECTOR_ELT:
14416  case ISD::EXTRACT_SUBVECTOR: {
14417    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14418 }
14419  case ISD::INSERT_VECTOR_ELT: {
14420    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14421 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14422 }
14423 case ISD::UNDEF:
14424 // Could be anything.
14425 return false;
14426
14427 case ISD::BITCAST:
14428 // TODO: This is incorrect as it loses track of the operand's type. We may
14429 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14430 // same bits that are canonicalized in one type need not be in the other.
14431 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14432 case ISD::TRUNCATE: {
14433 // Hack round the mess we make when legalizing extract_vector_elt
14434 if (Op.getValueType() == MVT::i16) {
14435 SDValue TruncSrc = Op.getOperand(0);
14436 if (TruncSrc.getValueType() == MVT::i32 &&
14437 TruncSrc.getOpcode() == ISD::BITCAST &&
14438 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14439 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14440 }
14441 }
14442 return false;
14443 }
14444 case ISD::INTRINSIC_WO_CHAIN: {
14445 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14446 // TODO: Handle more intrinsics
14447 switch (IntrinsicID) {
14448 case Intrinsic::amdgcn_cvt_pkrtz:
14449 case Intrinsic::amdgcn_cubeid:
14450 case Intrinsic::amdgcn_frexp_mant:
14451 case Intrinsic::amdgcn_fdot2:
14452 case Intrinsic::amdgcn_rcp:
14453 case Intrinsic::amdgcn_rsq:
14454 case Intrinsic::amdgcn_rsq_clamp:
14455 case Intrinsic::amdgcn_rcp_legacy:
14456 case Intrinsic::amdgcn_rsq_legacy:
14457 case Intrinsic::amdgcn_trig_preop:
14458 case Intrinsic::amdgcn_tanh:
14459 case Intrinsic::amdgcn_log:
14460 case Intrinsic::amdgcn_exp2:
14461 case Intrinsic::amdgcn_sqrt:
14462 return true;
14463 default:
14464 break;
14465 }
14466
14467 break;
14468 }
14469 default:
14470 break;
14471 }
14472
14473 // FIXME: denormalsEnabledForType is broken for dynamic
14474 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14475 DAG.isKnownNeverSNaN(Op);
14476}
14477
14478bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14479 unsigned MaxDepth) const {
14480 const MachineRegisterInfo &MRI = MF.getRegInfo();
14481 MachineInstr *MI = MRI.getVRegDef(Reg);
14482 unsigned Opcode = MI->getOpcode();
14483
14484 if (Opcode == AMDGPU::G_FCANONICALIZE)
14485 return true;
14486
14487 std::optional<FPValueAndVReg> FCR;
14488 // Constant splat (can be padded with undef) or scalar constant.
14489 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14490 if (FCR->Value.isSignaling())
14491 return false;
14492 if (!FCR->Value.isDenormal())
14493 return true;
14494
14495 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14496 return Mode == DenormalMode::getIEEE();
14497 }
14498
14499 if (MaxDepth == 0)
14500 return false;
14501
14502 switch (Opcode) {
14503 case AMDGPU::G_FADD:
14504 case AMDGPU::G_FSUB:
14505 case AMDGPU::G_FMUL:
14506 case AMDGPU::G_FCEIL:
14507 case AMDGPU::G_FFLOOR:
14508 case AMDGPU::G_FRINT:
14509 case AMDGPU::G_FNEARBYINT:
14510 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14511 case AMDGPU::G_INTRINSIC_TRUNC:
14512 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14513 case AMDGPU::G_FMA:
14514 case AMDGPU::G_FMAD:
14515 case AMDGPU::G_FSQRT:
14516 case AMDGPU::G_FDIV:
14517 case AMDGPU::G_FREM:
14518 case AMDGPU::G_FPOW:
14519 case AMDGPU::G_FPEXT:
14520 case AMDGPU::G_FLOG:
14521 case AMDGPU::G_FLOG2:
14522 case AMDGPU::G_FLOG10:
14523 case AMDGPU::G_FPTRUNC:
14524 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14525 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14526 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14527 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14528 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14529 return true;
14530 case AMDGPU::G_FNEG:
14531 case AMDGPU::G_FABS:
14532 case AMDGPU::G_FCOPYSIGN:
14533 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14534 case AMDGPU::G_FMINNUM:
14535 case AMDGPU::G_FMAXNUM:
14536 case AMDGPU::G_FMINNUM_IEEE:
14537 case AMDGPU::G_FMAXNUM_IEEE:
14538 case AMDGPU::G_FMINIMUM:
14539 case AMDGPU::G_FMAXIMUM:
14540 case AMDGPU::G_FMINIMUMNUM:
14541 case AMDGPU::G_FMAXIMUMNUM: {
14542 if (Subtarget->supportsMinMaxDenormModes() ||
14543 // FIXME: denormalsEnabledForType is broken for dynamic
14544 denormalsEnabledForType(MRI.getType(Reg), MF))
14545 return true;
14546
14547 [[fallthrough]];
14548 }
14549 case AMDGPU::G_BUILD_VECTOR:
14550 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14551 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14552 return false;
14553 return true;
14554 case AMDGPU::G_INTRINSIC:
14555 case AMDGPU::G_INTRINSIC_CONVERGENT:
14556 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14557 case Intrinsic::amdgcn_fmul_legacy:
14558 case Intrinsic::amdgcn_fmad_ftz:
14559 case Intrinsic::amdgcn_sqrt:
14560 case Intrinsic::amdgcn_fmed3:
14561 case Intrinsic::amdgcn_sin:
14562 case Intrinsic::amdgcn_cos:
14563 case Intrinsic::amdgcn_log:
14564 case Intrinsic::amdgcn_exp2:
14565 case Intrinsic::amdgcn_log_clamp:
14566 case Intrinsic::amdgcn_rcp:
14567 case Intrinsic::amdgcn_rcp_legacy:
14568 case Intrinsic::amdgcn_rsq:
14569 case Intrinsic::amdgcn_rsq_clamp:
14570 case Intrinsic::amdgcn_rsq_legacy:
14571 case Intrinsic::amdgcn_div_scale:
14572 case Intrinsic::amdgcn_div_fmas:
14573 case Intrinsic::amdgcn_div_fixup:
14574 case Intrinsic::amdgcn_fract:
14575 case Intrinsic::amdgcn_cvt_pkrtz:
14576 case Intrinsic::amdgcn_cubeid:
14577 case Intrinsic::amdgcn_cubema:
14578 case Intrinsic::amdgcn_cubesc:
14579 case Intrinsic::amdgcn_cubetc:
14580 case Intrinsic::amdgcn_frexp_mant:
14581 case Intrinsic::amdgcn_fdot2:
14582 case Intrinsic::amdgcn_trig_preop:
14583 case Intrinsic::amdgcn_tanh:
14584 return true;
14585 default:
14586 break;
14587 }
14588
14589 [[fallthrough]];
14590 default:
14591 return false;
14592 }
14593
14594 llvm_unreachable("invalid operation");
14595}
14596
14597// Constant fold canonicalize.
14598SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14599 const SDLoc &SL, EVT VT,
14600 const APFloat &C) const {
14601 // Flush denormals to 0 if not enabled.
14602 if (C.isDenormal()) {
14603 DenormalMode Mode =
14604 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14605 if (Mode == DenormalMode::getPreserveSign()) {
14606 return DAG.getConstantFP(
14607 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14608 }
14609
14610 if (Mode != DenormalMode::getIEEE())
14611 return SDValue();
14612 }
14613
14614 if (C.isNaN()) {
14615 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14616 if (C.isSignaling()) {
14617 // Quiet a signaling NaN.
14618 // FIXME: Is this supposed to preserve payload bits?
14619 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14620 }
14621
14622 // Make sure it is the canonical NaN bitpattern.
14623 //
14624 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14625 // immediate?
14626 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14627 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14628 }
14629
14630 // Already canonical.
14631 return DAG.getConstantFP(C, SL, VT);
14632}
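
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// A minimal standalone sketch of what getCanonicalConstantFP above does for
// f32 bit patterns, assuming PreserveSign denormal mode: denormals flush to a
// same-signed zero, and any NaN is replaced by the canonical quiet NaN
// pattern 0x7FC00000. This models raw IEEE-754 single-precision bits rather
// than the APFloat path used by the real code.
constexpr unsigned sketchCanonicalF32BitsPreserveSign(unsigned Bits) {
  const unsigned Sign = Bits & 0x80000000u;
  const unsigned Exp = (Bits >> 23) & 0xFFu;
  const unsigned Mantissa = Bits & 0x007FFFFFu;
  if (Exp == 0 && Mantissa != 0) // Denormal: flush to signed zero.
    return Sign;
  if (Exp == 0xFF && Mantissa != 0) // Any NaN: canonical quiet NaN.
    return 0x7FC00000u;
  return Bits; // Already canonical.
}
static_assert(sketchCanonicalF32BitsPreserveSign(0x80000001u) == 0x80000000u,
              "a negative denormal flushes to -0.0");
static_assert(sketchCanonicalF32BitsPreserveSign(0x7F800001u) == 0x7FC00000u,
              "a signaling NaN is replaced by the canonical quiet NaN");
static_assert(sketchCanonicalF32BitsPreserveSign(0x3F800000u) == 0x3F800000u,
              "1.0f is already canonical and passes through unchanged");
//===----------------------------------------------------------------------===//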
14633
14634static bool vectorEltWillFoldAway(SDValue Op) {
14635 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14636}
14637
14638SDValue
14639SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14640 DAGCombinerInfo &DCI) const {
14641 SelectionDAG &DAG = DCI.DAG;
14642 SDValue N0 = N->getOperand(0);
14643 EVT VT = N->getValueType(0);
14644
14645 // fcanonicalize undef -> qnan
14646 if (N0.isUndef()) {
14648 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14649 }
14650
14651 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14652 EVT VT = N->getValueType(0);
14653 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14654 }
14655
14656 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14657 // (fcanonicalize k)
14658 //
14659 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14660
14661 // TODO: This could be better with wider vectors that will be split to v2f16,
14662 // and to consider uses since there aren't that many packed operations.
14663 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14664 isTypeLegal(MVT::v2f16)) {
14665 SDLoc SL(N);
14666 SDValue NewElts[2];
14667 SDValue Lo = N0.getOperand(0);
14668 SDValue Hi = N0.getOperand(1);
14669 EVT EltVT = Lo.getValueType();
14670
14672 for (unsigned I = 0; I != 2; ++I) {
14673 SDValue Op = N0.getOperand(I);
14674 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14675 NewElts[I] =
14676 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14677 } else if (Op.isUndef()) {
14678 // Handled below based on what the other operand is.
14679 NewElts[I] = Op;
14680 } else {
14681 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14682 }
14683 }
14684
14685 // If one half is undef, and one is constant, prefer a splat vector rather
14686 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14687 // cheaper to use and may be free with a packed operation.
14688 if (NewElts[0].isUndef()) {
14690 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14691 ? NewElts[1]
14692 : DAG.getConstantFP(0.0f, SL, EltVT);
14693 }
14694
14695 if (NewElts[1].isUndef()) {
14696 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14697 ? NewElts[0]
14698 : DAG.getConstantFP(0.0f, SL, EltVT);
14699 }
14700
14701 return DAG.getBuildVector(VT, SL, NewElts);
14702 }
14703 }
14704
14705 return SDValue();
14706}
14707
14708static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14709 switch (Opc) {
14710 case ISD::FMAXNUM:
14711 case ISD::FMAXNUM_IEEE:
14712 case ISD::FMAXIMUMNUM:
14713 return AMDGPUISD::FMAX3;
14714 case ISD::FMAXIMUM:
14715 return AMDGPUISD::FMAXIMUM3;
14716 case ISD::SMAX:
14717 return AMDGPUISD::SMAX3;
14718 case ISD::UMAX:
14719 return AMDGPUISD::UMAX3;
14720 case ISD::FMINNUM:
14721 case ISD::FMINNUM_IEEE:
14722 case ISD::FMINIMUMNUM:
14723 return AMDGPUISD::FMIN3;
14724 case ISD::FMINIMUM:
14725 return AMDGPUISD::FMINIMUM3;
14726 case ISD::SMIN:
14727 return AMDGPUISD::SMIN3;
14728 case ISD::UMIN:
14729 return AMDGPUISD::UMIN3;
14730 default:
14731 llvm_unreachable("Not a min/max opcode");
14732 }
14733}
14734
14735SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14736 const SDLoc &SL, SDValue Src,
14737 SDValue MinVal,
14738 SDValue MaxVal,
14739 bool Signed) const {
14740
14741 // med3 comes from
14742 // min(max(x, K0), K1), K0 < K1
14743 // max(min(x, K0), K1), K1 < K0
14744 //
14745 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14746 // min/max op.
14747 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14748 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14749
14750 if (!MinK || !MaxK)
14751 return SDValue();
14752
14753 if (Signed) {
14754 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14755 return SDValue();
14756 } else {
14757 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14758 return SDValue();
14759 }
14760
14761 EVT VT = MinK->getValueType(0);
14762 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14763 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14764 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14765
14766 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14767 // not available, but this is unlikely to be profitable as constants
14768 // will often need to be materialized & extended, especially on
14769 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14770 return SDValue();
14771}
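
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// The identity behind performIntMed3ImmCombine, shown on plain ints: for
// K0 < K1, min(max(x, K0), K1) is the median of {x, K0, K1}, which is what
// the med3 node formed above computes. The helpers are hypothetical and only
// model the arithmetic, not the SelectionDAG nodes.
constexpr int sketchMin(int A, int B) { return A < B ? A : B; }
constexpr int sketchMax(int A, int B) { return A < B ? B : A; }
constexpr int sketchMed3(int A, int B, int C) {
  // Median of three: drop the smallest and the largest of the three values.
  return sketchMax(sketchMin(A, B), sketchMin(sketchMax(A, B), C));
}
static_assert(sketchMed3(-5, 0, 10) == sketchMin(sketchMax(-5, 0), 10),
              "a value below the range clamps to K0");
static_assert(sketchMed3(42, 0, 10) == sketchMin(sketchMax(42, 0), 10),
              "a value above the range clamps to K1");
static_assert(sketchMed3(7, 0, 10) == 7, "an in-range value is unchanged");
//===----------------------------------------------------------------------===//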
14772
14773static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14774 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14775 return C;
14776
14777 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14778 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14779 return C;
14780 }
14781
14782 return nullptr;
14783}
14784
14785SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14786 const SDLoc &SL, SDValue Op0,
14787 SDValue Op1) const {
14788 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14789 if (!K1)
14790 return SDValue();
14791
14792 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14793 if (!K0)
14794 return SDValue();
14795
14796 // Ordered >= (although NaN inputs should have folded away by now).
14797 if (K0->getValueAPF() > K1->getValueAPF())
14798 return SDValue();
14799
14800 // med3 with a nan input acts like
14801 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14802 //
14803 // So the result depends on whether the IEEE mode bit is enabled or not with a
14804 // signaling nan input.
14805 // ieee=1
14806 // s0 snan: yields s2
14807 // s1 snan: yields s2
14808 // s2 snan: qnan
14809
14810 // s0 qnan: min(s1, s2)
14811 // s1 qnan: min(s0, s2)
14812 // s2 qnan: min(s0, s1)
14813
14814 // ieee=0
14815 // s0 snan: min(s1, s2)
14816 // s1 snan: min(s0, s2)
14817 // s2 snan: qnan
14818
14819 // s0 qnan: min(s1, s2)
14820 // s1 qnan: min(s0, s2)
14821 // s2 qnan: min(s0, s1)
14822 const MachineFunction &MF = DAG.getMachineFunction();
14823 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14824
14825 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
14826 // regardless of whether the input is a signaling nan if op0 is fmaximum or
14827 // fmaximumnum. We can only form it from fmaxnum_ieee if IEEE=1.
14828 EVT VT = Op0.getValueType();
14829 if (Info->getMode().DX10Clamp) {
14830 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14831 // hardware fmed3 behavior converting to a min.
14832 // FIXME: Should this be allowing -0.0?
14833 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14834 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14835 }
14836
14837 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14838 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14839 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14840 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14841 // then give the other result, which is different from med3 with a NaN
14842 // input.
14843 SDValue Var = Op0.getOperand(0);
14844 if (!DAG.isKnownNeverSNaN(Var))
14845 return SDValue();
14846
14847 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14848
14849 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14850 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14851 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14852 SDValue(K0, 0), SDValue(K1, 0));
14853 }
14854 }
14855
14856 return SDValue();
14857}
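
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// A minimal model of the K0 == 0.0 / K1 == 1.0 special case above: for
// ordinary (non-NaN) inputs, min(max(x, 0.0), 1.0) is exactly a clamp to
// [0.0, 1.0], which is why it can be emitted as AMDGPUISD::CLAMP when
// dx10_clamp is enabled. The NaN-clamps-to-0.0 behaviour is a hardware
// property that this sketch does not model.
constexpr float sketchClamp01(float X) {
  const float Lo = X > 0.0f ? X : 0.0f; // max(x, 0.0)
  return Lo < 1.0f ? Lo : 1.0f;         // min(max(x, 0.0), 1.0)
}
static_assert(sketchClamp01(-2.5f) == 0.0f, "values below 0.0 clamp to 0.0");
static_assert(sketchClamp01(0.25f) == 0.25f, "in-range values pass through");
static_assert(sketchClamp01(3.0f) == 1.0f, "values above 1.0 clamp to 1.0");
//===----------------------------------------------------------------------===//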
14858
14859/// \return true if the subtarget supports minimum3 and maximum3 with the given
14860/// base min/max opcode \p Opc for type \p VT.
14861static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14862 EVT VT) {
14863 switch (Opc) {
14864 case ISD::FMINNUM:
14865 case ISD::FMAXNUM:
14866 case ISD::FMINNUM_IEEE:
14867 case ISD::FMAXNUM_IEEE:
14868 case ISD::FMINIMUMNUM:
14869 case ISD::FMAXIMUMNUM:
14870 case AMDGPUISD::FMIN_LEGACY:
14871 case AMDGPUISD::FMAX_LEGACY:
14872 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14873 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14874 case ISD::FMINIMUM:
14875 case ISD::FMAXIMUM:
14876 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14877 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14878 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14879 case ISD::SMAX:
14880 case ISD::SMIN:
14881 case ISD::UMAX:
14882 case ISD::UMIN:
14883 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14884 default:
14885 return false;
14886 }
14887
14888 llvm_unreachable("not a min/max opcode");
14889}
14890
14891SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14892 DAGCombinerInfo &DCI) const {
14893 SelectionDAG &DAG = DCI.DAG;
14894
14895 EVT VT = N->getValueType(0);
14896 unsigned Opc = N->getOpcode();
14897 SDValue Op0 = N->getOperand(0);
14898 SDValue Op1 = N->getOperand(1);
14899
14900 // Only do this if the inner op has one use, since otherwise this just
14901 // increases register pressure for no benefit.
14902
14903 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14904 // max(max(a, b), c) -> max3(a, b, c)
14905 // min(min(a, b), c) -> min3(a, b, c)
14906 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14907 SDLoc DL(N);
14908 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14909 Op0.getOperand(0), Op0.getOperand(1), Op1);
14910 }
14911
14912 // Try commuted.
14913 // max(a, max(b, c)) -> max3(a, b, c)
14914 // min(a, min(b, c)) -> min3(a, b, c)
14915 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14916 SDLoc DL(N);
14917 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14918 Op0, Op1.getOperand(0), Op1.getOperand(1));
14919 }
14920 }
14921
14922 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14923 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14924 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14925 if (SDValue Med3 = performIntMed3ImmCombine(
14926 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14927 return Med3;
14928 }
14929 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14930 if (SDValue Med3 = performIntMed3ImmCombine(
14931 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14932 return Med3;
14933 }
14934
14935 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14936 if (SDValue Med3 = performIntMed3ImmCombine(
14937 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14938 return Med3;
14939 }
14940 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14941 if (SDValue Med3 = performIntMed3ImmCombine(
14942 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14943 return Med3;
14944 }
14945
14946 // if !is_snan(x):
14947 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14948 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14949 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14950 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14951 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14952 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14953 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14954 (Opc == AMDGPUISD::FMIN_LEGACY &&
14955 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14956 (VT == MVT::f32 || VT == MVT::f64 ||
14957 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14958 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14959 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14960 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14961 Op0.hasOneUse()) {
14962 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14963 return Res;
14964 }
14965
14966 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14967 // for some types, but at a higher cost since it's implemented with a 3
14968 // operand form.
14969 const SDNodeFlags Flags = N->getFlags();
14970 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14971 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14972 unsigned NewOpc =
14973 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14974 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14975 }
14976
14977 return SDValue();
14978}
14979
14980static bool isClampZeroToOne(SDValue A, SDValue B) {
14981 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14982 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14983 // FIXME: Should this be allowing -0.0?
14984 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14985 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14986 }
14987 }
14988
14989 return false;
14990}
14991
14992// FIXME: Should only worry about snans for version with chain.
14993SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14994 DAGCombinerInfo &DCI) const {
14995 EVT VT = N->getValueType(0);
14996 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14997 // NaNs. With a NaN input, the order of the operands may change the result.
14998
14999 SelectionDAG &DAG = DCI.DAG;
15000 SDLoc SL(N);
15001
15002 SDValue Src0 = N->getOperand(0);
15003 SDValue Src1 = N->getOperand(1);
15004 SDValue Src2 = N->getOperand(2);
15005
15006 if (isClampZeroToOne(Src0, Src1)) {
15007 // const_a, const_b, x -> clamp is safe in all cases including signaling
15008 // nans.
15009 // FIXME: Should this be allowing -0.0?
15010 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15011 }
15012
15013 const MachineFunction &MF = DAG.getMachineFunction();
15014 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15015
15016 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15017 // handling no dx10-clamp?
15018 if (Info->getMode().DX10Clamp) {
15019 // If NaNs are clamped to 0, we are free to reorder the inputs.
15020
15021 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15022 std::swap(Src0, Src1);
15023
15024 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15025 std::swap(Src1, Src2);
15026
15027 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15028 std::swap(Src0, Src1);
15029
15030 if (isClampZeroToOne(Src1, Src2))
15031 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15032 }
15033
15034 return SDValue();
15035}
15036
15037SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15038 DAGCombinerInfo &DCI) const {
15039 SDValue Src0 = N->getOperand(0);
15040 SDValue Src1 = N->getOperand(1);
15041 if (Src0.isUndef() && Src1.isUndef())
15042 return DCI.DAG.getUNDEF(N->getValueType(0));
15043 return SDValue();
15044}
15045
15046// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15047// expanded into a set of cmp/select instructions.
15048static bool shouldExpandVectorDynExt(unsigned EltSize,
15049 unsigned NumElem,
15050 bool IsDivergentIdx,
15051 const GCNSubtarget *Subtarget) {
15052 if (UseDivergentRegisterIndexing)
15053 return false;
15054
15055 unsigned VecSize = EltSize * NumElem;
15056
15057 // Sub-dword vectors of 2 dwords or less have a better implementation.
15058 if (VecSize <= 64 && EltSize < 32)
15059 return false;
15060
15061 // Always expand the rest of the sub-dword instructions; otherwise they will
15062 // be lowered via memory.
15063 if (EltSize < 32)
15064 return true;
15065
15066 // Always do this if var-idx is divergent, otherwise it will become a loop.
15067 if (IsDivergentIdx)
15068 return true;
15069
15070 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15071 unsigned NumInsts = NumElem /* Number of compares */ +
15072 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15073
15074 // On some architectures (GFX9) movrel is not available and it's better
15075 // to expand.
15076 if (Subtarget->useVGPRIndexMode())
15077 return NumInsts <= 16;
15078
15079 // If movrel is available, use it instead of expanding for vector of 8
15080 // elements.
15081 if (Subtarget->hasMovrel())
15082 return NumInsts <= 15;
15083
15084 return true;
15085}
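
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// The cost model above in standalone form: expanding a dynamic-index vector
// access costs one compare per element plus one v_cndmask_b32 per 32-bit word
// per element. The helper is hypothetical and mirrors the NumInsts formula.
constexpr unsigned sketchDynExtInsts(unsigned EltSizeBits, unsigned NumElem) {
  const unsigned Compares = NumElem;
  const unsigned CndMasks = ((EltSizeBits + 31) / 32) * NumElem;
  return Compares + CndMasks;
}
static_assert(sketchDynExtInsts(32, 8) == 16,
              "v8i32 expansion: 8 compares plus 8 cndmasks");
static_assert(sketchDynExtInsts(64, 4) == 12,
              "v4i64 expansion: 4 compares plus 2 cndmasks per element");
//===----------------------------------------------------------------------===//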
15086
15087bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15088 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15089 if (isa<ConstantSDNode>(Idx))
15090 return false;
15091
15092 SDValue Vec = N->getOperand(0);
15093 EVT VecVT = Vec.getValueType();
15094 EVT EltVT = VecVT.getVectorElementType();
15095 unsigned EltSize = EltVT.getSizeInBits();
15096 unsigned NumElem = VecVT.getVectorNumElements();
15097
15098 return ::shouldExpandVectorDynExt(
15099 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15100}
15101
15102SDValue
15103SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15104 DAGCombinerInfo &DCI) const {
15105 SDValue Vec = N->getOperand(0);
15106 SelectionDAG &DAG = DCI.DAG;
15107
15108 EVT VecVT = Vec.getValueType();
15109 EVT VecEltVT = VecVT.getVectorElementType();
15110 EVT ResVT = N->getValueType(0);
15111
15112 unsigned VecSize = VecVT.getSizeInBits();
15113 unsigned VecEltSize = VecEltVT.getSizeInBits();
15114
15115 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15117 SDLoc SL(N);
15118 SDValue Idx = N->getOperand(1);
15119 SDValue Elt =
15120 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15121 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15122 }
15123
15124 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15125 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15126 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15127 // depending on the shift operand. See e.g. performSraCombine().
15128 // This combine ensures that the optimisation is compatible with v2i32
15129 // legalised AND.
15130 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15131 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15132
15134 if (!C || C->getZExtValue() != 0x1f)
15135 return SDValue();
15136
15137 SDLoc SL(N);
15138 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15139 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15140 Vec->getOperand(0), N->getOperand(1));
15141 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15142 DAG.ReplaceAllUsesWith(N, A.getNode());
15143 }
15144
15145 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15146 // =>
15147 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15148 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15149 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15150 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15151 SDLoc SL(N);
15152 SDValue Idx = N->getOperand(1);
15153 unsigned Opc = Vec.getOpcode();
15154
15155 switch (Opc) {
15156 default:
15157 break;
15158 // TODO: Support other binary operations.
15159 case ISD::FADD:
15160 case ISD::FSUB:
15161 case ISD::FMUL:
15162 case ISD::ADD:
15163 case ISD::UMIN:
15164 case ISD::UMAX:
15165 case ISD::SMIN:
15166 case ISD::SMAX:
15167 case ISD::FMAXNUM:
15168 case ISD::FMINNUM:
15169 case ISD::FMAXNUM_IEEE:
15170 case ISD::FMINNUM_IEEE:
15171 case ISD::FMAXIMUM:
15172 case ISD::FMINIMUM: {
15173 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15174 Vec.getOperand(0), Idx);
15175 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15176 Vec.getOperand(1), Idx);
15177
15178 DCI.AddToWorklist(Elt0.getNode());
15179 DCI.AddToWorklist(Elt1.getNode());
15180 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15181 }
15182 }
15183 }
15184
15185 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15186 if (shouldExpandVectorDynExt(N)) {
15187 SDLoc SL(N);
15188 SDValue Idx = N->getOperand(1);
15189 SDValue V;
15190 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15191 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15192 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15193 if (I == 0)
15194 V = Elt;
15195 else
15196 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15197 }
15198 return V;
15199 }
15200
15201 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
15202 // =>
15203 // i32:Lo(k) if Idx == 0, or
15204 // i32:Hi(k) if Idx == 1
15205 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15206 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
15207 SDLoc SL(N);
15208 SDValue PeekThrough = Vec.getOperand(0);
15209 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
15210 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
15211 uint64_t KImmValue = KImm->getZExtValue();
15212 return DAG.getConstant(
15213 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
15214 }
15215 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
15216 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
15217 uint64_t KFPImmValue =
15218 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
15219 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
15220 0xffffffff,
15221 SL, MVT::i32);
15222 }
15223 }
15224
15225 if (!DCI.isBeforeLegalize())
15226 return SDValue();
15227
15228 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15229 // elements. This exposes more load reduction opportunities by replacing
15230 // multiple small extract_vector_elements with a single 32-bit extract.
15231 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15232 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15233 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15234
15235 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15236 unsigned EltIdx = BitIndex / 32;
15237 unsigned LeftoverBitIdx = BitIndex % 32;
15238 SDLoc SL(N);
15239
15240 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15241 DCI.AddToWorklist(Cast.getNode());
15242
15243 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15244 DAG.getConstant(EltIdx, SL, MVT::i32));
15245 DCI.AddToWorklist(Elt.getNode());
15246 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15247 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15248 DCI.AddToWorklist(Srl.getNode());
15249
15250 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15251 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15252 DCI.AddToWorklist(Trunc.getNode());
15253
15254 if (VecEltVT == ResVT) {
15255 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15256 }
15257
15258 assert(ResVT.isScalarInteger());
15259 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15260 }
15261
15262 return SDValue();
15263}
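
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// The constant-folding step for EXTRACT_VECTOR_ELT (v2i32 bitcast i64:k, Idx)
// in isolation: element 0 is Lo(k) and element 1 is Hi(k), exactly the
// (k >> (32 * Idx)) & 0xffffffff computation performed above.
constexpr unsigned sketchExtractDword(unsigned long long K, unsigned Idx) {
  return static_cast<unsigned>((K >> (32 * Idx)) & 0xFFFFFFFFu);
}
static_assert(sketchExtractDword(0x1122334455667788ULL, 0) == 0x55667788u,
              "index 0 selects the low 32 bits");
static_assert(sketchExtractDword(0x1122334455667788ULL, 1) == 0x11223344u,
              "index 1 selects the high 32 bits");
//===----------------------------------------------------------------------===//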
15264
15265SDValue
15266SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15267 DAGCombinerInfo &DCI) const {
15268 SDValue Vec = N->getOperand(0);
15269 SDValue Idx = N->getOperand(2);
15270 EVT VecVT = Vec.getValueType();
15271 EVT EltVT = VecVT.getVectorElementType();
15272
15273 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15274 // => BUILD_VECTOR n x select (e, const-idx)
15275 if (!shouldExpandVectorDynExt(N))
15276 return SDValue();
15277
15278 SelectionDAG &DAG = DCI.DAG;
15279 SDLoc SL(N);
15280 SDValue Ins = N->getOperand(1);
15281 EVT IdxVT = Idx.getValueType();
15282
15283 SmallVector<SDValue, 16> Ops;
15284 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15285 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15286 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15287 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15288 Ops.push_back(V);
15289 }
15290
15291 return DAG.getBuildVector(VecVT, SL, Ops);
15292}
15293
15294/// Return the source of an fp_extend from f16 to f32, or a converted FP
15295/// constant.
15297 if (Src.getOpcode() == ISD::FP_EXTEND &&
15298 Src.getOperand(0).getValueType() == MVT::f16) {
15299 return Src.getOperand(0);
15300 }
15301
15302 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15303 APFloat Val = CFP->getValueAPF();
15304 bool LosesInfo = true;
15305 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15306 if (!LosesInfo)
15307 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15308 }
15309
15310 return SDValue();
15311}
15312
15313SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15314 DAGCombinerInfo &DCI) const {
15315 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15316 "combine only useful on gfx8");
15317
15318 SDValue TruncSrc = N->getOperand(0);
15319 EVT VT = N->getValueType(0);
15320 if (VT != MVT::f16)
15321 return SDValue();
15322
15323 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15324 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15325 return SDValue();
15326
15327 SelectionDAG &DAG = DCI.DAG;
15328 SDLoc SL(N);
15329
15330 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15331 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15332 // casting back.
15333
15334 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15335 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15336 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15337 if (!A)
15338 return SDValue();
15339
15340 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15341 if (!B)
15342 return SDValue();
15343
15344 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15345 if (!C)
15346 return SDValue();
15347
15348 // This changes signaling nan behavior. If an input is a signaling nan, it
15349 // would have been quieted by the fpext originally. We don't care because
15350 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15351 // we would be worse off than just doing the promotion.
15352 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15353 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15354 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15355 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15356}
15357
15358unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15359 const SDNode *N0,
15360 const SDNode *N1) const {
15361 EVT VT = N0->getValueType(0);
15362
15363 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15364 // support denormals ever.
15365 if (((VT == MVT::f32 &&
15366 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15367 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15368 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15369 isOperationLegal(ISD::FMAD, VT))
15370 return ISD::FMAD;
15371
15372 const TargetOptions &Options = DAG.getTarget().Options;
15373 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15374 (N0->getFlags().hasAllowContract() &&
15375 N1->getFlags().hasAllowContract())) &&
15376 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15377 return ISD::FMA;
15378 }
15379
15380 return 0;
15381}
15382
15383// For a reassociatable opcode perform:
15384// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15385SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15386 SelectionDAG &DAG) const {
15387 EVT VT = N->getValueType(0);
15388 if (VT != MVT::i32 && VT != MVT::i64)
15389 return SDValue();
15390
15391 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15392 return SDValue();
15393
15394 unsigned Opc = N->getOpcode();
15395 SDValue Op0 = N->getOperand(0);
15396 SDValue Op1 = N->getOperand(1);
15397
15398 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15399 return SDValue();
15400
15401 if (Op0->isDivergent())
15402 std::swap(Op0, Op1);
15403
15404 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15405 return SDValue();
15406
15407 SDValue Op2 = Op1.getOperand(1);
15408 Op1 = Op1.getOperand(0);
15409 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15410 return SDValue();
15411
15412 if (Op1->isDivergent())
15413 std::swap(Op1, Op2);
15414
15415 SDLoc SL(N);
15416 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15417 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15418}
15419
15420static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15421 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15423 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15424 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15425 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15426}
15427
15428// Fold
15429// y = lshr i64 x, 32
15430// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15431// with Const.hi == -1
15432// To
15433// res = mad_u64_u32 y.lo, Const.lo, x.lo
15434static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15435 SDValue MulLHS, SDValue MulRHS,
15436 SDValue AddRHS) {
15437 if (MulRHS.getOpcode() == ISD::SRL)
15438 std::swap(MulLHS, MulRHS);
15439
15440 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15441 return SDValue();
15442
15443 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15444 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15445 MulLHS.getOperand(0) != AddRHS)
15446 return SDValue();
15447
15449 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15450 return SDValue();
15451
15452 SDValue ConstMul =
15453 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15454 return getMad64_32(DAG, SL, MVT::i64,
15455 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15456 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15457}
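
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// Why the fold above is valid, working modulo 2^64: with Const.hi == -1 we
// can write Const = (2^64 - 2^32) + Const.lo and x = x.hi * 2^32 + x.lo, so
//   (x >> 32) * Const + x
//     = x.hi * Const.lo - x.hi * 2^32 + x.hi * 2^32 + x.lo   (mod 2^64)
//     = x.hi * Const.lo + x.lo,
// which is exactly mad_u64_u32(x.hi, Const.lo, x.lo). A standalone check of
// the identity with a hypothetical helper:
constexpr bool sketchMadWithSrlHolds(unsigned long long X, unsigned CLo) {
  const unsigned long long Const = 0xFFFFFFFF00000000ULL | CLo;
  const unsigned long long Original = (X >> 32) * Const + X; // wraps mod 2^64
  const unsigned long long Folded =
      static_cast<unsigned long long>(static_cast<unsigned>(X >> 32)) * CLo +
      (X & 0xFFFFFFFFULL);
  return Original == Folded;
}
static_assert(sketchMadWithSrlHolds(0x123456789ABCDEF0ULL, 0xDEADBEEFu),
              "the mad_u64_u32 rewrite agrees with the original expression");
static_assert(sketchMadWithSrlHolds(0xFFFFFFFFFFFFFFFFULL, 7u),
              "the identity also holds at the wrap-around boundary");
//===----------------------------------------------------------------------===//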
15458
15459// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15460// multiplies, if any.
15461//
15462// Full 64-bit multiplies that feed into an addition are lowered here instead
15463// of using the generic expansion. The generic expansion ends up with
15464// a tree of ADD nodes that prevents us from using the "add" part of the
15465// MAD instruction. The expansion produced here results in a chain of ADDs
15466// instead of a tree.
15467SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15468 DAGCombinerInfo &DCI) const {
15469 assert(N->isAnyAdd());
15470
15471 SelectionDAG &DAG = DCI.DAG;
15472 EVT VT = N->getValueType(0);
15473 SDLoc SL(N);
15474 SDValue LHS = N->getOperand(0);
15475 SDValue RHS = N->getOperand(1);
15476
15477 if (VT.isVector())
15478 return SDValue();
15479
15480 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15481 // result in scalar registers for uniform values.
15482 if (!N->isDivergent() && Subtarget->hasSMulHi())
15483 return SDValue();
15484
15485 unsigned NumBits = VT.getScalarSizeInBits();
15486 if (NumBits <= 32 || NumBits > 64)
15487 return SDValue();
15488
15489 if (LHS.getOpcode() != ISD::MUL) {
15490 assert(RHS.getOpcode() == ISD::MUL);
15491 std::swap(LHS, RHS);
15492 }
15493
15494 // Avoid the fold if it would unduly increase the number of multiplies due to
15495 // multiple uses, except on hardware with full-rate multiply-add (which is
15496 // part of full-rate 64-bit ops).
15497 if (!Subtarget->hasFullRate64Ops()) {
15498 unsigned NumUsers = 0;
15499 for (SDNode *User : LHS->users()) {
15500 // There is a use that does not feed into addition, so the multiply can't
15501 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15502 if (!User->isAnyAdd())
15503 return SDValue();
15504
15505 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15506 // MUL + 3xADD + 3xADDC over 3xMAD.
15507 ++NumUsers;
15508 if (NumUsers >= 3)
15509 return SDValue();
15510 }
15511 }
15512
15513 SDValue MulLHS = LHS.getOperand(0);
15514 SDValue MulRHS = LHS.getOperand(1);
15515 SDValue AddRHS = RHS;
15516
15517 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15518 return FoldedMAD;
15519
15520 // Always check whether operands are small unsigned values, since that
15521 // knowledge is useful in more cases. Check for small signed values only if
15522 // doing so can unlock a shorter code sequence.
15523 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15524 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15525
15526 bool MulSignedLo = false;
15527 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15528 MulSignedLo =
15529 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15530 }
15531
15532 // The operands and final result all have the same number of bits. If
15533 // operands need to be extended, they can be extended with garbage. The
15534 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15535 // truncated away in the end.
15536 if (VT != MVT::i64) {
15537 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15538 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15539 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15540 }
15541
15542 // The basic code generated is conceptually straightforward. Pseudo code:
15543 //
15544 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15545 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15546 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15547 //
15548 // The second and third lines are optional, depending on whether the factors
15549 // are {sign,zero}-extended or not.
15550 //
15551 // The actual DAG is noisier than the pseudo code, but only due to
15552 // instructions that disassemble values into low and high parts, and
15553 // assemble the final result.
15554 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15555
15556 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15557 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15558 SDValue Accum =
15559 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15560
15561 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15562 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15563
15564 if (!MulLHSUnsigned32) {
15565 auto MulLHSHi =
15566 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15567 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15568 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15569 }
15570
15571 if (!MulRHSUnsigned32) {
15572 auto MulRHSHi =
15573 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15574 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15575 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15576 }
15577
15578 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15579 Accum = DAG.getBitcast(MVT::i64, Accum);
15580 }
15581
15582 if (VT != MVT::i64)
15583 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15584 return Accum;
15585}
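
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// A standalone check of the pseudo code in the comment above: a full 64-bit
// multiply-add can be assembled from one mad_u64_u32 on the low halves plus
// the two 32-bit cross products added into the high dword; the lhs.hi*rhs.hi
// term only affects bits >= 64 and is dropped. The helper is hypothetical.
constexpr unsigned long long sketchMulAdd64(unsigned long long A,
                                            unsigned long long B,
                                            unsigned long long C) {
  const unsigned ALo = static_cast<unsigned>(A);
  const unsigned AHi = static_cast<unsigned>(A >> 32);
  const unsigned BLo = static_cast<unsigned>(B);
  const unsigned BHi = static_cast<unsigned>(B >> 32);
  // accum = mad_u64_u32 lhs.lo, rhs.lo, accum
  const unsigned long long Accum =
      static_cast<unsigned long long>(ALo) * BLo + C;
  // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  const unsigned AccumHi =
      static_cast<unsigned>(Accum >> 32) + AHi * BLo + ALo * BHi;
  return (static_cast<unsigned long long>(AccumHi) << 32) |
         static_cast<unsigned>(Accum);
}
static_assert(sketchMulAdd64(0x123456789ULL, 0xFEDCBA98ULL, 0x42ULL) ==
                  0x123456789ULL * 0xFEDCBA98ULL + 0x42ULL,
              "the decomposed form matches the full 64-bit multiply-add");
//===----------------------------------------------------------------------===//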
15586
15587SDValue
15588SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15589 DAGCombinerInfo &DCI) const {
15590 SDValue RHS = N->getOperand(1);
15591 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15592 if (!CRHS)
15593 return SDValue();
15594
15595 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15596 // common.
15597 uint64_t Val = CRHS->getZExtValue();
15598 if (countr_zero(Val) >= 32) {
15599 SelectionDAG &DAG = DCI.DAG;
15600 SDLoc SL(N);
15601 SDValue LHS = N->getOperand(0);
15602
15603 // Avoid carry machinery if we know the low half of the add does not
15604 // contribute to the final result.
15605 //
15606 // add i64:x, K if computeTrailingZeros(K) >= 32
15607 // => build_pair (add x.hi, K.hi), x.lo
15608
15609 // Breaking the 64-bit add here with this strange constant is unlikely
15610 // to interfere with addressing mode patterns.
15611
15612 SDValue Hi = getHiHalf64(LHS, DAG);
15613 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15614 unsigned Opcode = N->getOpcode();
15615 if (Opcode == ISD::PTRADD)
15616 Opcode = ISD::ADD;
15617 SDValue AddHi =
15618 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15619
15620 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15621 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15622 }
15623
15624 return SDValue();
15625}
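
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// The observation behind foldAddSub64WithZeroLowBitsTo32, for the add case:
// when the low 32 bits of the constant are zero, the low half of the 64-bit
// addition can never carry, so the result is build_pair(x.hi + K.hi, x.lo).
// Standalone check with a hypothetical helper:
constexpr bool sketchZeroLowAddHolds(unsigned long long X, unsigned KHi) {
  const unsigned long long K = static_cast<unsigned long long>(KHi) << 32;
  const unsigned NewHi = static_cast<unsigned>(X >> 32) + KHi;
  const unsigned long long Split =
      (static_cast<unsigned long long>(NewHi) << 32) |
      static_cast<unsigned>(X);
  return X + K == Split;
}
static_assert(sketchZeroLowAddHolds(0x00000001FFFFFFFFULL, 0x7Fu),
              "the low half is unchanged; only the high half is added");
static_assert(sketchZeroLowAddHolds(0xFFFFFFFF00000000ULL, 0xFFFFFFFFu),
              "wrap-around in the high half matches 64-bit wrap-around");
//===----------------------------------------------------------------------===//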
15626
15627// Collect the ultimate src of each of the mul node's operands, and confirm
15628// each operand is only 8 bits wide.
15629static std::optional<ByteProvider<SDValue>>
15630handleMulOperand(const SDValue &MulOperand) {
15631 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15632 if (!Byte0 || Byte0->isConstantZero()) {
15633 return std::nullopt;
15634 }
15635 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15636 if (Byte1 && !Byte1->isConstantZero()) {
15637 return std::nullopt;
15638 }
15639 return Byte0;
15640}
15641
15642static unsigned addPermMasks(unsigned First, unsigned Second) {
15643 unsigned FirstCs = First & 0x0c0c0c0c;
15644 unsigned SecondCs = Second & 0x0c0c0c0c;
15645 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15646 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15647
15648 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15649 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15650 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15651 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15652
15653 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15654}
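
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// Concrete example for addPermMasks: in a v_perm byte selector, 0x0c means
// "write 0x00". Under the invariant asserted above (every byte position is
// 0x0c in at least one of the two masks), merging keeps each active selector
// and leaves 0x0c only where both masks had it. This is a standalone copy of
// the same bit manipulation, checked on one example.
constexpr unsigned sketchAddPermMasks(unsigned First, unsigned Second) {
  const unsigned FirstCs = First & 0x0c0c0c0c;
  const unsigned SecondCs = Second & 0x0c0c0c0c;
  const unsigned FirstNoCs = First & ~0x0c0c0c0c;
  const unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
// First selects byte 1 into dst byte 3; Second selects byte 5 into dst byte 2.
static_assert(sketchAddPermMasks(0x010c0c0c, 0x0c050c0c) == 0x01050c0c,
              "active byte selectors from both masks are preserved");
//===----------------------------------------------------------------------===//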
15655
15656struct DotSrc {
15657 SDValue SrcOp;
15658 int64_t PermMask;
15659 int64_t DWordOffset;
15660};
15661
15662static void placeSources(ByteProvider<SDValue> &Src0,
15663 ByteProvider<SDValue> &Src1,
15664 SmallVectorImpl<DotSrc> &Src0s,
15665 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15666
15667 assert(Src0.Src.has_value() && Src1.Src.has_value());
15668 // Src0s and Src1s are empty, just place arbitrarily.
15669 if (Step == 0) {
15670 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15671 Src0.SrcOffset / 4});
15672 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15673 Src1.SrcOffset / 4});
15674 return;
15675 }
15676
15677 for (int BPI = 0; BPI < 2; BPI++) {
15678 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15679 if (BPI == 1) {
15680 BPP = {Src1, Src0};
15681 }
15682 unsigned ZeroMask = 0x0c0c0c0c;
15683 unsigned FMask = 0xFF << (8 * (3 - Step));
15684
15685 unsigned FirstMask =
15686 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15687 unsigned SecondMask =
15688 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15689 // Attempt to find a Src vector which contains our SDValue; if found, add our
15690 // perm mask to the existing one. If we are unable to find a match for the
15691 // first SDValue, attempt to find a match for the second.
15692 int FirstGroup = -1;
15693 for (int I = 0; I < 2; I++) {
15694 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15695 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15696 return IterElt.SrcOp == *BPP.first.Src &&
15697 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15698 };
15699
15700 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15701 if (Match != Srcs.end()) {
15702 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15703 FirstGroup = I;
15704 break;
15705 }
15706 }
15707 if (FirstGroup != -1) {
15708 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15709 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15710 return IterElt.SrcOp == *BPP.second.Src &&
15711 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15712 };
15713 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15714 if (Match != Srcs.end()) {
15715 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15716 } else
15717 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15718 return;
15719 }
15720 }
15721
15722 // If we have made it here, then we could not find a match in Src0s or Src1s
15723 // for either Src0 or Src1, so just place them arbitrarily.
15724
15725 unsigned ZeroMask = 0x0c0c0c0c;
15726 unsigned FMask = 0xFF << (8 * (3 - Step));
15727
15728 Src0s.push_back(
15729 {*Src0.Src,
15730 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15731 Src0.SrcOffset / 4});
15732 Src1s.push_back(
15733 {*Src1.Src,
15734 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15735 Src1.SrcOffset / 4});
15736}
15737
15738static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15739 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15740 bool IsAny) {
15741
15742 // If we just have one source, just permute it accordingly.
15743 if (Srcs.size() == 1) {
15744 auto *Elt = Srcs.begin();
15745 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15746
15747 // v_perm will produce the original value
15748 if (Elt->PermMask == 0x3020100)
15749 return EltOp;
15750
15751 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15752 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15753 }
15754
15755 auto *FirstElt = Srcs.begin();
15756 auto *SecondElt = std::next(FirstElt);
15757
15758 SmallVector<SDValue, 2> Perms;
15759
15760 // If we have multiple sources in the chain, combine them via perms (using
15761 // calculated perm mask) and Ors.
15762 while (true) {
15763 auto FirstMask = FirstElt->PermMask;
15764 auto SecondMask = SecondElt->PermMask;
15765
15766 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15767 unsigned FirstPlusFour = FirstMask | 0x04040404;
15768 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
15769 // original 0x0C.
15770 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15771
15772 auto PermMask = addPermMasks(FirstMask, SecondMask);
15773 auto FirstVal =
15774 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15775 auto SecondVal =
15776 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15777
15778 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15779 SecondVal,
15780 DAG.getConstant(PermMask, SL, MVT::i32)));
15781
15782 FirstElt = std::next(SecondElt);
15783 if (FirstElt == Srcs.end())
15784 break;
15785
15786 SecondElt = std::next(FirstElt);
15787 // If we only have a FirstElt, then just combine that into the cumulative
15788 // source node.
15789 if (SecondElt == Srcs.end()) {
15790 auto EltOp =
15791 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15792
15793 Perms.push_back(
15794 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15795 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15796 break;
15797 }
15798 }
15799
15800 assert(Perms.size() == 1 || Perms.size() == 2);
15801 return Perms.size() == 2
15802 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15803 : Perms[0];
15804}
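
//===----------------------------------------------------------------------===//
// [Editor's illustration, not part of the original LLVM source.]
// What the FirstPlusFour adjustment in resolveSources achieves on one
// concrete mask: active byte selectors 0-3 are rebased to 4-7 so that the two
// masks address different halves of the concatenated byte pool of the
// combined v_perm (which half maps to which operand is a property of v_perm's
// operand order and is not modelled here), while "write zero" bytes stay
// 0x0c because FirstCs is ORed back in. Standalone copy of the same steps:
constexpr unsigned sketchRebaseFirstMask(unsigned FirstMask) {
  const unsigned FirstCs = FirstMask & 0x0c0c0c0c;
  const unsigned FirstPlusFour = FirstMask | 0x04040404;
  return (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
}
static_assert(sketchRebaseFirstMask(0x03000c0c) == 0x07040c0c,
              "selectors 3 and 0 become 7 and 4; 0x0c bytes stay 0x0c");
//===----------------------------------------------------------------------===//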
15805
15806static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15807 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15808 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15809 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15810 EntryMask += ZeroMask;
15811 }
15812}
15813
15814static bool isMul(const SDValue Op) {
15815 auto Opcode = Op.getOpcode();
15816
15817 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15818 Opcode == AMDGPUISD::MUL_I24);
15819}
15820
15821static std::optional<bool>
15822checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15823 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15824 const SDValue &S1Op, const SelectionDAG &DAG) {
15825 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15826 // of the dot4 are irrelevant.
15827 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15828 return false;
15829
15830 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15831 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15832 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15833 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15834 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15835 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15836
15837 assert(!(S0IsUnsigned && S0IsSigned));
15838 assert(!(S1IsUnsigned && S1IsSigned));
15839
15840 // There are 9 possible permutations of
15841 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15842
15843 // In two permutations, the sign bits are known to be the same for both Ops,
15844 // so simply return Signed / Unsigned corresponding to the MSB
15845
15846 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15847 return S0IsSigned;
15848
15849 // In another two permutations, the sign bits are known to be opposite. In
15850 // this case return std::nullopt to indicate a bad match.
15851
15852 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15853 return std::nullopt;
15854
15855 // In the remaining five permutations, we don't know the value of the sign
15856 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15857 // the upper bits must be extension bits. Thus, the only ways for the sign
15858 // bit to be unknown are if it was sign extended from an unknown value or if
15859 // it was any extended. In either case, it is correct to use the signed
15860 // version of the signedness semantics of dot4.
15861
15862 // In two of these permutations, we know the sign bit is set for
15863 // one op and unknown for the other. It is okay to use the signed version of
15864 // dot4.
15865 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15866 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15867 return true;
15868
15869 // In one such permutation, we don't know either of the sign bits. It is okay
15870 // to use the signed version of dot4.
15871 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15872 return true;
15873
15874 // In two of these permutations, we know the sign bit is unset for
15875 // one op and unknown for the other. Return std::nullopt to indicate a
15876 // bad match.
15877 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15878 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15879 return std::nullopt;
15880
15881 llvm_unreachable("Fully covered condition");
15882}
15883
15884SDValue SITargetLowering::performAddCombine(SDNode *N,
15885 DAGCombinerInfo &DCI) const {
15886 SelectionDAG &DAG = DCI.DAG;
15887 EVT VT = N->getValueType(0);
15888 SDLoc SL(N);
15889 SDValue LHS = N->getOperand(0);
15890 SDValue RHS = N->getOperand(1);
15891
15892 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15893 if (Subtarget->hasMad64_32()) {
15894 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15895 return Folded;
15896 }
15897 }
15898
15899 if (SDValue V = reassociateScalarOps(N, DAG)) {
15900 return V;
15901 }
15902
15903 if (VT == MVT::i64) {
15904 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15905 return Folded;
15906 }
15907
15908 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15909 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15910 SDValue TempNode(N, 0);
15911 std::optional<bool> IsSigned;
15912 SmallVector<DotSrc, 4> Src0s;
15913 SmallVector<DotSrc, 4> Src1s;
15914 SmallVector<SDValue, 4> Src2s;
15915
15916 // Match the v_dot4 tree, while collecting src nodes.
15917 int ChainLength = 0;
15918 for (int I = 0; I < 4; I++) {
15919 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15920 if (MulIdx == -1)
15921 break;
15922 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15923 if (!Src0)
15924 break;
15925 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15926 if (!Src1)
15927 break;
15928
15929 auto IterIsSigned = checkDot4MulSignedness(
15930 TempNode->getOperand(MulIdx), *Src0, *Src1,
15931 TempNode->getOperand(MulIdx)->getOperand(0),
15932 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15933 if (!IterIsSigned)
15934 break;
15935 if (!IsSigned)
15936 IsSigned = *IterIsSigned;
15937 if (*IterIsSigned != *IsSigned)
15938 break;
15939 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15940 auto AddIdx = 1 - MulIdx;
15941 // Allow the special case where add (add (mul24, 0), mul24) became
15942 // add (mul24, mul24).
15943 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15944 Src2s.push_back(TempNode->getOperand(AddIdx));
15945 auto Src0 =
15946 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15947 if (!Src0)
15948 break;
15949 auto Src1 =
15950 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15951 if (!Src1)
15952 break;
15953 auto IterIsSigned = checkDot4MulSignedness(
15954 TempNode->getOperand(AddIdx), *Src0, *Src1,
15955 TempNode->getOperand(AddIdx)->getOperand(0),
15956 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15957 if (!IterIsSigned)
15958 break;
15959 assert(IsSigned);
15960 if (*IterIsSigned != *IsSigned)
15961 break;
15962 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15963 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15964 ChainLength = I + 2;
15965 break;
15966 }
15967
15968 TempNode = TempNode->getOperand(AddIdx);
15969 Src2s.push_back(TempNode);
15970 ChainLength = I + 1;
15971 if (TempNode->getNumOperands() < 2)
15972 break;
15973 LHS = TempNode->getOperand(0);
15974 RHS = TempNode->getOperand(1);
15975 }
15976
15977 if (ChainLength < 2)
15978 return SDValue();
15979
15980 // Masks were constructed with the assumption that we would find a chain of
15981 // length 4. If not, then we need to zero out the MSB bytes (via perm mask
15982 // 0x0c) so they do not affect the dot calculation.
15983 if (ChainLength < 4) {
15984 fixMasks(Src0s, ChainLength);
15985 fixMasks(Src1s, ChainLength);
15986 }
15987
15988 SDValue Src0, Src1;
15989
15990 // If we are just using a single source for both, and have permuted the
15991 // bytes consistently, we can just use the sources without permuting
15992 // (commutation).
15993 bool UseOriginalSrc = false;
15994 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15995 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15996 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15997 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15998 SmallVector<unsigned, 4> SrcBytes;
15999 auto Src0Mask = Src0s.begin()->PermMask;
16000 SrcBytes.push_back(Src0Mask & 0xFF000000);
16001 bool UniqueEntries = true;
16002 for (auto I = 1; I < 4; I++) {
16003 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16004
16005 if (is_contained(SrcBytes, NextByte)) {
16006 UniqueEntries = false;
16007 break;
16008 }
16009 SrcBytes.push_back(NextByte);
16010 }
16011
16012 if (UniqueEntries) {
16013 UseOriginalSrc = true;
16014
16015 auto *FirstElt = Src0s.begin();
16016 auto FirstEltOp =
16017 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16018
16019 auto *SecondElt = Src1s.begin();
16020 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16021 SecondElt->DWordOffset);
16022
16023 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16024 MVT::getIntegerVT(32));
16025 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16026 MVT::getIntegerVT(32));
16027 }
16028 }
16029
16030 if (!UseOriginalSrc) {
16031 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16032 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16033 }
16034
16035 assert(IsSigned);
16036 SDValue Src2 =
16037 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16038
16039 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16040 : Intrinsic::amdgcn_udot4,
16041 SL, MVT::i64);
16042
16043 assert(!VT.isVector());
16044 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16045 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16046
16047 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16048 }
16049
16050 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16051 return SDValue();
16052
16053 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16054 // add x, sext (setcc) => usubo_carry x, 0, setcc
16055 unsigned Opc = LHS.getOpcode();
16056 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16057 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16058 std::swap(RHS, LHS);
16059
16060 Opc = RHS.getOpcode();
16061 switch (Opc) {
16062 default:
16063 break;
16064 case ISD::ZERO_EXTEND:
16065 case ISD::SIGN_EXTEND:
16066 case ISD::ANY_EXTEND: {
16067 auto Cond = RHS.getOperand(0);
16068 // If this won't be a real VOPC output, we would still need to insert an
16069 // extra instruction anyway.
16070 if (!isBoolSGPR(Cond))
16071 break;
16072 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16073 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16074 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16075 return DAG.getNode(Opc, SL, VTList, Args);
16076 }
16077 case ISD::UADDO_CARRY: {
16078 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16079 if (!isNullConstant(RHS.getOperand(1)))
16080 break;
16081 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16082 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16083 }
16084 }
16085 return SDValue();
16086}
16087
16088SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16089 DAGCombinerInfo &DCI) const {
16090 SelectionDAG &DAG = DCI.DAG;
16091 SDLoc DL(N);
16092 EVT VT = N->getValueType(0);
16093 SDValue N0 = N->getOperand(0);
16094 SDValue N1 = N->getOperand(1);
16095
16096 // The following folds transform PTRADDs into regular arithmetic in cases
16097 // where the PTRADD wouldn't be folded as an immediate offset into memory
16098 // instructions anyway. They are target-specific in that other targets might
16099 // prefer to not lose information about the pointer arithmetic.
16100
16101 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16102 // Adapted from DAGCombiner::visitADDLikeCommutative.
16103 SDValue V, K;
16104 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16105 SDNodeFlags ShlFlags = N1->getFlags();
16106 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16107 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16108 // preserved.
16109 SDNodeFlags NewShlFlags =
16110 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16111 ? SDNodeFlags::NoSignedWrap
16112 : SDNodeFlags();
16113 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16114 DCI.AddToWorklist(Inner.getNode());
16115 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16116 }
16117
16118 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16119 // performAddCombine.
16120 if (N1.getOpcode() == ISD::MUL) {
16121 if (Subtarget->hasMad64_32()) {
16122 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16123 return Folded;
16124 }
16125 }
16126
16127 // If the 32 low bits of the constant are all zero, there is nothing to fold
16128 // into an immediate offset, so it's better to eliminate the unnecessary
16129 // addition for the lower 32 bits than to preserve the PTRADD.
16130 // Analogous to a fold in performAddCombine.
16131 if (VT == MVT::i64) {
16132 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16133 return Folded;
16134 }
16135
16136 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16137 return SDValue();
16138
16139 SDValue X = N0;
16140 SDValue Y = N1.getOperand(0);
16141 SDValue Z = N1.getOperand(1);
16142 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16143 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16144
16145 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16146 Y->isDivergent() != Z->isDivergent()) {
16147 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16148 // y are uniform and z isn't.
16149 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16150 // z are uniform and y isn't.
16151 // The goal is to push uniform operands up in the computation, so that they
16152 // can be handled with scalar operations. We can't use reassociateScalarOps
16153 // for this since it requires two identical commutative operations to
16154 // reassociate.
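// (Illustrative note: if x and y are uniform, the inner ptradd can be handled
// by scalar instructions, leaving only the outer, divergent add on the VALU.)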
16155 if (Y->isDivergent())
16156 std::swap(Y, Z);
16157 // If both additions in the original were NUW, reassociation preserves that.
16158 SDNodeFlags ReassocFlags =
16159 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16160 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16161 DCI.AddToWorklist(UniformInner.getNode());
16162 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16163 }
16164
16165 return SDValue();
16166}
16167
16168SDValue SITargetLowering::performSubCombine(SDNode *N,
16169 DAGCombinerInfo &DCI) const {
16170 SelectionDAG &DAG = DCI.DAG;
16171 EVT VT = N->getValueType(0);
16172
16173 if (VT == MVT::i64) {
16174 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16175 return Folded;
16176 }
16177
16178 if (VT != MVT::i32)
16179 return SDValue();
16180
16181 SDLoc SL(N);
16182 SDValue LHS = N->getOperand(0);
16183 SDValue RHS = N->getOperand(1);
16184
16185 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16186 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16187 unsigned Opc = RHS.getOpcode();
16188 switch (Opc) {
16189 default:
16190 break;
16191 case ISD::ZERO_EXTEND:
16192 case ISD::SIGN_EXTEND:
16193 case ISD::ANY_EXTEND: {
16194 auto Cond = RHS.getOperand(0);
16195 // If this won't be a real VOPC output, we would still need to insert an
16196 // extra instruction anyway.
16197 if (!isBoolSGPR(Cond))
16198 break;
16199 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16200 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16201 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16202 return DAG.getNode(Opc, SL, VTList, Args);
16203 }
16204 }
16205
16206 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16207 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16208 if (!isNullConstant(LHS.getOperand(1)))
16209 return SDValue();
16210 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16211 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16212 }
16213 return SDValue();
16214}
16215
16216SDValue
16217SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16218 DAGCombinerInfo &DCI) const {
16219
16220 if (N->getValueType(0) != MVT::i32)
16221 return SDValue();
16222
16223 if (!isNullConstant(N->getOperand(1)))
16224 return SDValue();
16225
16226 SelectionDAG &DAG = DCI.DAG;
16227 SDValue LHS = N->getOperand(0);
16228
16229 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16230 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16231 unsigned LHSOpc = LHS.getOpcode();
16232 unsigned Opc = N->getOpcode();
16233 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16234 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16235 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16236 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16237 }
16238 return SDValue();
16239}
16240
16241SDValue SITargetLowering::performFAddCombine(SDNode *N,
16242 DAGCombinerInfo &DCI) const {
16243 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16244 return SDValue();
16245
16246 SelectionDAG &DAG = DCI.DAG;
16247 EVT VT = N->getValueType(0);
16248
16249 SDLoc SL(N);
16250 SDValue LHS = N->getOperand(0);
16251 SDValue RHS = N->getOperand(1);
16252
16253 // These should really be instruction patterns, but writing patterns with
16254 // source modifiers is a pain.
16255
16256 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16257 if (LHS.getOpcode() == ISD::FADD) {
16258 SDValue A = LHS.getOperand(0);
16259 if (A == LHS.getOperand(1)) {
16260 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16261 if (FusedOp != 0) {
16262 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16263 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16264 }
16265 }
16266 }
16267
16268 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16269 if (RHS.getOpcode() == ISD::FADD) {
16270 SDValue A = RHS.getOperand(0);
16271 if (A == RHS.getOperand(1)) {
16272 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16273 if (FusedOp != 0) {
16274 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16275 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16276 }
16277 }
16278 }
16279
16280 return SDValue();
16281}
16282
16283SDValue SITargetLowering::performFSubCombine(SDNode *N,
16284 DAGCombinerInfo &DCI) const {
16285 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16286 return SDValue();
16287
16288 SelectionDAG &DAG = DCI.DAG;
16289 SDLoc SL(N);
16290 EVT VT = N->getValueType(0);
16291 assert(!VT.isVector());
16292
16293 // Try to get the fneg to fold into the source modifier. This undoes generic
16294 // DAG combines and folds them into the mad.
16295 //
16296 // Only do this if we are not trying to support denormals. v_mad_f32 does
16297 // not support denormals ever.
16298 SDValue LHS = N->getOperand(0);
16299 SDValue RHS = N->getOperand(1);
16300 if (LHS.getOpcode() == ISD::FADD) {
16301 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16302 SDValue A = LHS.getOperand(0);
16303 if (A == LHS.getOperand(1)) {
16304 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16305 if (FusedOp != 0) {
16306 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16307 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16308
16309 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16310 }
16311 }
16312 }
16313
16314 if (RHS.getOpcode() == ISD::FADD) {
16315 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16316
16317 SDValue A = RHS.getOperand(0);
16318 if (A == RHS.getOperand(1)) {
16319 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16320 if (FusedOp != 0) {
16321 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16322 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16323 }
16324 }
16325 }
16326
16327 return SDValue();
16328}
16329
16330SDValue SITargetLowering::performFDivCombine(SDNode *N,
16331 DAGCombinerInfo &DCI) const {
16332 SelectionDAG &DAG = DCI.DAG;
16333 SDLoc SL(N);
16334 EVT VT = N->getValueType(0);
16335 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16336 return SDValue();
16337
16338 SDValue LHS = N->getOperand(0);
16339 SDValue RHS = N->getOperand(1);
16340
16341 SDNodeFlags Flags = N->getFlags();
16342 SDNodeFlags RHSFlags = RHS->getFlags();
16343 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16344 !RHS->hasOneUse())
16345 return SDValue();
16346
16347 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16348 bool IsNegative = false;
16349 if (CLHS->isExactlyValue(1.0) ||
16350 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16351 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16352 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16353 if (RHS.getOpcode() == ISD::FSQRT) {
16354 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16355 SDValue Rsq =
16356 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16357 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16358 }
16359 }
16360 }
16361
16362 return SDValue();
16363}
16364
16365SDValue SITargetLowering::performFMulCombine(SDNode *N,
16366 DAGCombinerInfo &DCI) const {
16367 SelectionDAG &DAG = DCI.DAG;
16368 EVT VT = N->getValueType(0);
16369 EVT ScalarVT = VT.getScalarType();
16370 EVT IntVT = VT.changeElementType(MVT::i32);
16371
16372 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16373 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16374 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16375 return SDValue();
16376 }
16377
16378 SDValue LHS = N->getOperand(0);
16379 SDValue RHS = N->getOperand(1);
16380
16381 // It is cheaper to realize i32 inline constants than to materialize f16 or
16382 // f64 (or even non-inline f32) values; this is possible via ldexp usage, as
16383 // shown below:
16384 //
16385 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16386 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16387 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
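// (Illustrative example: fmul x, (select y, 8.0, 0.5)
//   -> ldexp(x, (select i32 y, 3, -1)), since 8.0 = 2^3 and 0.5 = 2^-1.)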
16388 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16389 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16390 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16391 if (!TrueNode)
16392 return SDValue();
16393 const ConstantFPSDNode *FalseNode =
16394 isConstOrConstSplatFP(RHS.getOperand(2));
16395 if (!FalseNode)
16396 return SDValue();
16397
16398 if (TrueNode->isNegative() != FalseNode->isNegative())
16399 return SDValue();
16400
16401 // For f32, only non-inline constants should be transformed.
16402 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16403 if (ScalarVT == MVT::f32 &&
16404 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16405 TII->isInlineConstant(FalseNode->getValueAPF()))
16406 return SDValue();
16407
16408 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16409 if (TrueNodeExpVal == INT_MIN)
16410 return SDValue();
16411 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16412 if (FalseNodeExpVal == INT_MIN)
16413 return SDValue();
16414
16415 SDLoc SL(N);
16416 SDValue SelectNode =
16417 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16418 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16419 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16420
16421 LHS = TrueNode->isNegative()
16422 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16423 : LHS;
16424
16425 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16426 }
16427
16428 return SDValue();
16429}
16430
16431SDValue SITargetLowering::performFMACombine(SDNode *N,
16432 DAGCombinerInfo &DCI) const {
16433 SelectionDAG &DAG = DCI.DAG;
16434 EVT VT = N->getValueType(0);
16435 SDLoc SL(N);
16436
16437 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16438 return SDValue();
16439
16440 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16441 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
16442 SDValue Op1 = N->getOperand(0);
16443 SDValue Op2 = N->getOperand(1);
16444 SDValue FMA = N->getOperand(2);
16445
16446 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16447 Op2.getOpcode() != ISD::FP_EXTEND)
16448 return SDValue();
16449
16450 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16451 // regardless of the denorm mode setting. Therefore,
16452 // fp-contract is sufficient to allow generating fdot2.
16453 const TargetOptions &Options = DAG.getTarget().Options;
16454 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16455 (N->getFlags().hasAllowContract() &&
16456 FMA->getFlags().hasAllowContract())) {
16457 Op1 = Op1.getOperand(0);
16458 Op2 = Op2.getOperand(0);
16459 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16460 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16461 return SDValue();
16462
16463 SDValue Vec1 = Op1.getOperand(0);
16464 SDValue Idx1 = Op1.getOperand(1);
16465 SDValue Vec2 = Op2.getOperand(0);
16466
16467 SDValue FMAOp1 = FMA.getOperand(0);
16468 SDValue FMAOp2 = FMA.getOperand(1);
16469 SDValue FMAAcc = FMA.getOperand(2);
16470
16471 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16472 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16473 return SDValue();
16474
16475 FMAOp1 = FMAOp1.getOperand(0);
16476 FMAOp2 = FMAOp2.getOperand(0);
16477 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16478 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16479 return SDValue();
16480
16481 SDValue Vec3 = FMAOp1.getOperand(0);
16482 SDValue Vec4 = FMAOp2.getOperand(0);
16483 SDValue Idx2 = FMAOp1.getOperand(1);
16484
16485 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16486 // Idx1 and Idx2 cannot be the same.
16487 Idx1 == Idx2)
16488 return SDValue();
16489
16490 if (Vec1 == Vec2 || Vec3 == Vec4)
16491 return SDValue();
16492
16493 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16494 return SDValue();
16495
16496 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16497 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16498 DAG.getTargetConstant(0, SL, MVT::i1));
16499 }
16500 }
16501 return SDValue();
16502}
16503
16504SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16505 DAGCombinerInfo &DCI) const {
16506 SelectionDAG &DAG = DCI.DAG;
16507 SDLoc SL(N);
16508
16509 SDValue LHS = N->getOperand(0);
16510 SDValue RHS = N->getOperand(1);
16511 EVT VT = LHS.getValueType();
16512 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16513
16514 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16515 if (!CRHS) {
16516 CRHS = dyn_cast<ConstantSDNode>(LHS);
16517 if (CRHS) {
16518 std::swap(LHS, RHS);
16519 CC = getSetCCSwappedOperands(CC);
16520 }
16521 }
16522
16523 if (CRHS) {
16524 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16525 isBoolSGPR(LHS.getOperand(0))) {
16526 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16527 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16528 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16529 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16530 if ((CRHS->isAllOnes() &&
16531 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16532 (CRHS->isZero() &&
16533 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16534 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16535 DAG.getAllOnesConstant(SL, MVT::i1));
16536 if ((CRHS->isAllOnes() &&
16537 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16538 (CRHS->isZero() &&
16539 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16540 return LHS.getOperand(0);
16541 }
16542
16543 const APInt &CRHSVal = CRHS->getAPIntValue();
16544 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16545 LHS.getOpcode() == ISD::SELECT &&
16546 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16547 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16548 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16549 isBoolSGPR(LHS.getOperand(0))) {
16550 // Given CT != FT:
16551 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16552 // setcc (select cc, CT, CF), CF, ne => cc
16553 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16554 // setcc (select cc, CT, CF), CT, eq => cc
16555 const APInt &CT = LHS.getConstantOperandAPInt(1);
16556 const APInt &CF = LHS.getConstantOperandAPInt(2);
16557
16558 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16559 (CT == CRHSVal && CC == ISD::SETNE))
16560 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16561 DAG.getAllOnesConstant(SL, MVT::i1));
16562 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16563 (CT == CRHSVal && CC == ISD::SETEQ))
16564 return LHS.getOperand(0);
16565 }
16566 }
16567
16568 if (VT != MVT::f32 && VT != MVT::f64 &&
16569 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16570 return SDValue();
16571
16572 // Match isinf/isfinite pattern
16573 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16574 // (fcmp one (fabs x), inf) -> (fp_class x,
16575 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16576 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16577 LHS.getOpcode() == ISD::FABS) {
16578 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16579 if (!CRHS)
16580 return SDValue();
16581
16582 const APFloat &APF = CRHS->getValueAPF();
16583 if (APF.isInfinity() && !APF.isNegative()) {
16584 const unsigned IsInfMask =
16585 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16586 const unsigned IsFiniteMask =
16587 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16588 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16589 SIInstrFlags::P_SUBNORMAL;
16590 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16591 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16592 DAG.getConstant(Mask, SL, MVT::i32));
16593 }
16594 }
16595
16596 return SDValue();
16597}
16598
16599SDValue
16600SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16601 DAGCombinerInfo &DCI) const {
16602 SelectionDAG &DAG = DCI.DAG;
16603 SDLoc SL(N);
16604 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16605
16606 SDValue Src = N->getOperand(0);
16607 SDValue Shift = N->getOperand(0);
16608
16609 // TODO: Extend type shouldn't matter (assuming legal types).
16610 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16611 Shift = Shift.getOperand(0);
16612
16613 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16614 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16615 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16616 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16617 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16618 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16619 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16620 SDValue Shifted = DAG.getZExtOrTrunc(
16621 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16622
16623 unsigned ShiftOffset = 8 * Offset;
16624 if (Shift.getOpcode() == ISD::SHL)
16625 ShiftOffset -= C->getZExtValue();
16626 else
16627 ShiftOffset += C->getZExtValue();
16628
16629 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16630 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16631 MVT::f32, Shifted);
16632 }
16633 }
16634 }
16635
16636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16637 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16638 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16639 // We simplified Src. If this node is not dead, visit it again so it is
16640 // folded properly.
16641 if (N->getOpcode() != ISD::DELETED_NODE)
16642 DCI.AddToWorklist(N);
16643 return SDValue(N, 0);
16644 }
16645
16646 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16647 if (SDValue DemandedSrc =
16648 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16649 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16650
16651 return SDValue();
16652}
16653
16654SDValue SITargetLowering::performClampCombine(SDNode *N,
16655 DAGCombinerInfo &DCI) const {
16656 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16657 if (!CSrc)
16658 return SDValue();
16659
16660 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16661 const APFloat &F = CSrc->getValueAPF();
16662 APFloat Zero = APFloat::getZero(F.getSemantics());
16663 if (F < Zero ||
16664 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16665 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16666 }
16667
16668 APFloat One(F.getSemantics(), "1.0");
16669 if (F > One)
16670 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16671
16672 return SDValue(CSrc, 0);
16673}
16674
16675SDValue SITargetLowering::performSelectCombine(SDNode *N,
16676 DAGCombinerInfo &DCI) const {
16677
16678 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16679 // integer).
16680 // Detect when CMP and SELECT use the same constant and fold them to avoid
16681 // loading the constant twice. Specifically handles patterns like:
16682 // %cmp = icmp eq i32 %val, 4242
16683 // %sel = select i1 %cmp, i32 4242, i32 %other
16684 // It can be optimized to reuse %val instead of 4242 in select.
16685 SDValue Cond = N->getOperand(0);
16686 SDValue TrueVal = N->getOperand(1);
16687 SDValue FalseVal = N->getOperand(2);
16688
16689 // Check if condition is a comparison.
16690 if (Cond.getOpcode() != ISD::SETCC)
16691 return SDValue();
16692
16693 SDValue LHS = Cond.getOperand(0);
16694 SDValue RHS = Cond.getOperand(1);
16695 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16696
16697 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16698 bool isInteger = LHS.getValueType().isInteger();
16699
16700 // Handle simple floating-point and integer types only.
16701 if (!isFloatingPoint && !isInteger)
16702 return SDValue();
16703
16704 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16705 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16706 if (!isEquality && !isNonEquality)
16707 return SDValue();
16708
16709 SDValue ArgVal, ConstVal;
16710 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16711 (isInteger && isa<ConstantSDNode>(RHS))) {
16712 ConstVal = RHS;
16713 ArgVal = LHS;
16714 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16715 (isInteger && isa<ConstantSDNode>(LHS))) {
16716 ConstVal = LHS;
16717 ArgVal = RHS;
16718 } else {
16719 return SDValue();
16720 }
16721
16722 // Skip optimization for inlinable immediates.
16723 if (isFloatingPoint) {
16724 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16725 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16726 return SDValue();
16727 } else {
16728 if (AMDGPU::isInlinableIntLiteral(
16729 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16730 return SDValue();
16731 }
16732
16733 // For equality and non-equality comparisons, patterns:
16734 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16735 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16736 if (!(isEquality && TrueVal == ConstVal) &&
16737 !(isNonEquality && FalseVal == ConstVal))
16738 return SDValue();
16739
16740 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16741 SDValue SelectRHS =
16742 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16743 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16744 SelectLHS, SelectRHS);
16745}
16746
16747 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16748 DAGCombinerInfo &DCI) const {
16749 switch (N->getOpcode()) {
16750 case ISD::ADD:
16751 case ISD::SUB:
16752 case ISD::SHL:
16753 case ISD::SRL:
16754 case ISD::SRA:
16755 case ISD::AND:
16756 case ISD::OR:
16757 case ISD::XOR:
16758 case ISD::MUL:
16759 case ISD::SETCC:
16760 case ISD::SELECT:
16761 case ISD::SMIN:
16762 case ISD::SMAX:
16763 case ISD::UMIN:
16764 case ISD::UMAX:
16765 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16766 return Res;
16767 break;
16768 default:
16769 break;
16770 }
16771
16772 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16773 return SDValue();
16774
16775 switch (N->getOpcode()) {
16776 case ISD::ADD:
16777 return performAddCombine(N, DCI);
16778 case ISD::PTRADD:
16779 return performPtrAddCombine(N, DCI);
16780 case ISD::SUB:
16781 return performSubCombine(N, DCI);
16782 case ISD::UADDO_CARRY:
16783 case ISD::USUBO_CARRY:
16784 return performAddCarrySubCarryCombine(N, DCI);
16785 case ISD::FADD:
16786 return performFAddCombine(N, DCI);
16787 case ISD::FSUB:
16788 return performFSubCombine(N, DCI);
16789 case ISD::FDIV:
16790 return performFDivCombine(N, DCI);
16791 case ISD::FMUL:
16792 return performFMulCombine(N, DCI);
16793 case ISD::SETCC:
16794 return performSetCCCombine(N, DCI);
16795 case ISD::SELECT:
16796 if (auto Res = performSelectCombine(N, DCI))
16797 return Res;
16798 break;
16799 case ISD::FMAXNUM:
16800 case ISD::FMINNUM:
16801 case ISD::FMAXNUM_IEEE:
16802 case ISD::FMINNUM_IEEE:
16803 case ISD::FMAXIMUM:
16804 case ISD::FMINIMUM:
16805 case ISD::FMAXIMUMNUM:
16806 case ISD::FMINIMUMNUM:
16807 case ISD::SMAX:
16808 case ISD::SMIN:
16809 case ISD::UMAX:
16810 case ISD::UMIN:
16811 case AMDGPUISD::FMIN_LEGACY:
16812 case AMDGPUISD::FMAX_LEGACY:
16813 return performMinMaxCombine(N, DCI);
16814 case ISD::FMA:
16815 return performFMACombine(N, DCI);
16816 case ISD::AND:
16817 return performAndCombine(N, DCI);
16818 case ISD::OR:
16819 return performOrCombine(N, DCI);
16820 case ISD::FSHR: {
16821 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16822 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16823 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16824 return matchPERM(N, DCI);
16825 }
16826 break;
16827 }
16828 case ISD::XOR:
16829 return performXorCombine(N, DCI);
16830 case ISD::ZERO_EXTEND:
16831 return performZeroExtendCombine(N, DCI);
16832 case ISD::SIGN_EXTEND_INREG:
16833 return performSignExtendInRegCombine(N, DCI);
16834 case AMDGPUISD::FP_CLASS:
16835 return performClassCombine(N, DCI);
16836 case ISD::FCANONICALIZE:
16837 return performFCanonicalizeCombine(N, DCI);
16838 case AMDGPUISD::RCP:
16839 return performRcpCombine(N, DCI);
16840 case ISD::FLDEXP:
16841 case AMDGPUISD::FRACT:
16842 case AMDGPUISD::RSQ:
16843 case AMDGPUISD::RCP_LEGACY:
16844 case AMDGPUISD::RCP_IFLAG:
16845 case AMDGPUISD::RSQ_CLAMP: {
16846 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16847 SDValue Src = N->getOperand(0);
16848 if (Src.isUndef())
16849 return Src;
16850 break;
16851 }
16852 case ISD::SINT_TO_FP:
16853 case ISD::UINT_TO_FP:
16854 return performUCharToFloatCombine(N, DCI);
16855 case ISD::FCOPYSIGN:
16856 return performFCopySignCombine(N, DCI);
16857 case AMDGPUISD::CVT_F32_UBYTE0:
16858 case AMDGPUISD::CVT_F32_UBYTE1:
16859 case AMDGPUISD::CVT_F32_UBYTE2:
16860 case AMDGPUISD::CVT_F32_UBYTE3:
16861 return performCvtF32UByteNCombine(N, DCI);
16862 case AMDGPUISD::FMED3:
16863 return performFMed3Combine(N, DCI);
16864 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16865 return performCvtPkRTZCombine(N, DCI);
16866 case AMDGPUISD::CLAMP:
16867 return performClampCombine(N, DCI);
16868 case ISD::SCALAR_TO_VECTOR: {
16869 SelectionDAG &DAG = DCI.DAG;
16870 EVT VT = N->getValueType(0);
16871
16872 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16873 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16874 SDLoc SL(N);
16875 SDValue Src = N->getOperand(0);
16876 EVT EltVT = Src.getValueType();
16877 if (EltVT != MVT::i16)
16878 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16879
16880 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16881 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16882 }
16883
16884 break;
16885 }
16886 case ISD::EXTRACT_VECTOR_ELT:
16887 return performExtractVectorEltCombine(N, DCI);
16888 case ISD::INSERT_VECTOR_ELT:
16889 return performInsertVectorEltCombine(N, DCI);
16890 case ISD::FP_ROUND:
16891 return performFPRoundCombine(N, DCI);
16892 case ISD::LOAD: {
16893 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16894 return Widened;
16895 [[fallthrough]];
16896 }
16897 default: {
16898 if (!DCI.isBeforeLegalize()) {
16899 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16900 return performMemSDNodeCombine(MemNode, DCI);
16901 }
16902
16903 break;
16904 }
16905 }
16906 
16907 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16908}
16909
16910/// Helper function for adjustWritemask
16911static unsigned SubIdx2Lane(unsigned Idx) {
16912 switch (Idx) {
16913 default:
16914 return ~0u;
16915 case AMDGPU::sub0:
16916 return 0;
16917 case AMDGPU::sub1:
16918 return 1;
16919 case AMDGPU::sub2:
16920 return 2;
16921 case AMDGPU::sub3:
16922 return 3;
16923 case AMDGPU::sub4:
16924 return 4; // Possible with TFE/LWE
16925 }
16926}
16927
16928/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16929SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16930 SelectionDAG &DAG) const {
16931 unsigned Opcode = Node->getMachineOpcode();
16932
16933 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16934 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16935 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16936 return Node; // not implemented for D16
16937
16938 SDNode *Users[5] = {nullptr};
16939 unsigned Lane = 0;
16940 unsigned DmaskIdx =
16941 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16942 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16943 unsigned NewDmask = 0;
16944 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16945 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16946 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16947 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16948 unsigned TFCLane = 0;
16949 bool HasChain = Node->getNumValues() > 1;
16950
16951 if (OldDmask == 0) {
16952 // These are folded out, but on the chance it happens don't assert.
16953 return Node;
16954 }
16955
16956 unsigned OldBitsSet = llvm::popcount(OldDmask);
16957 // Work out which is the TFE/LWE lane if that is enabled.
16958 if (UsesTFC) {
16959 TFCLane = OldBitsSet;
16960 }
16961
16962 // Try to figure out the used register components
16963 for (SDUse &Use : Node->uses()) {
16964
16965 // Don't look at users of the chain.
16966 if (Use.getResNo() != 0)
16967 continue;
16968
16969 SDNode *User = Use.getUser();
16970
16971 // Abort if we can't understand the usage
16972 if (!User->isMachineOpcode() ||
16973 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16974 return Node;
16975
16976 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16977 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16978 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16979 // set, etc.
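// (Illustrative example: with OldDmask = 0b1010, Lane 0 maps to component 1
// and Lane 1 maps to component 3.)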
16980 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16981 if (Lane == ~0u)
16982 return Node;
16983
16984 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16985 if (UsesTFC && Lane == TFCLane) {
16986 Users[Lane] = User;
16987 } else {
16988 // Set which texture component corresponds to the lane.
16989 unsigned Comp;
16990 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16991 Comp = llvm::countr_zero(Dmask);
16992 Dmask &= ~(1 << Comp);
16993 }
16994
16995 // Abort if we have more than one user per component.
16996 if (Users[Lane])
16997 return Node;
16998
16999 Users[Lane] = User;
17000 NewDmask |= 1 << Comp;
17001 }
17002 }
17003
17004 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17005 bool NoChannels = !NewDmask;
17006 if (NoChannels) {
17007 if (!UsesTFC) {
17008 // No uses of the result and not using TFC. Then do nothing.
17009 return Node;
17010 }
17011 // If the original dmask has one channel - then nothing to do
17012 if (OldBitsSet == 1)
17013 return Node;
17014 // Use an arbitrary dmask - required for the instruction to work
17015 NewDmask = 1;
17016 }
17017 // Abort if there's no change
17018 if (NewDmask == OldDmask)
17019 return Node;
17020
17021 unsigned BitsSet = llvm::popcount(NewDmask);
17022
17023 // Check for TFE or LWE - increase the number of channels by one to account
17024 // for the extra return value
17025 // This will need adjustment for D16 if this is also included in
17026 // adjustWritemask (this function), but at present D16 is excluded.
17027 unsigned NewChannels = BitsSet + UsesTFC;
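// (Illustrative example: NewDmask = 0b0011 with TFE enabled gives
// NewChannels = 3, which is widened to a 4-element result type below.)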
17028
17029 int NewOpcode =
17030 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17031 assert(NewOpcode != -1 &&
17032 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17033 "failed to find equivalent MIMG op");
17034
17035 // Adjust the writemask in the node
17036 SmallVector<SDValue, 12> Ops;
17037 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17038 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17039 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17040
17041 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17042
17043 MVT ResultVT = NewChannels == 1
17044 ? SVT
17045 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17046 : NewChannels == 5 ? 8
17047 : NewChannels);
17048 SDVTList NewVTList =
17049 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17050
17051 MachineSDNode *NewNode =
17052 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17053
17054 if (HasChain) {
17055 // Update chain.
17056 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17057 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17058 }
17059
17060 if (NewChannels == 1) {
17061 assert(Node->hasNUsesOfValue(1, 0));
17062 SDNode *Copy =
17063 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17064 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17065 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17066 return nullptr;
17067 }
17068
17069 // Update the users of the node with the new indices
17070 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17071 SDNode *User = Users[i];
17072 if (!User) {
17073 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17074 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17075 if (i || !NoChannels)
17076 continue;
17077 } else {
17078 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17079 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17080 if (NewUser != User) {
17081 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17082 DAG.RemoveDeadNode(User);
17083 }
17084 }
17085
17086 switch (Idx) {
17087 default:
17088 break;
17089 case AMDGPU::sub0:
17090 Idx = AMDGPU::sub1;
17091 break;
17092 case AMDGPU::sub1:
17093 Idx = AMDGPU::sub2;
17094 break;
17095 case AMDGPU::sub2:
17096 Idx = AMDGPU::sub3;
17097 break;
17098 case AMDGPU::sub3:
17099 Idx = AMDGPU::sub4;
17100 break;
17101 }
17102 }
17103
17104 DAG.RemoveDeadNode(Node);
17105 return nullptr;
17106}
17107
17108 static bool isFrameIndexOp(SDValue Op) {
17109 if (Op.getOpcode() == ISD::AssertZext)
17110 Op = Op.getOperand(0);
17111
17112 return isa<FrameIndexSDNode>(Op);
17113}
17114
17115/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17116/// with frame index operands.
17117 /// LLVM assumes that inputs to these instructions are registers.
17118 SDNode *
17119 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17120 SelectionDAG &DAG) const {
17121 if (Node->getOpcode() == ISD::CopyToReg) {
17122 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17123 SDValue SrcVal = Node->getOperand(2);
17124
17125 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17126 // to try understanding copies to physical registers.
17127 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17128 SDLoc SL(Node);
17129 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17130 SDValue VReg = DAG.getRegister(
17131 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17132
17133 SDNode *Glued = Node->getGluedNode();
17134 SDValue ToVReg = DAG.getCopyToReg(
17135 Node->getOperand(0), SL, VReg, SrcVal,
17136 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17137 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17138 VReg, ToVReg.getValue(1));
17139 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17140 DAG.RemoveDeadNode(Node);
17141 return ToResultReg.getNode();
17142 }
17143 }
17144 
17145 SmallVector<SDValue, 8> Ops;
17146 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17147 if (!isFrameIndexOp(Node->getOperand(i))) {
17148 Ops.push_back(Node->getOperand(i));
17149 continue;
17150 }
17151
17152 SDLoc DL(Node);
17153 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17154 Node->getOperand(i).getValueType(),
17155 Node->getOperand(i)),
17156 0));
17157 }
17158
17159 return DAG.UpdateNodeOperands(Node, Ops);
17160}
17161
17162/// Fold the instructions after selecting them.
17163 /// Returns null if users were already updated.
17164 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17165 SelectionDAG &DAG) const {
17166 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17167 unsigned Opcode = Node->getMachineOpcode();
17168
17169 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17170 !TII->isGather4(Opcode) &&
17171 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17172 return adjustWritemask(Node, DAG);
17173 }
17174
17175 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17176 legalizeTargetIndependentNode(Node, DAG);
17177 return Node;
17178 }
17179
17180 switch (Opcode) {
17181 case AMDGPU::V_DIV_SCALE_F32_e64:
17182 case AMDGPU::V_DIV_SCALE_F64_e64: {
17183 // Satisfy the operand register constraint when one of the inputs is
17184 // undefined. Ordinarily each undef value will have its own implicit_def of
17185 // a vreg, so force these to use a single register.
17186 SDValue Src0 = Node->getOperand(1);
17187 SDValue Src1 = Node->getOperand(3);
17188 SDValue Src2 = Node->getOperand(5);
17189
17190 if ((Src0.isMachineOpcode() &&
17191 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17192 (Src0 == Src1 || Src0 == Src2))
17193 break;
17194
17195 MVT VT = Src0.getValueType().getSimpleVT();
17196 const TargetRegisterClass *RC =
17197 getRegClassFor(VT, Src0.getNode()->isDivergent());
17198
17200 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17201
17202 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17203 Src0, SDValue());
17204
17205 // src0 must be the same register as src1 or src2, even if the value is
17206 // undefined, so make sure we don't violate this constraint.
17207 if (Src0.isMachineOpcode() &&
17208 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17209 if (Src1.isMachineOpcode() &&
17210 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17211 Src0 = Src1;
17212 else if (Src2.isMachineOpcode() &&
17213 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17214 Src0 = Src2;
17215 else {
17216 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17217 Src0 = UndefReg;
17218 Src1 = UndefReg;
17219 }
17220 } else
17221 break;
17222
17224 Ops[1] = Src0;
17225 Ops[3] = Src1;
17226 Ops[5] = Src2;
17227 Ops.push_back(ImpDef.getValue(1));
17228 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17229 }
17230 default:
17231 break;
17232 }
17233
17234 return Node;
17235}
17236
17237// Any MIMG instructions that use tfe or lwe require an initialization of the
17238// result register that will be written in the case of a memory access failure.
17239// The required code is also added to tie this init code to the result of the
17240 // img instruction.
17241 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17242 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17243 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17244 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17245 MachineBasicBlock &MBB = *MI.getParent();
17246
17247 int DstIdx =
17248 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17249 unsigned InitIdx = 0;
17250
17251 if (TII->isImage(MI)) {
17252 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17253 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17254 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17255
17256 if (!TFE && !LWE) // intersect_ray
17257 return;
17258
17259 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17260 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17261 unsigned D16Val = D16 ? D16->getImm() : 0;
17262
17263 if (!TFEVal && !LWEVal)
17264 return;
17265
17266 // At least one of TFE or LWE are non-zero
17267 // We have to insert a suitable initialization of the result value and
17268 // tie this to the dest of the image instruction.
17269
17270 // Calculate which dword we have to initialize to 0.
17271 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17272
17273 // check that dmask operand is found.
17274 assert(MO_Dmask && "Expected dmask operand in instruction");
17275
17276 unsigned dmask = MO_Dmask->getImm();
17277 // Determine the number of active lanes taking into account the
17278 // Gather4 special case
17279 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17280
17281 bool Packed = !Subtarget->hasUnpackedD16VMem();
17282
17283 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17284
17285 // Abandon attempt if the dst size isn't large enough
17286 // - this is in fact an error but this is picked up elsewhere and
17287 // reported correctly.
17288 uint32_t DstSize =
17289 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17290 if (DstSize < InitIdx)
17291 return;
17292 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17293 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17294 } else {
17295 return;
17296 }
17297
17298 const DebugLoc &DL = MI.getDebugLoc();
17299
17300 // Create a register for the initialization value.
17301 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17302 unsigned NewDst = 0; // Final initialized value will be in here
17303
17304 // If PRTStrictNull feature is enabled (the default) then initialize
17305 // all the result registers to 0, otherwise just the error indication
17306 // register (VGPRn+1)
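// (Illustrative example: with InitIdx = 4, strict-null mode zero-initializes
// dwords 0..3, while non-strict mode only initializes dword 3.)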
17307 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17308 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17309
17310 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17311 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17312 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17313 // Initialize dword
17314 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17315 // clang-format off
17316 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17317 .addImm(0);
17318 // clang-format on
17319 // Insert into the super-reg
17320 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17321 .addReg(PrevDst)
17322 .addReg(SubReg)
17323 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17324 
17325 PrevDst = NewDst;
17326 }
17327
17328 // Add as an implicit operand
17329 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17330
17331 // Tie the just added implicit operand to the dst
17332 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17333}
17334
17335/// Assign the register class depending on the number of
17336/// bits set in the writemask
17337 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17338 SDNode *Node) const {
17339 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17340 
17341 MachineFunction *MF = MI.getParent()->getParent();
17342 MachineRegisterInfo &MRI = MF->getRegInfo();
17343 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17344 
17345 if (TII->isVOP3(MI.getOpcode())) {
17346 // Make sure constant bus requirements are respected.
17347 TII->legalizeOperandsVOP3(MRI, MI);
17348
17349 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17350 // This saves a chain-copy of registers and better balance register
17351 // use between vgpr and agpr as agpr tuples tend to be big.
17352 if (!MI.getDesc().operands().empty()) {
17353 unsigned Opc = MI.getOpcode();
17354 bool HasAGPRs = Info->mayNeedAGPRs();
17355 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17356 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17357 for (auto I :
17358 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17359 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17360 if (I == -1)
17361 break;
17362 if ((I == Src2Idx) && (HasAGPRs))
17363 break;
17364 MachineOperand &Op = MI.getOperand(I);
17365 if (!Op.isReg() || !Op.getReg().isVirtual())
17366 continue;
17367 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17368 if (!TRI->hasAGPRs(RC))
17369 continue;
17370 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17371 if (!Src || !Src->isCopy() ||
17372 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17373 continue;
17374 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17375 // All uses of agpr64 and agpr32 can also accept vgpr except for
17376 // v_accvgpr_read, but we do not produce agpr reads during selection,
17377 // so no use checks are needed.
17378 MRI.setRegClass(Op.getReg(), NewRC);
17379 }
17380
17381 if (TII->isMAI(MI)) {
17382 // The ordinary src0, src1, src2 were legalized above.
17383 //
17384 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17385 // as a separate instruction.
17386 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17387 AMDGPU::OpName::scale_src0);
17388 if (Src0Idx != -1) {
17389 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17390 AMDGPU::OpName::scale_src1);
17391 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17392 TII->usesConstantBus(MRI, MI, Src1Idx))
17393 TII->legalizeOpWithMove(MI, Src1Idx);
17394 }
17395 }
17396
17397 if (!HasAGPRs)
17398 return;
17399
17400 // Resolve the rest of AV operands to AGPRs.
17401 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17402 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17403 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17404 if (TRI->isVectorSuperClass(RC)) {
17405 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17406 MRI.setRegClass(Src2->getReg(), NewRC);
17407 if (Src2->isTied())
17408 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17409 }
17410 }
17411 }
17412 }
17413
17414 return;
17415 }
17416
17417 if (TII->isImage(MI))
17418 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17419}
17420
17421 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17422 uint64_t Val) {
17423 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17424 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17425}
17426
17427 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17428 const SDLoc &DL,
17429 SDValue Ptr) const {
17430 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17431 
17432 // Build the half of the subregister with the constants before building the
17433 // full 128-bit register. If we are building multiple resource descriptors,
17434 // this will allow CSEing of the 2-component register.
17435 const SDValue Ops0[] = {
17436 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17437 buildSMovImm32(DAG, DL, 0),
17438 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17439 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17440 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17441
17442 SDValue SubRegHi = SDValue(
17443 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17444
17445 // Combine the constants and the pointer.
17446 const SDValue Ops1[] = {
17447 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17448 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17449 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17450
17451 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17452}
17453
17454/// Return a resource descriptor with the 'Add TID' bit enabled
17455/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17456/// of the resource descriptor) to create an offset, which is added to
17457/// the resource pointer.
17458 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17459 SDValue Ptr, uint32_t RsrcDword1,
17460 uint64_t RsrcDword2And3) const {
17461 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17462 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17463 if (RsrcDword1) {
17464 PtrHi =
17465 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17466 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17467 0);
17468 }
17469
17470 SDValue DataLo =
17471 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17472 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17473
17474 const SDValue Ops[] = {
17475 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17476 PtrLo,
17477 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17478 PtrHi,
17479 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17480 DataLo,
17481 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17482 DataHi,
17483 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17484
17485 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17486}
17487
17488//===----------------------------------------------------------------------===//
17489// SI Inline Assembly Support
17490//===----------------------------------------------------------------------===//
17491
17492std::pair<unsigned, const TargetRegisterClass *>
17493 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17494 StringRef Constraint,
17495 MVT VT) const {
17496 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17497
17498 const TargetRegisterClass *RC = nullptr;
17499 if (Constraint.size() == 1) {
17500 // Check if we cannot determine the bit size of the given value type. This
17501 // can happen, for example, in this situation where we have an empty struct
17502 // (size 0): `call void asm "", "v"({} poison)`-
17503 if (VT == MVT::Other)
17504 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17505 const unsigned BitWidth = VT.getSizeInBits();
17506 switch (Constraint[0]) {
17507 default:
17508 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17509 case 's':
17510 case 'r':
17511 switch (BitWidth) {
17512 case 16:
17513 RC = &AMDGPU::SReg_32RegClass;
17514 break;
17515 case 64:
17516 RC = &AMDGPU::SGPR_64RegClass;
17517 break;
17518 default:
17519 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17520 if (!RC)
17521 return std::pair(0U, nullptr);
17522 break;
17523 }
17524 break;
17525 case 'v':
17526 switch (BitWidth) {
17527 case 16:
17528 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17529 : &AMDGPU::VGPR_32_Lo256RegClass;
17530 break;
17531 default:
17532 RC = Subtarget->has1024AddressableVGPRs()
17533 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17534 : TRI->getVGPRClassForBitWidth(BitWidth);
17535 if (!RC)
17536 return std::pair(0U, nullptr);
17537 break;
17538 }
17539 break;
17540 case 'a':
17541 if (!Subtarget->hasMAIInsts())
17542 break;
17543 switch (BitWidth) {
17544 case 16:
17545 RC = &AMDGPU::AGPR_32RegClass;
17546 break;
17547 default:
17548 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17549 if (!RC)
17550 return std::pair(0U, nullptr);
17551 break;
17552 }
17553 break;
17554 }
17555 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17556 const unsigned BitWidth = VT.getSizeInBits();
17557 switch (BitWidth) {
17558 case 16:
17559 RC = &AMDGPU::AV_32RegClass;
17560 break;
17561 default:
17562 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17563 if (!RC)
17564 return std::pair(0U, nullptr);
17565 break;
17566 }
17567 }
17568
17569 // We actually support i128, i16 and f16 as inline parameters
17570 // even if they are not reported as legal
17571 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17572 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17573 return std::pair(0U, RC);
17574
17575 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17576 if (Kind != '\0') {
17577 if (Kind == 'v') {
17578 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17579 } else if (Kind == 's') {
17580 RC = &AMDGPU::SGPR_32RegClass;
17581 } else if (Kind == 'a') {
17582 RC = &AMDGPU::AGPR_32RegClass;
17583 }
17584
17585 if (RC) {
17586 if (NumRegs > 1) {
17587 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17588 return std::pair(0U, nullptr);
17589
17590 uint32_t Width = NumRegs * 32;
17591 // Prohibit constraints for register ranges with a width that does not
17592 // match the required type.
17593 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17594 return std::pair(0U, nullptr);
17595
17596 MCRegister Reg = RC->getRegister(Idx);
17597 if (SIRegisterInfo::isVGPRClass(RC))
17598 RC = TRI->getVGPRClassForBitWidth(Width);
17599 else if (SIRegisterInfo::isSGPRClass(RC))
17600 RC = TRI->getSGPRClassForBitWidth(Width);
17601 else if (SIRegisterInfo::isAGPRClass(RC))
17602 RC = TRI->getAGPRClassForBitWidth(Width);
17603 if (RC) {
17604 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17605 if (!Reg) {
17606 // The register class does not contain the requested register,
17607 // e.g., because it is an SGPR pair that would violate alignment
17608 // requirements.
17609 return std::pair(0U, nullptr);
17610 }
17611 return std::pair(Reg, RC);
17612 }
17613 }
17614
17615 // Check for lossy scalar/vector conversions.
17616 if (VT.isVector() && VT.getSizeInBits() != 32)
17617 return std::pair(0U, nullptr);
17618 if (Idx < RC->getNumRegs())
17619 return std::pair(RC->getRegister(Idx), RC);
17620 return std::pair(0U, nullptr);
17621 }
17622 }
17623
17624 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17625 if (Ret.first)
17626 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17627
17628 return Ret;
17629}
17630
17631static bool isImmConstraint(StringRef Constraint) {
17632 if (Constraint.size() == 1) {
17633 switch (Constraint[0]) {
17634 default:
17635 break;
17636 case 'I':
17637 case 'J':
17638 case 'A':
17639 case 'B':
17640 case 'C':
17641 return true;
17642 }
17643 } else if (Constraint == "DA" || Constraint == "DB") {
17644 return true;
17645 }
17646 return false;
17647}
17648
17649 SITargetLowering::ConstraintType
17650 SITargetLowering::getConstraintType(StringRef Constraint) const {
17651 if (Constraint.size() == 1) {
17652 switch (Constraint[0]) {
17653 default:
17654 break;
17655 case 's':
17656 case 'v':
17657 case 'a':
17658 return C_RegisterClass;
17659 }
17660 } else if (Constraint.size() == 2) {
17661 if (Constraint == "VA")
17662 return C_RegisterClass;
17663 }
17664 if (isImmConstraint(Constraint)) {
17665 return C_Other;
17666 }
17667 return TargetLowering::getConstraintType(Constraint);
17668}
17669
17670static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17672 Val = Val & maskTrailingOnes<uint64_t>(Size);
17673 }
17674 return Val;
17675}
17676
17677 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17678 StringRef Constraint,
17679 std::vector<SDValue> &Ops,
17680 SelectionDAG &DAG) const {
17681 if (isImmConstraint(Constraint)) {
17682 uint64_t Val;
17683 if (getAsmOperandConstVal(Op, Val) &&
17684 checkAsmConstraintVal(Op, Constraint, Val)) {
17685 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17686 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17687 }
17688 } else {
17689 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17690 }
17691}
17692
17693 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17694 unsigned Size = Op.getScalarValueSizeInBits();
17695 if (Size > 64)
17696 return false;
17697
17698 if (Size == 16 && !Subtarget->has16BitInsts())
17699 return false;
17700
17701 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17702 Val = C->getSExtValue();
17703 return true;
17704 }
17705 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17706 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17707 return true;
17708 }
17709 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17710 if (Size != 16 || Op.getNumOperands() != 2)
17711 return false;
17712 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17713 return false;
17714 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17715 Val = C->getSExtValue();
17716 return true;
17717 }
17718 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17719 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17720 return true;
17721 }
17722 }
17723
17724 return false;
17725}
17726
17727 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17728 uint64_t Val) const {
17729 if (Constraint.size() == 1) {
17730 switch (Constraint[0]) {
17731 case 'I':
17732 return AMDGPU::isInlinableIntLiteral(Val);
17733 case 'J':
17734 return isInt<16>(Val);
17735 case 'A':
17736 return checkAsmConstraintValA(Op, Val);
17737 case 'B':
17738 return isInt<32>(Val);
17739 case 'C':
17740 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17741 checkAsmConstraintValA(Op, Val, 32);
17742 default:
17743 break;
17744 }
17745 } else if (Constraint.size() == 2) {
17746 if (Constraint == "DA") {
17747 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17748 int64_t LoBits = static_cast<int32_t>(Val);
17749 return checkAsmConstraintValA(Op, HiBits, 32) &&
17750 checkAsmConstraintValA(Op, LoBits, 32);
17751 }
17752 if (Constraint == "DB") {
17753 return true;
17754 }
17755 }
17756 llvm_unreachable("Invalid asm constraint");
17757}
17758
17759 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17760 unsigned MaxSize) const {
17761 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17762 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17763 if (Size == 16) {
17764 MVT VT = Op.getSimpleValueType();
17765 switch (VT.SimpleTy) {
17766 default:
17767 return false;
17768 case MVT::i16:
17769 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17770 case MVT::f16:
17771 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17772 case MVT::bf16:
17773 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17774 case MVT::v2i16:
17775 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17776 case MVT::v2f16:
17777 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17778 case MVT::v2bf16:
17779 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17780 }
17781 }
17782 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17783 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17784 return true;
17785 return false;
17786}
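// Editorial examples (illustrative): per the checks above, a 32-bit operand is
// accepted when it is a hardware inline constant, e.g. 0xbf800000 (-1.0f) or,
// when the subtarget has the inv2pi inline immediate, 0x3e22f983
// (0.15915494f ~= 1/(2*pi)), while an arbitrary literal such as 0x12345678 is
// rejected.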
17787
17788static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17789 switch (UnalignedClassID) {
17790 case AMDGPU::VReg_64RegClassID:
17791 return AMDGPU::VReg_64_Align2RegClassID;
17792 case AMDGPU::VReg_96RegClassID:
17793 return AMDGPU::VReg_96_Align2RegClassID;
17794 case AMDGPU::VReg_128RegClassID:
17795 return AMDGPU::VReg_128_Align2RegClassID;
17796 case AMDGPU::VReg_160RegClassID:
17797 return AMDGPU::VReg_160_Align2RegClassID;
17798 case AMDGPU::VReg_192RegClassID:
17799 return AMDGPU::VReg_192_Align2RegClassID;
17800 case AMDGPU::VReg_224RegClassID:
17801 return AMDGPU::VReg_224_Align2RegClassID;
17802 case AMDGPU::VReg_256RegClassID:
17803 return AMDGPU::VReg_256_Align2RegClassID;
17804 case AMDGPU::VReg_288RegClassID:
17805 return AMDGPU::VReg_288_Align2RegClassID;
17806 case AMDGPU::VReg_320RegClassID:
17807 return AMDGPU::VReg_320_Align2RegClassID;
17808 case AMDGPU::VReg_352RegClassID:
17809 return AMDGPU::VReg_352_Align2RegClassID;
17810 case AMDGPU::VReg_384RegClassID:
17811 return AMDGPU::VReg_384_Align2RegClassID;
17812 case AMDGPU::VReg_512RegClassID:
17813 return AMDGPU::VReg_512_Align2RegClassID;
17814 case AMDGPU::VReg_1024RegClassID:
17815 return AMDGPU::VReg_1024_Align2RegClassID;
17816 case AMDGPU::AReg_64RegClassID:
17817 return AMDGPU::AReg_64_Align2RegClassID;
17818 case AMDGPU::AReg_96RegClassID:
17819 return AMDGPU::AReg_96_Align2RegClassID;
17820 case AMDGPU::AReg_128RegClassID:
17821 return AMDGPU::AReg_128_Align2RegClassID;
17822 case AMDGPU::AReg_160RegClassID:
17823 return AMDGPU::AReg_160_Align2RegClassID;
17824 case AMDGPU::AReg_192RegClassID:
17825 return AMDGPU::AReg_192_Align2RegClassID;
17826 case AMDGPU::AReg_256RegClassID:
17827 return AMDGPU::AReg_256_Align2RegClassID;
17828 case AMDGPU::AReg_512RegClassID:
17829 return AMDGPU::AReg_512_Align2RegClassID;
17830 case AMDGPU::AReg_1024RegClassID:
17831 return AMDGPU::AReg_1024_Align2RegClassID;
17832 default:
17833 return -1;
17834 }
17835}
17836
17837// Figure out which registers should be reserved for stack access. Only after
17838// the function is legalized do we know all of the non-spill stack objects or if
17839// calls are present.
17843 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17844 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17845 const SIInstrInfo *TII = ST.getInstrInfo();
17846
17847 if (Info->isEntryFunction()) {
17848 // Callable functions have fixed registers used for stack access.
17850 }
17851
17852 // TODO: Move this logic to getReservedRegs()
17853 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17854 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17855 Register SReg = ST.isWave32()
17856 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17857 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17858 &AMDGPU::SGPR_64RegClass);
17859 Info->setSGPRForEXECCopy(SReg);
17860
17861 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17862 Info->getStackPtrOffsetReg()));
17863 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17864 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17865
17866 // We need to worry about replacing the default register with itself in case
17867 // of MIR testcases missing the MFI.
17868 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17869 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17870
17871 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17872 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17873
17874 Info->limitOccupancy(MF);
17875
17876 if (ST.isWave32() && !MF.empty()) {
17877 for (auto &MBB : MF) {
17878 for (auto &MI : MBB) {
17879 TII->fixImplicitOperands(MI);
17880 }
17881 }
17882 }
17883
17885 // FIXME: This is a hack to fix up AGPR classes to use the properly aligned
17885 // classes if required. Ideally the register class constraints would differ
17886 // per-subtarget, but there's no easy way to achieve that right now. This is
17887 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17888 // from using them as the register class for legal types.
17889 if (ST.needsAlignedVGPRs()) {
17890 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17891 const Register Reg = Register::index2VirtReg(I);
17892 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17893 if (!RC)
17894 continue;
17895 int NewClassID = getAlignedAGPRClassID(RC->getID());
17896 if (NewClassID != -1)
17897 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17898 }
17899 }
17900
17902}
17903
17905 KnownBits &Known,
17906 const APInt &DemandedElts,
17907 const SelectionDAG &DAG,
17908 unsigned Depth) const {
17909 Known.resetAll();
17910 unsigned Opc = Op.getOpcode();
17911 switch (Opc) {
17913 unsigned IID = Op.getConstantOperandVal(0);
17914 switch (IID) {
17915 case Intrinsic::amdgcn_mbcnt_lo:
17916 case Intrinsic::amdgcn_mbcnt_hi: {
17917 const GCNSubtarget &ST =
17919 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17920 // most 31 + src1.
17921 Known.Zero.setBitsFrom(
17922 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17923 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17924 Known = KnownBits::add(Known, Known2);
17925 return;
17926 }
17927 }
17928 break;
17929 }
17930 }
17932 Op, Known, DemandedElts, DAG, Depth);
17933}
17934
17936 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17938
17939 // Set the high bits to zero based on the maximum allowed scratch size per
17940 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17941 // calculation won't overflow, so assume the sign bit is never set.
17942 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17943}
17944
17946 GISelValueTracking &VT, KnownBits &Known,
17947 unsigned Dim) {
17948 unsigned MaxValue =
17949 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17950 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17951}
17952
17954 KnownBits &Known, const APInt &DemandedElts,
17955 unsigned BFEWidth, bool SExt, unsigned Depth) {
17957 const MachineOperand &Src1 = MI.getOperand(2);
17958
17959 unsigned Src1Cst = 0;
17960 if (Src1.isImm()) {
17961 Src1Cst = Src1.getImm();
17962 } else if (Src1.isReg()) {
17963 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17964 if (!Cst)
17965 return;
17966 Src1Cst = Cst->Value.getZExtValue();
17967 } else {
17968 return;
17969 }
17970
17971 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17972 // Width is always [22:16].
17973 const unsigned Offset =
17974 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17975 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17976
17977 if (Width >= BFEWidth) // Ill-formed.
17978 return;
17979
17980 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17981 Depth + 1);
17982
17983 Known = Known.extractBits(Width, Offset);
17984
17985 if (SExt)
17986 Known = Known.sext(BFEWidth);
17987 else
17988 Known = Known.zext(BFEWidth);
17989}
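// Editorial example (illustrative): for S_BFE_U32 with src1 = 0x00080010 the
// offset is 16 and the width is 8, so the known bits of bits [23:16] of src0
// are extracted and then zero-extended back to 32 bits.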
17990
17992 GISelValueTracking &VT, Register R, KnownBits &Known,
17993 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17994 unsigned Depth) const {
17995 Known.resetAll();
17996 const MachineInstr *MI = MRI.getVRegDef(R);
17997 switch (MI->getOpcode()) {
17998 case AMDGPU::S_BFE_I32:
17999 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18000 /*SExt=*/true, Depth);
18001 case AMDGPU::S_BFE_U32:
18002 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18003 /*SExt=*/false, Depth);
18004 case AMDGPU::S_BFE_I64:
18005 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18006 /*SExt=*/true, Depth);
18007 case AMDGPU::S_BFE_U64:
18008 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18009 /*SExt=*/false, Depth);
18010 case AMDGPU::G_INTRINSIC:
18011 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18012 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18013 switch (IID) {
18014 case Intrinsic::amdgcn_workitem_id_x:
18015 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18016 break;
18017 case Intrinsic::amdgcn_workitem_id_y:
18018 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18019 break;
18020 case Intrinsic::amdgcn_workitem_id_z:
18021 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18022 break;
18023 case Intrinsic::amdgcn_mbcnt_lo:
18024 case Intrinsic::amdgcn_mbcnt_hi: {
18025 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18026 // most 31 + src1.
18027 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18028 ? getSubtarget()->getWavefrontSizeLog2()
18029 : 5);
18030 KnownBits Known2;
18031 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18032 Depth + 1);
18033 Known = KnownBits::add(Known, Known2);
18034 break;
18035 }
18036 case Intrinsic::amdgcn_groupstaticsize: {
18037 // We can report everything over the maximum size as 0. We can't report
18038 // based on the actual size because we don't know if it's accurate or not
18039 // at any given point.
18040 Known.Zero.setHighBits(
18041 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18042 break;
18043 }
18044 }
18045 break;
18046 }
18047 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18048 Known.Zero.setHighBits(24);
18049 break;
18050 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18051 Known.Zero.setHighBits(16);
18052 break;
18053 case AMDGPU::G_AMDGPU_SMED3:
18054 case AMDGPU::G_AMDGPU_UMED3: {
18055 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18056
18057 KnownBits Known2;
18058 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18059 if (Known2.isUnknown())
18060 break;
18061
18062 KnownBits Known1;
18063 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18064 if (Known1.isUnknown())
18065 break;
18066
18067 KnownBits Known0;
18068 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18069 if (Known0.isUnknown())
18070 break;
18071
18072 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18073 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18074 Known.One = Known0.One & Known1.One & Known2.One;
18075 break;
18076 }
18077 }
18078}
18079
18082 unsigned Depth) const {
18083 const MachineInstr *MI = MRI.getVRegDef(R);
18084 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18085 // FIXME: Can this move to generic code? What about the case where the call
18086 // site specifies a lower alignment?
18087 Intrinsic::ID IID = GI->getIntrinsicID();
18089 AttributeList Attrs =
18090 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18091 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18092 return *RetAlign;
18093 }
18094 return Align(1);
18095}
18096
18099 const Align CacheLineAlign = Align(64);
18100
18101 // Pre-GFX10 targets did not benefit from loop alignment
18102 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18103 getSubtarget()->hasInstFwdPrefetchBug())
18104 return PrefAlign;
18105
18106 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
18107 // By default the prefetcher keeps one cache line behind and reads two ahead.
18108 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18109 // behind and one ahead.
18110 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18111 // If the loop fits in 64 bytes it always spans no more than two cache lines
18112 // and does not need an alignment.
18113 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch;
18114 // else, if the loop is at most 192 bytes, we need two lines behind.
18115
18117 const MachineBasicBlock *Header = ML->getHeader();
18118 if (Header->getAlignment() != PrefAlign)
18119 return Header->getAlignment(); // Already processed.
18120
18121 unsigned LoopSize = 0;
18122 for (const MachineBasicBlock *MBB : ML->blocks()) {
18123     // If an inner loop block is aligned, assume on average half of the alignment
18124     // size to be added as nops.
18125 if (MBB != Header)
18126 LoopSize += MBB->getAlignment().value() / 2;
18127
18128 for (const MachineInstr &MI : *MBB) {
18129 LoopSize += TII->getInstSizeInBytes(MI);
18130 if (LoopSize > 192)
18131 return PrefAlign;
18132 }
18133 }
18134
18135 if (LoopSize <= 64)
18136 return PrefAlign;
18137
18138 if (LoopSize <= 128)
18139 return CacheLineAlign;
18140
18141 // If any of the parent loops is surrounded by prefetch instructions, do not
18142 // insert new ones for the inner loop, which would reset the parent's settings.
18143 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18144 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18145 auto I = Exit->getFirstNonDebugInstr();
18146 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18147 return CacheLineAlign;
18148 }
18149 }
18150
18151 MachineBasicBlock *Pre = ML->getLoopPreheader();
18152 MachineBasicBlock *Exit = ML->getExitBlock();
18153
18154 if (Pre && Exit) {
18155 auto PreTerm = Pre->getFirstTerminator();
18156 if (PreTerm == Pre->begin() ||
18157 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18158 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18159 .addImm(1); // prefetch 2 lines behind PC
18160
18161 auto ExitHead = Exit->getFirstNonDebugInstr();
18162 if (ExitHead == Exit->end() ||
18163 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18164 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18165 .addImm(2); // prefetch 1 line behind PC
18166 }
18167
18168 return CacheLineAlign;
18169}
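// Editorial summary of the thresholds above (not part of the original source):
//   loop size <= 64 bytes   -> keep the preferred alignment, no prefetch change
//   loop size <= 128 bytes  -> align the header to the 64-byte cache line
//   loop size <= 192 bytes  -> align to 64 bytes and, when a preheader and exit
//                              block exist, bracket the loop with S_INST_PREFETCH
//   larger loops            -> keep the preferred alignment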
18170
18172static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18173 assert(N->getOpcode() == ISD::CopyFromReg);
18174 do {
18175 // Follow the chain until we find an INLINEASM node.
18176 N = N->getOperand(0).getNode();
18177 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18178 return true;
18179 } while (N->getOpcode() == ISD::CopyFromReg);
18180 return false;
18181}
18182
18185 UniformityInfo *UA) const {
18186 switch (N->getOpcode()) {
18187 case ISD::CopyFromReg: {
18188 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18189 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18190 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18191 Register Reg = R->getReg();
18192
18193 // FIXME: Why does this need to consider isLiveIn?
18194 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18195 return !TRI->isSGPRReg(MRI, Reg);
18196
18197 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18198 return UA->isDivergent(V);
18199
18201 return !TRI->isSGPRReg(MRI, Reg);
18202 }
18203 case ISD::LOAD: {
18204 const LoadSDNode *L = cast<LoadSDNode>(N);
18205 unsigned AS = L->getAddressSpace();
18206 // A flat load may access private memory.
18208 }
18209 case ISD::CALLSEQ_END:
18210 return true;
18212 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18214 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18233 // Target-specific read-modify-write atomics are sources of divergence.
18234 return true;
18235 default:
18236 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18237 // Generic read-modify-write atomics are sources of divergence.
18238 return A->readMem() && A->writeMem();
18239 }
18240 return false;
18241 }
18242}
18243
18245 EVT VT) const {
18246 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18247 case MVT::f32:
18249 case MVT::f64:
18250 case MVT::f16:
18252 default:
18253 return false;
18254 }
18255}
18256
18258 LLT Ty, const MachineFunction &MF) const {
18259 switch (Ty.getScalarSizeInBits()) {
18260 case 32:
18261 return !denormalModeIsFlushAllF32(MF);
18262 case 64:
18263 case 16:
18264 return !denormalModeIsFlushAllF64F16(MF);
18265 default:
18266 return false;
18267 }
18268}
18269
18271 const APInt &DemandedElts,
18272 const SelectionDAG &DAG,
18273 bool SNaN,
18274 unsigned Depth) const {
18275 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18276 const MachineFunction &MF = DAG.getMachineFunction();
18278
18279 if (Info->getMode().DX10Clamp)
18280 return true; // Clamped to 0.
18281 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18282 }
18283
18285 DAG, SNaN, Depth);
18286}
18287
18288// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18289 // and do not support FP32 denormals; only v2f16/f64 denormals are supported.
18291 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18292 return true;
18293
18295 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18296 if (DenormMode == DenormalMode::getPreserveSign())
18297 return true;
18298
18299 // TODO: Remove this.
18300 return RMW->getFunction()
18301 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18302 .getValueAsBool();
18303}
18304
18306 LLVMContext &Ctx = RMW->getContext();
18307 StringRef MemScope =
18308 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18309
18310 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18311 << "Hardware instruction generated for atomic "
18312 << RMW->getOperationName(RMW->getOperation())
18313 << " operation at memory scope " << MemScope;
18314}
18315
18316static bool isV2F16OrV2BF16(Type *Ty) {
18317 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18318 Type *EltTy = VT->getElementType();
18319 return VT->getNumElements() == 2 &&
18320 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18321 }
18322
18323 return false;
18324}
18325
18326static bool isV2F16(Type *Ty) {
18328 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18329}
18330
18331static bool isV2BF16(Type *Ty) {
18333 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18334}
18335
18336/// \return true if atomicrmw integer ops work for the type.
18337static bool isAtomicRMWLegalIntTy(Type *Ty) {
18338 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18339 unsigned BW = IT->getBitWidth();
18340 return BW == 32 || BW == 64;
18341 }
18342
18343 return false;
18344}
18345
18346/// \return true if this atomicrmw xchg type can be selected.
18347static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18348 Type *Ty = RMW->getType();
18349 if (isAtomicRMWLegalIntTy(Ty))
18350 return true;
18351
18352 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18353 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18354 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18355 return BW == 32 || BW == 64;
18356 }
18357
18358 if (Ty->isFloatTy() || Ty->isDoubleTy())
18359 return true;
18360
18362 return VT->getNumElements() == 2 &&
18363 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18364 }
18365
18366 return false;
18367}
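// Editorial examples (illustrative IR, not from the original source):
//   atomicrmw xchg ptr %p, i32 %v seq_cst          ; accepted (legal integer type)
//   atomicrmw xchg ptr %p, double %v seq_cst       ; accepted
//   atomicrmw xchg ptr %p, <2 x half> %v seq_cst   ; accepted (2 x 16-bit elements)
//   atomicrmw xchg ptr %p, i8 %v seq_cst           ; not accepted here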
18368
18369/// \returns true if it's valid to emit a native instruction for \p RMW, based
18370/// on the properties of the target memory.
18371static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18372 const AtomicRMWInst *RMW,
18373 bool HasSystemScope) {
18374 // The remote/fine-grained access logic is different from the integer
18375 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18376 // fine-grained access does not work, even for a device local allocation.
18377 //
18378 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18379 // allocations work.
18380 if (HasSystemScope) {
18382 RMW->hasMetadata("amdgpu.no.remote.memory"))
18383 return true;
18384 if (Subtarget.hasEmulatedSystemScopeAtomics())
18385 return true;
18387 return true;
18388
18389 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18390}
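// Editorial example (illustrative IR): front-ends opt in through metadata on
// the atomic instruction, e.g. something like
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//            monotonic, align 4, !amdgpu.no.fine.grained.memory !0
// which is what the hasMetadata() queries above test for.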
18391
18392/// \return Action to perform on AtomicRMWInsts for integer operations.
18399
18400/// Return if a flat address space atomicrmw can access private memory.
18402 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18403 return !MD ||
18405}
18406
18414
18417 unsigned AS = RMW->getPointerAddressSpace();
18418 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18420
18421 // 64-bit flat atomics that dynamically reside in private memory will silently
18422 // be dropped.
18423 //
18424 // Note that we will emit a new copy of the original atomic in the expansion,
18425 // which will be incrementally relegalized.
18426 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18427 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18428 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18431
18432 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18434 ORE.emit([=]() {
18435 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18436 });
18437 return Kind;
18438 };
18439
18440 auto SSID = RMW->getSyncScopeID();
18441 bool HasSystemScope =
18442 SSID == SyncScope::System ||
18443 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18444
18445 auto Op = RMW->getOperation();
18446 switch (Op) {
18448 // PCIe supports add and xchg for system atomics.
18449 return isAtomicRMWLegalXChgTy(RMW)
18452 case AtomicRMWInst::Add:
18453 // PCIe supports add and xchg for system atomics.
18455 case AtomicRMWInst::Sub:
18456 case AtomicRMWInst::And:
18457 case AtomicRMWInst::Or:
18458 case AtomicRMWInst::Xor:
18459 case AtomicRMWInst::Max:
18460 case AtomicRMWInst::Min:
18467 if (Subtarget->hasEmulatedSystemScopeAtomics())
18469
18470 // On most subtargets, for atomicrmw operations other than add/xchg,
18471 // whether or not the instructions will behave correctly depends on where
18472 // the address physically resides and what interconnect is used in the
18473     // system configuration. On some targets the instruction will nop,
18474 // and in others synchronization will only occur at degraded device scope.
18475 //
18476 // If the allocation is known local to the device, the instructions should
18477 // work correctly.
18478 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18480
18481 // If fine-grained remote memory works at device scope, we don't need to
18482 // do anything.
18483 if (!HasSystemScope &&
18484 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18486
18487 // If we are targeting a remote allocated address, it depends what kind of
18488 // allocation the address belongs to.
18489 //
18490 // If the allocation is fine-grained (in host memory, or in PCIe peer
18491 // device memory), the operation will fail depending on the target.
18492 //
18493 // Note fine-grained host memory access does work on APUs or if XGMI is
18494 // used, but we do not know if we are targeting an APU or the system
18495 // configuration from the ISA version/target-cpu.
18496 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18498
18501 // Atomic sub/or/xor do not work over PCI express, but atomic add
18502 // does. InstCombine transforms these with 0 to or, so undo that.
18503 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18504 ConstVal && ConstVal->isNullValue())
18506 }
18507
18508 // If the allocation could be in remote, fine-grained memory, the rmw
18509 // instructions may fail. cmpxchg should work, so emit that. On some
18510 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18511 // even work, so you're out of luck anyway.
18512
18513 // In summary:
18514 //
18515 // Cases that may fail:
18516 // - fine-grained pinned host memory
18517 // - fine-grained migratable host memory
18518 // - fine-grained PCIe peer device
18519 //
18520 // Cases that should work, but may be treated overly conservatively.
18521 // - fine-grained host memory on an APU
18522 // - fine-grained XGMI peer device
18524 }
18525
18527 }
18528 case AtomicRMWInst::FAdd: {
18529 Type *Ty = RMW->getType();
18530
18531 // TODO: Handle REGION_ADDRESS
18532 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18533 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18534 // is fixed to round-to-nearest-even.
18535 //
18536 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18537 // round-to-nearest-even.
18538 //
18539 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18540 // suggests it is OK if the floating-point mode may not match the calling
18541 // thread.
18542 if (Ty->isFloatTy()) {
18543 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18545 }
18546
18547 if (Ty->isDoubleTy()) {
18548 // Ignores denormal mode, but we don't consider flushing mandatory.
18549 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18551 }
18552
18553 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18555
18557 }
18558
18559 // LDS atomics respect the denormal mode from the mode register.
18560 //
18561 // Traditionally f32 global/buffer memory atomics would unconditionally
18562 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18563 // flush.
18564 //
18565 // On targets with flat atomic fadd, denormals would flush depending on
18566 // whether the target address resides in LDS or global memory. We consider
18567 // this flat-maybe-flush as will-flush.
18568 if (Ty->isFloatTy() &&
18569 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18572
18573 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18574 // safe. The message phrasing also should be better.
18575 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18576 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18577 // gfx942, gfx12
18578 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18579 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18580 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18581 // gfx90a, gfx942, gfx12
18582 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18583 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18584
18585 // gfx942, gfx12
18586 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18587 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18588 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18589 // gfx90a, gfx942, gfx12
18590 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18591 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18592
18593 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18594 // buffer. gfx12 does have the buffer version.
18595 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18596 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18597 }
18598
18599 // global and flat atomic fadd f64: gfx90a, gfx942.
18600 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18601 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18602
18603 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18604 if (Ty->isFloatTy()) {
18605 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18606 // gfx11+.
18607 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18608 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18609 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18610 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18611 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18612 } else {
18613 // gfx908
18614 if (RMW->use_empty() &&
18615 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18616 isV2F16(Ty))
18617 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18618 }
18619 }
18620
18621 // flat atomic fadd f32: gfx942, gfx11+.
18622 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18623 if (Subtarget->hasFlatAtomicFaddF32Inst())
18624 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18625
18626       // If it is in the flat address space and the type is float, we will try to
18627       // expand it if the target supports global and LDS atomic fadd. The
18628       // reason we need that is that, in the expansion, we emit a check of the
18629       // address space: if the address is in the global address space, we emit the
18630       // global atomic fadd; if it is in the shared address space, we emit the LDS
18631       // atomic fadd.
18632 if (Subtarget->hasLDSFPAtomicAddF32()) {
18633 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18635 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18637 }
18638 }
18639 }
18640
18642 }
18644 case AtomicRMWInst::FMax: {
18645 Type *Ty = RMW->getType();
18646
18647 // LDS float and double fmin/fmax were always supported.
18648 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18649 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18651 }
18652
18653 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18654 // For flat and global cases:
18655 // float, double in gfx7. Manual claims denormal support.
18656 // Removed in gfx8.
18657 // float, double restored in gfx10.
18658 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18659 //
18660 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18661 // no f32.
18662 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18663 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18664 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18665 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18666 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18667 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18669 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18670 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18671 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18672 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18673 }
18674 }
18675
18677 }
18680 default:
18682 }
18683
18684 llvm_unreachable("covered atomicrmw op switch");
18685}
18686
18693
18700
18703 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18704 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18706
18707 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18709
18710 const DataLayout &DL = CmpX->getDataLayout();
18711
18712 Type *ValTy = CmpX->getNewValOperand()->getType();
18713
18714 // If a 64-bit flat atomic may alias private, we need to avoid using the
18715 // atomic in the private case.
18716 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18718}
18719
18720const TargetRegisterClass *
18721SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18723 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18724 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18725 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18726 : &AMDGPU::SReg_32RegClass;
18727 if (!TRI->isSGPRClass(RC) && !isDivergent)
18728 return TRI->getEquivalentSGPRClass(RC);
18729 if (TRI->isSGPRClass(RC) && isDivergent)
18730 return TRI->getEquivalentVGPRClass(RC);
18731
18732 return RC;
18733}
18734
18735// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18736// uniform values (as produced by the mask results of control flow intrinsics)
18737// used outside of divergent blocks. The phi users need to also be treated as
18738// always uniform.
18739//
18740// FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
18741static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18742 unsigned WaveSize) {
18743 // FIXME: We assume we never cast the mask results of a control flow
18744 // intrinsic.
18745   // Early exit if the type won't be consistent, as a compile-time hack.
18746 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18747 if (!IT || IT->getBitWidth() != WaveSize)
18748 return false;
18749
18750 if (!isa<Instruction>(V))
18751 return false;
18752 if (!Visited.insert(V).second)
18753 return false;
18754 bool Result = false;
18755 for (const auto *U : V->users()) {
18757 if (V == U->getOperand(1)) {
18758 switch (Intrinsic->getIntrinsicID()) {
18759 default:
18760 Result = false;
18761 break;
18762 case Intrinsic::amdgcn_if_break:
18763 case Intrinsic::amdgcn_if:
18764 case Intrinsic::amdgcn_else:
18765 Result = true;
18766 break;
18767 }
18768 }
18769 if (V == U->getOperand(0)) {
18770 switch (Intrinsic->getIntrinsicID()) {
18771 default:
18772 Result = false;
18773 break;
18774 case Intrinsic::amdgcn_end_cf:
18775 case Intrinsic::amdgcn_loop:
18776 Result = true;
18777 break;
18778 }
18779 }
18780 } else {
18781 Result = hasCFUser(U, Visited, WaveSize);
18782 }
18783 if (Result)
18784 break;
18785 }
18786 return Result;
18787}
18788
18790 const Value *V) const {
18791 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18792 if (CI->isInlineAsm()) {
18793 // FIXME: This cannot give a correct answer. This should only trigger in
18794 // the case where inline asm returns mixed SGPR and VGPR results, used
18795 // outside the defining block. We don't have a specific result to
18796 // consider, so this assumes if any value is SGPR, the overall register
18797 // also needs to be SGPR.
18798 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18800 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18801 for (auto &TC : TargetConstraints) {
18802 if (TC.Type == InlineAsm::isOutput) {
18804 const TargetRegisterClass *RC =
18805 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18806 TC.ConstraintVT)
18807 .second;
18808 if (RC && SIRI->isSGPRClass(RC))
18809 return true;
18810 }
18811 }
18812 }
18813 }
18815 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18816}
18817
18819 for (SDUse &Use : N->uses()) {
18821 if (getBasePtrIndex(M) == Use.getOperandNo())
18822 return true;
18823 }
18824 }
18825 return false;
18826}
18827
18829 SDValue N1) const {
18830 if (!N0.hasOneUse())
18831 return false;
18832   // Take the opportunity to keep N0 uniform
18833 if (N0->isDivergent() || !N1->isDivergent())
18834 return true;
18835 // Check if we have a good chance to form the memory access pattern with the
18836 // base and offset
18837 return (DAG.isBaseWithConstantOffset(N0) &&
18839}
18840
18842 Register N0, Register N1) const {
18843 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18844}
18845
18848 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18850 if (I.getMetadata("amdgpu.noclobber"))
18851 Flags |= MONoClobber;
18852 if (I.getMetadata("amdgpu.last.use"))
18853 Flags |= MOLastUse;
18854 return Flags;
18855}
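// Editorial example (illustrative): a load annotated by
// AMDGPUAnnotateUniformValues, e.g.
//   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
// ends up with the target-specific MONoClobber flag on its MachineMemOperand.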
18856
18858 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18859 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18860 if (User->getOpcode() != ISD::CopyToReg)
18861 return false;
18862 if (!Def->isMachineOpcode())
18863 return false;
18865 if (!MDef)
18866 return false;
18867
18868 unsigned ResNo = User->getOperand(Op).getResNo();
18869 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18870 return false;
18871 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18872 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18873 PhysReg = AMDGPU::SCC;
18874 const TargetRegisterClass *RC =
18875 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18876 Cost = RC->getCopyCost();
18877 return true;
18878 }
18879 return false;
18880}
18881
18883 Instruction *AI) const {
18884 // Given: atomicrmw fadd ptr %addr, float %val ordering
18885 //
18886 // With this expansion we produce the following code:
18887 // [...]
18888 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18889 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18890 //
18891 // atomicrmw.shared:
18892 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18893 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18894 // float %val ordering
18895 // br label %atomicrmw.phi
18896 //
18897 // atomicrmw.check.private:
18898   //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
18899 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18900 //
18901 // atomicrmw.private:
18902 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18903 // %loaded.private = load float, ptr addrspace(5) %cast.private
18904 // %val.new = fadd float %loaded.private, %val
18905 // store float %val.new, ptr addrspace(5) %cast.private
18906 // br label %atomicrmw.phi
18907 //
18908 // atomicrmw.global:
18909 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18910 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18911 // float %val ordering
18912 // br label %atomicrmw.phi
18913 //
18914 // atomicrmw.phi:
18915 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18916 // [ %loaded.private, %atomicrmw.private ],
18917 // [ %loaded.global, %atomicrmw.global ]
18918 // br label %atomicrmw.end
18919 //
18920 // atomicrmw.end:
18921 // [...]
18922 //
18923 //
18924 // For 64-bit atomics which may reside in private memory, we perform a simpler
18925 // version that only inserts the private check, and uses the flat operation.
18926
18927 IRBuilder<> Builder(AI);
18928 LLVMContext &Ctx = Builder.getContext();
18929
18930 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18931 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18933 Value *Addr = AI->getOperand(PtrOpIdx);
18934
18935 /// TODO: Only need to check private, then emit flat-known-not private (no
18936 /// need for shared block, or cast to global).
18938
18939 Align Alignment;
18940 if (RMW)
18941 Alignment = RMW->getAlign();
18942 else if (CX)
18943 Alignment = CX->getAlign();
18944 else
18945 llvm_unreachable("unhandled atomic operation");
18946
18947 // FullFlatEmulation is true if we need to issue the private, shared, and
18948 // global cases.
18949 //
18950 // If this is false, we are only dealing with the flat-targeting-private case,
18951 // where we only insert a check for private and still use the flat instruction
18952 // for global and shared.
18953
18954 bool FullFlatEmulation =
18955 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18956 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18957 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18958 RMW->getType()->isDoubleTy()));
18959
18960 // If the return value isn't used, do not introduce a false use in the phi.
18961 bool ReturnValueIsUsed = !AI->use_empty();
18962
18963 BasicBlock *BB = Builder.GetInsertBlock();
18964 Function *F = BB->getParent();
18965 BasicBlock *ExitBB =
18966 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18967 BasicBlock *SharedBB = nullptr;
18968
18969 BasicBlock *CheckPrivateBB = BB;
18970 if (FullFlatEmulation) {
18971 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18972 CheckPrivateBB =
18973 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18974 }
18975
18976 BasicBlock *PrivateBB =
18977 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18978 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18979 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18980
18981 std::prev(BB->end())->eraseFromParent();
18982 Builder.SetInsertPoint(BB);
18983
18984 Value *LoadedShared = nullptr;
18985 if (FullFlatEmulation) {
18986 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18987 {Addr}, nullptr, "is.shared");
18988 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18989 Builder.SetInsertPoint(SharedBB);
18990 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18992
18993 Instruction *Clone = AI->clone();
18994 Clone->insertInto(SharedBB, SharedBB->end());
18995 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18996 LoadedShared = Clone;
18997
18998 Builder.CreateBr(PhiBB);
18999 Builder.SetInsertPoint(CheckPrivateBB);
19000 }
19001
19002 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19003 {Addr}, nullptr, "is.private");
19004 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19005
19006 Builder.SetInsertPoint(PrivateBB);
19007
19008 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19010
19011 Value *LoadedPrivate;
19012 if (RMW) {
19013 LoadedPrivate = Builder.CreateAlignedLoad(
19014 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19015
19016 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19017 LoadedPrivate, RMW->getValOperand());
19018
19019 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19020 } else {
19021 auto [ResultLoad, Equal] =
19022 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19023 CX->getNewValOperand(), CX->getAlign());
19024
19025 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19026 ResultLoad, 0);
19027 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19028 }
19029
19030 Builder.CreateBr(PhiBB);
19031
19032 Builder.SetInsertPoint(GlobalBB);
19033
19034 // Continue using a flat instruction if we only emitted the check for private.
19035 Instruction *LoadedGlobal = AI;
19036 if (FullFlatEmulation) {
19037 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19039 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19040 }
19041
19042 AI->removeFromParent();
19043 AI->insertInto(GlobalBB, GlobalBB->end());
19044
19045 // The new atomicrmw may go through another round of legalization later.
19046 if (!FullFlatEmulation) {
19047 // We inserted the runtime check already, make sure we do not try to
19048 // re-expand this.
19049 // TODO: Should union with any existing metadata.
19050 MDBuilder MDB(F->getContext());
19051 MDNode *RangeNotPrivate =
19054 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19055 RangeNotPrivate);
19056 }
19057
19058 Builder.CreateBr(PhiBB);
19059
19060 Builder.SetInsertPoint(PhiBB);
19061
19062 if (ReturnValueIsUsed) {
19063 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19064 AI->replaceAllUsesWith(Loaded);
19065 if (FullFlatEmulation)
19066 Loaded->addIncoming(LoadedShared, SharedBB);
19067 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19068 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19069 Loaded->takeName(AI);
19070 }
19071
19072 Builder.CreateBr(ExitBB);
19073}
19074
19076 unsigned PtrOpIdx) {
19077 Value *PtrOp = I->getOperand(PtrOpIdx);
19080
19081 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19082 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19083 I->getIterator());
19084 I->setOperand(PtrOpIdx, ASCast);
19085}
19086
19089
19092
19095 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19096 ConstVal && ConstVal->isNullValue()) {
19097 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19099
19100 // We may still need the private-alias-flat handling below.
19101
19102 // TODO: Skip this for cases where we cannot access remote memory.
19103 }
19104 }
19105
19106 // The non-flat expansions should only perform the de-canonicalization of
19107 // identity values.
19109 return;
19110
19112}
19113
19120
19124
19126 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19127}
19128
19130 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19131 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19132
19134 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19135}
19136
19137LoadInst *
19139 IRBuilder<> Builder(AI);
19140 auto Order = AI->getOrdering();
19141
19142   // The optimization removes the store aspect of the atomicrmw. Therefore, the
19143   // cache must be flushed if the atomic ordering had release semantics. This is
19144   // not necessarily a fence; a release fence just happens to perform that flush.
19145   // Avoid replacing an atomicrmw that has release semantics.
19146 if (isReleaseOrStronger(Order))
19147 return nullptr;
19148
19149 LoadInst *LI = Builder.CreateAlignedLoad(
19150 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19151 LI->setAtomic(Order, AI->getSyncScopeID());
19152 LI->copyMetadata(*AI);
19153 LI->takeName(AI);
19154 AI->replaceAllUsesWith(LI);
19155 AI->eraseFromParent();
19156 return LI;
19157}
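// Editorial example (illustrative IR): an idempotent RMW such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// is rewritten into an atomic load with the same ordering, while release (or
// stronger) orderings keep the original atomicrmw.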
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1254
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1251
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
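A minimal illustration (an assumption-laden sketch, not the AMDGPU implementation) of how such an override reports results through the KnownBits argument: bits proven to be zero or one are recorded in Known.Zero and Known.One respectively.

#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper: the caller has established that the producing node
// zero-extends a 16-bit quantity into a 32-bit result, so the upper 16 bits
// of the result are provably zero.
static void markUpper16BitsZero(KnownBits &Known) {
  assert(Known.getBitWidth() == 32 && "expects a 32-bit query");
  Known.Zero.setHighBits(16); // top 16 bits known zero; the rest stay unknown
}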
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known never to be any NaN; if SNaN is true, returns true if it is known never to be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
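The common splitting pattern behind a helper like this is to bitcast the 64-bit value to v2i32 and extract both lanes. The sketch below is illustrative only, assuming Op already has a 64-bit type and a little-endian target (so element 0 is the low half); it is not a copy of the AMDGPU implementation.

#include "llvm/CodeGen/SelectionDAG.h"
#include <utility>
using namespace llvm;

static std::pair<SDValue, SDValue>
splitTo32BitHalves(SelectionDAG &DAG, const SDLoc &SL, SDValue Op) {
  SDValue Vec = DAG.getBitcast(MVT::v2i32, Op); // reinterpret as <2 x i32>
  SDValue Lo = DAG.getExtractVectorElt(SL, MVT::i32, Vec, 0);
  SDValue Hi = DAG.getExtractVectorElt(SL, MVT::i32, Vec, 1);
  return {Lo, Hi};
}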
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
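The APInt helpers listed above are easiest to read with a concrete example; the following self-contained snippet uses arbitrary values purely to demonstrate the calls.

#include "llvm/ADT/APInt.h"
#include <cassert>
using llvm::APInt;

static void apintHelpersDemo() {
  APInt Mask = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/8); // 0xFF000000
  assert(Mask.countr_zero() == 24);            // low 24 bits are clear

  APInt V(32, 0);
  V.setBitsFrom(16);                           // sets bits [16, 32)
  assert(V == APInt::getBitsSet(32, 16, 32));  // same block of bits

  assert(APInt(32, 0x80000000).isSignMask());  // only the sign bit is set
}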
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
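The BinOp table above is the IR-level vocabulary the lowering code inspects. As a brief, hedged illustration (not code from this file), an atomicrmw fadd can be created with IRBuilder as follows, where Ptr and Val are assumed to be an existing float pointer and float value:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *emitAtomicFAdd(IRBuilder<> &Builder, Value *Ptr, Value *Val) {
  // Per the table above: atomically performs *Ptr = *Ptr + Val and yields
  // the previous value of *Ptr.
  return Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, Align(4),
                                 AtomicOrdering::Monotonic);
}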
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
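A schematic sketch of how these CCState pieces typically fit together in a formal-argument lowering hook; it assumes an already-chosen CCAssignFn and the usual ArgLocs vector, and is not the SI-specific code.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

static void analyzeIncomingArgs(MachineFunction &MF, CallingConv::ID CC,
                                bool IsVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
                                CCAssignFn *AssignFn) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);

  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc()) {
      // The argument arrives in VA.getLocReg(); a real lowering would add a
      // live-in copy for it here.
    } else {
      // The argument arrives on the stack at VA.getLocMemOffset().
    }
  }
}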
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
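For context, a short illustration (values chosen arbitrarily) of attaching the resulting !range metadata to a load that is assumed to produce an i32:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

static void annotateLoadRange(LoadInst &LI) {
  MDBuilder MDB(LI.getContext());
  // The interval is half-open: values are in [0, 1024).
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI.setMetadata(LLVMContext::MD_range, Range);
}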
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1441
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
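A simplified, hedged sketch of the shape such an override usually takes; this is not the SI policy, which also depends on the address space, denormal mode, and subtarget features.

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static TargetLowering::AtomicExpansionKind
pickRMWExpansion(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    return TargetLowering::AtomicExpansionKind::None; // select natively
  default:
    // Let AtomicExpand build a compare-and-swap loop for everything else.
    return TargetLowering::AtomicExpansionKind::CmpXChg;
  }
}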
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known never to be any NaN; if SNaN is true, returns true if it is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
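The cmp/select expansion referred to here can be pictured with the SelectionDAG helpers listed on this page. The sketch below is illustrative only; it assumes a fixed-width vector Vec, an index Idx, and an element type EltVT, and it uses MVT::i1 for the compare result where a real lowering would query getSetCCResultType.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue expandDynExtract(SelectionDAG &DAG, const SDLoc &DL, EVT EltVT,
                                SDValue Vec, SDValue Idx, unsigned NumElts) {
  EVT IdxVT = Idx.getValueType();
  // Start with element 0, then fold in every other lane behind a compare.
  SDValue Result = DAG.getExtractVectorElt(DL, EltVT, Vec, 0);
  for (unsigned I = 1; I != NumElts; ++I) {
    SDValue Elt = DAG.getExtractVectorElt(DL, EltVT, Vec, I);
    SDValue IsI = DAG.getSetCC(DL, MVT::i1, Idx,
                               DAG.getConstant(I, DL, IdxVT), ISD::SETEQ);
    Result = DAG.getSelect(DL, EltVT, IsI, Elt, Result);
  }
  return Result;
}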
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
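A minimal illustration of that point: a plain i32 load produces both the loaded value (result 0) and a new chain (result 1). Chain and Ptr are assumed to be existing SDValues, and the pointer info is deliberately left unknown.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildSimpleLoad(SelectionDAG &DAG, const SDLoc &DL,
                               SDValue Chain, SDValue Ptr) {
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
  // Load.getValue(0) is the i32 result; Load.getValue(1) is the output chain.
  return Load;
}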
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
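As a quick illustration (a sketch, with a hypothetical helper name), getNOT is shorthand for XOR-ing with an all-ones constant of the same type:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildNot(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  EVT VT = V.getValueType();
  // Equivalent to DAG.getNode(ISD::XOR, DL, VT, V, DAG.getAllOnesConstant(DL, VT)).
  return DAG.getNOT(DL, V, VT);
}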
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which must always have a glue result (to ensure it's not CSE'd).

SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
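A small usage sketch, again assuming a SelectionDAG &DAG in scope; addFixedOffset is a hypothetical helper that adds a constant byte offset to a pointer value.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

static SDValue addFixedOffset(SelectionDAG &DAG, const SDLoc &DL,
                              SDValue BasePtr, uint64_t Bytes) {
  // Builds the pointer arithmetic for BasePtr + Bytes, with the offset
  // materialized as a constant of the pointer's value type.
  return DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(Bytes), DL);
}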
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
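For illustration only (buildSMax is a hypothetical name): a signed maximum can be phrased as a single SELECT_CC instead of building the SETCC and SELECT nodes separately.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                         SDValue B) {
  // selectcc A, B, A, B, setgt  ==>  A > B ? A : B
  return DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);
}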
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
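A short sketch (splatConstant is a hypothetical name): splat an integer constant across every lane of a vector type.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue splatConstant(SelectionDAG &DAG, const SDLoc &DL, EVT VecVT,
                             uint64_t C) {
  SDValue Elt = DAG.getConstant(C, DL, VecVT.getScalarType());
  // Produces a BUILD_VECTOR whose operands are all Elt.
  return DAG.getSplatBuildVector(VecVT, DL, Elt);
}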
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
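Sketch of the ext-or-trunc family (matchWidth is a hypothetical name): the helper picks ZERO_EXTEND, TRUNCATE, or a no-op so the result has the requested type; getSExtOrTrunc and getAnyExtOrTrunc are the signed and don't-care counterparts.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue matchWidth(SelectionDAG &DAG, const SDLoc &DL, SDValue V,
                          EVT DstVT) {
  // Zero-extends or truncates V as needed; returns V unchanged if the
  // types already match.
  return DAG.getZExtOrTrunc(V, DL, DstVT);
}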
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
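A sketch of splitting a 64-bit scalar into its 32-bit halves (splitI64 is a hypothetical helper); the two requested VTs must together cover the width of the input.

#include "llvm/CodeGen/SelectionDAG.h"
#include <utility>
using namespace llvm;

static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &DL,
                                            SDValue V) {
  // Lo and Hi are produced with EXTRACT_ELEMENT on the i64 value.
  return DAG.SplitScalar(V, DL, MVT::i32, MVT::i32);
}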
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
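Self-contained usage sketch (the parseMemSize helper and its keyword table are made up for illustration): StringSwitch chains Case calls and falls back to Default.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static unsigned parseMemSize(llvm::StringRef S) {
  return llvm::StringSwitch<unsigned>(S)
      .Case("byte", 1)
      .Case("dword", 4)
      .Case("qword", 8)
      .Default(0); // unknown keyword
}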
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:130
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid character code or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
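A runnable sketch of alignDown, including the optional Skew parameter (the values are chosen only for illustration):

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  // Largest multiple of 8 that is <= 37.
  assert(llvm::alignDown(37u, 8u) == 32u);
  // Largest value <= 37 that is congruent to 5 modulo 8.
  assert(llvm::alignDown(37u, 8u, 5u) == 37u);
  return 0;
}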
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, or -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
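For example (a trivial standalone check, not from this file):

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::divideCeil(10u, 4u) == 3u); // ceil(10 / 4)
  assert(llvm::divideCeil(8u, 4u) == 2u);  // exact division is unchanged
  return 0;
}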
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
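Illustrative standalone check (the bit counts are arbitrary):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu); // low 4 bits set
  assert(llvm::maskTrailingOnes<uint64_t>(0) == 0u);   // no bits set
  return 0;
}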
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
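A minimal sketch of building EVTs programmatically (the context and the chosen widths are arbitrary):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT I16 = llvm::EVT::getIntegerVT(Ctx, 16);
  llvm::EVT V4I16 = llvm::EVT::getVectorVT(Ctx, I16, 4);
  assert(V4I16.isVector() && V4I16.getVectorNumElements() == 4);
  assert(V4I16.getScalarType() == I16);
  assert(V4I16.getSizeInBits().getFixedValue() == 64);
  return 0;
}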
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
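A small standalone sketch of the KnownBits interface (the bit patterns are arbitrary examples):

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  llvm::KnownBits K(8);          // 8-bit value, nothing known yet
  K.Zero = llvm::APInt(8, 0xF0); // bits 7..4 known to be zero
  K.One = llvm::APInt(8, 0x01);  // bit 0 known to be one
  assert(K.countMinLeadingZeros() == 4);

  llvm::KnownBits Wide = K.zext(16); // zero extension: new high bits known zero
  assert(Wide.countMinLeadingZeros() == 12);

  // Conservative known bits for an addition of two such values.
  llvm::KnownBits Sum = llvm::KnownBits::add(K, K);
  (void)Sum;
  return 0;
}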
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs