1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
62 static cl::opt<bool> UseDivergentRegisterIndexing(
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
69 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
70}
71
74 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
75}
76
77static unsigned findFirstFreeSGPR(CCState &CCInfo) {
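  // Walk SGPR0..SGPR(N-1) in order and hand back the first register the
  // calling-convention state has not allocated yet; e.g. if SGPR0-SGPR3 are
  // already taken by earlier arguments, this returns AMDGPU::SGPR4.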
78 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
79 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
80 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
81 return AMDGPU::SGPR0 + Reg;
82 }
83 }
84 llvm_unreachable("Cannot allocate sgpr");
85}
86
87 SITargetLowering::SITargetLowering(const TargetMachine &TM,
88 const GCNSubtarget &STI)
89 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
90 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
91 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
92
93 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
94 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
95
96 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
100
101 addRegisterClass(MVT::f64, V64RegClass);
102 addRegisterClass(MVT::v2f32, V64RegClass);
103 addRegisterClass(MVT::Untyped, V64RegClass);
104
105 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
106 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
107
108 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
110
111 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
112 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
113
114 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
115 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
116
117 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
121 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
122
123 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
124 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
125
126 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
130 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
131
132 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
133 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
134
135 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
136 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
137
138 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
139 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
140
141 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
142 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
143
144 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
148 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
149
150 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
151 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
152
153 if (Subtarget->has16BitInsts()) {
154 if (Subtarget->useRealTrue16Insts()) {
155 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
156 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
157 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
158 } else {
159 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
160 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
161 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
162 }
163
164 // Unless there are also VOP3P operations, no operations are really legal.
165 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
170 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
171 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
173 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
174 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
176 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
177 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
178 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
179 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
180 }
181
182 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
183 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
184
185 computeRegisterProperties(Subtarget->getRegisterInfo());
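  // A quick reminder of the legalization actions used throughout this
  // constructor: Legal nodes are selected as-is, Promote re-expresses the node
  // in the type registered with AddPromotedToType, Expand lets the generic
  // legalizer break the node apart, and Custom routes the node through
  // SITargetLowering::LowerOperation / ReplaceNodeResults.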
186
187 // The boolean content concept here is too inflexible. Compares only ever
188 // really produce a 1-bit result. Any copy/extend from these will turn into a
189 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
190 // it's what most targets use.
191 setBooleanContents(ZeroOrOneBooleanContent);
192 setBooleanVectorContents(ZeroOrOneBooleanContent);
193
194 // We need to custom lower vector stores from local memory
195 setOperationAction(ISD::LOAD,
196 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
197 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
198 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
199 MVT::i1, MVT::v32i32},
200 Custom);
201
202 setOperationAction(ISD::STORE,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 if (isTypeLegal(MVT::bf16)) {
210 for (unsigned Opc :
212 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
213 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
214 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
215 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
216 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
217 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
218 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
219 ISD::SETCC}) {
220 // FIXME: The promoted-to type shouldn't need to be explicit
221 setOperationAction(Opc, MVT::bf16, Promote);
222 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
223 }
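  // With the Promote action above, a bf16 operation such as FADD is performed
  // by extending the operands to f32, doing the operation in f32, and
  // rounding the result back to bf16.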
224
226
228 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
229
230 setOperationAction(ISD::FABS, MVT::bf16, Legal);
231 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
233
234 // We only need to custom lower because we can't specify an action for bf16
235 // sources.
238 }
239
240 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
241 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
242 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
243 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
244 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
245 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
246 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
247 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
248 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
249 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
250 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
251 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
252 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
253 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
254 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
255 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
256
257 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
258 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
259 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
263 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
264
265 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
266
270 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
271
272 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
273
275 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
276
278 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
279 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
280
282 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
283 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
284 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
285 Expand);
287 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
288 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
289 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
290 Expand);
291
293 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
294 MVT::v3i16, MVT::v4i16, MVT::Other},
295 Custom);
296
297 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
298 setOperationAction(ISD::BR_CC,
299 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
300
302
304
306 Expand);
307
308#if 0
310#endif
311
312 // We only support LOAD/STORE and vector manipulation ops for vectors
313 // with > 4 elements.
314 for (MVT VT :
315 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
316 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
317 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
318 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
319 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
320 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
321 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
322 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
323 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
324 switch (Op) {
325 case ISD::LOAD:
326 case ISD::STORE:
328 case ISD::BITCAST:
329 case ISD::UNDEF:
333 case ISD::IS_FPCLASS:
334 break;
339 break;
340 default:
342 break;
343 }
344 }
345 }
346
347 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
348
349 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
350 // is expanded to avoid having two separate loops in case the index is a VGPR.
351
352 // Most operations are naturally 32-bit vector operations. We only support
353 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
354 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
356 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
363
365 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
366 }
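  // These same-size vector promotions are effectively bitcasts, so e.g. a
  // BUILD_VECTOR of v2i64 is legalized and selected as a v4i32 node.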
367
368 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
370 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
377
379 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
380 }
381
382 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
384 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
391
393 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
394 }
395
396 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
398 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
405
407 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
408 }
409
410 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
412 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
419
421 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
422 }
423
425 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
426 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
427 Custom);
428
429 if (Subtarget->hasPkMovB32()) {
430 // TODO: 16-bit element vectors should be legal with even aligned elements.
431 // TODO: Can be legal with wider source types than the result with
432 // subregister extracts.
433 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
434 }
435
437 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
438 // instead lower to cndmask in SITargetLowering::LowerSELECT().
440 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
441 // alignbit.
442 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
443
444 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
445 Custom);
446
447 // Avoid stack access for these.
448 // TODO: Generalize to more vector types.
450 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
451 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
452 Custom);
453
454 // Deal with vec3 vector operations when widened to vec4.
456 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
457
458 // Deal with vec5/6/7 vector operations when widened to vec8.
460 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Custom);
465
466 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
467 // and output demarshalling
468 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
469
470 // We can't return success/failure, only the old value;
471 // let LLVM add the comparison.
472 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
473 Expand);
474
475 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
476
477 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
478
479 // FIXME: This should be narrowed to i32, but that only happens if i64 is
480 // illegal.
481 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
482 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
483
484 // This is s_memtime on SI and s_memrealtime on VI.
485 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
486
487 if (Subtarget->hasSMemRealTime() ||
488 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
489 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
490 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
491
492 if (Subtarget->has16BitInsts()) {
493 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
494 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
495 } else {
496 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
497 }
498
499 if (Subtarget->hasMadMacF32Insts())
501
502 if (!Subtarget->hasBFI())
503 // fcopysign can be done in a single instruction with BFI.
504 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
505
506 if (!Subtarget->hasBCNT(32))
508
509 if (!Subtarget->hasBCNT(64))
511
512 if (Subtarget->hasFFBH())
514
515 if (Subtarget->hasFFBL())
517
518 // We only really have 32-bit BFE instructions (and 16-bit on VI).
519 //
520 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
521 // effort to match them now. We want this to be false for i64 cases when the
522 // extraction isn't restricted to the upper or lower half. Ideally we would
523 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
524 // span the midpoint are probably relatively rare, so don't worry about them
525 // for now.
526 if (Subtarget->hasBFE())
528
529 // Clamp modifier on add/sub
530 if (Subtarget->hasIntClamp())
532
533 if (Subtarget->hasAddNoCarry())
534 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
535 Legal);
536
538 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
539 {MVT::f32, MVT::f64}, Custom);
540
541 // These are really only legal for ieee_mode functions. We should be avoiding
542 // them for functions that don't have ieee_mode enabled, so just say they are
543 // legal.
544 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
545 {MVT::f32, MVT::f64}, Legal);
546
547 if (Subtarget->haveRoundOpsF64())
548 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
549 Legal);
550 else
551 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
552 MVT::f64, Custom);
553
554 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
555 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
556 Legal);
557 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
558
559 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
561
562 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
564
565 // Custom lower these because we can't specify a rule based on an illegal
566 // source bf16.
567 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
568 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
569
570 if (Subtarget->has16BitInsts()) {
573 MVT::i16, Legal);
574
575 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
576
578 MVT::i16, Expand);
579
583 ISD::CTPOP},
584 MVT::i16, Promote);
585
586 setOperationAction(ISD::LOAD, MVT::i16, Custom);
587
588 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
589
590 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
591 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
592 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
593 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
594
598
600
601 // F16 - Constant Actions.
604
605 // F16 - Load/Store Actions.
606 setOperationAction(ISD::LOAD, MVT::f16, Promote);
607 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
608 setOperationAction(ISD::STORE, MVT::f16, Promote);
609 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
610
611 // BF16 - Load/Store Actions.
612 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
613 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
614 setOperationAction(ISD::STORE, MVT::bf16, Promote);
615 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
616
617 // F16 - VOP1 Actions.
619 ISD::FSIN, ISD::FROUND},
620 MVT::f16, Custom);
621
622 // BF16 - VOP1 Actions.
623 if (Subtarget->hasBF16TransInsts())
624 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
625
628
629 // F16 - VOP2 Actions.
630 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
631 Expand);
632 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
633 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
635
636 // F16 - VOP3 Actions.
638 if (STI.hasMadF16())
640
641 for (MVT VT :
642 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
643 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
644 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
645 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
646 switch (Op) {
647 case ISD::LOAD:
648 case ISD::STORE:
650 case ISD::BITCAST:
651 case ISD::UNDEF:
656 case ISD::IS_FPCLASS:
657 break;
661 break;
662 default:
664 break;
665 }
666 }
667 }
668
669 // v_perm_b32 can handle either of these.
670 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
672
673 // XXX - Do these do anything? Vector constants turn into build_vector.
674 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
675
676 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
677 Legal);
678
679 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
680 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
681 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
682 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
683
684 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
685 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
686 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
687 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
688
689 setOperationAction(ISD::AND, MVT::v2i16, Promote);
690 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
691 setOperationAction(ISD::OR, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
693 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
694 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
695
696 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
697 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
698 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
699 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
700 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
701 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
702
703 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
704 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
705 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
706 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
707 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
708 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
709
710 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
712 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
714 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
716
717 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
720 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721
722 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
723 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
724 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
725 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
726 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
727 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
728
729 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
731 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
733 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
734 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
735
736 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
737 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
738 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
739 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
740 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
741 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
742
743 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
744 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
745 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
747 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
748 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
749
750 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
751 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
752 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
753 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
754 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
755 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
756
758 MVT::v2i32, Expand);
759 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
760
762 MVT::v4i32, Expand);
763
765 MVT::v8i32, Expand);
766
767 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
768 Subtarget->hasVOP3PInsts() ? Legal : Custom);
769
770 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
771 // This isn't really legal, but this avoids the legalizer unrolling it (and
772 // allows matching fneg (fabs x) patterns)
773 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
774
775 // Can do this in one BFI plus a constant materialize.
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
780 Custom);
781
783 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
784 MVT::f16, Custom);
785 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
786
787 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
788 ISD::FMAXIMUMNUM},
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
790 Custom);
791
792 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
794 Expand);
795
796 for (MVT Vec16 :
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
801 Vec16, Custom);
803 }
804 }
805
806 if (Subtarget->hasVOP3PInsts()) {
810 MVT::v2i16, Legal);
811
812 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
813 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
814 MVT::v2f16, Legal);
815
817 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
818
820 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
821 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
822 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
823 Custom);
824
825 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
826 // Split vector operations.
831 VT, Custom);
832
833 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
834 // Split vector operations.
836 VT, Custom);
837
839 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
840 {MVT::v2f16, MVT::v4f16}, Custom);
841
842 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
843 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
844 Custom);
845
846 if (Subtarget->hasBF16PackedInsts()) {
847 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
848 // Split vector operations.
850 VT, Custom);
851 }
852
853 if (Subtarget->hasPackedFP32Ops()) {
855 MVT::v2f32, Legal);
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
858 Custom);
859 }
860 }
861
862 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
863
864 if (Subtarget->has16BitInsts()) {
866 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
868 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
869 } else {
870 // Legalization hack.
871 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
872
873 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
874 }
875
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
881 Custom);
882
884
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
889
890 if (Subtarget->hasMad64_32())
892
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
894 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
895
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
899 } else {
900 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
901 if (Subtarget->hasMinimum3Maximum3F32())
902 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
903
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
905 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
906
907 // If only the vector form is available, we need to widen to a vector.
908 if (!Subtarget->hasMinimum3Maximum3F16())
909 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
910 }
911 }
912
913 if (Subtarget->hasVOP3PInsts()) {
914 // We want to break these into v2f16 pieces, not scalarize.
915 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
917 Custom);
918 }
919
920 if (Subtarget->hasIntMinMax64())
922 Legal);
923
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
927 MVT::i8},
928 Custom);
929
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
935 Custom);
936
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
942 Custom);
943
944 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
946 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
947 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
948 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
949
950 // TODO: Could move this to custom lowering, could benefit from combines on
951 // extract of relevant bits.
952 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
953
955
956 if (Subtarget->hasBF16ConversionInsts()) {
957 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
959 }
960
961 if (Subtarget->hasBF16PackedInsts()) {
963 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
964 MVT::v2bf16, Legal);
965 }
966
967 if (Subtarget->hasBF16TransInsts()) {
968 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
969 }
970
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
974 Custom);
975 }
976
978 ISD::PTRADD,
980 ISD::SUB,
982 ISD::MUL,
983 ISD::FADD,
984 ISD::FSUB,
985 ISD::FDIV,
986 ISD::FMUL,
987 ISD::FMINNUM,
988 ISD::FMAXNUM,
989 ISD::FMINNUM_IEEE,
990 ISD::FMAXNUM_IEEE,
991 ISD::FMINIMUM,
992 ISD::FMAXIMUM,
993 ISD::FMINIMUMNUM,
994 ISD::FMAXIMUMNUM,
995 ISD::FMA,
996 ISD::SMIN,
997 ISD::SMAX,
998 ISD::UMIN,
999 ISD::UMAX,
1000 ISD::SETCC,
1002 ISD::SMIN,
1003 ISD::SMAX,
1004 ISD::UMIN,
1005 ISD::UMAX,
1006 ISD::AND,
1007 ISD::OR,
1008 ISD::XOR,
1009 ISD::SHL,
1010 ISD::SRL,
1011 ISD::SRA,
1012 ISD::FSHR,
1022
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1025
1026 // All memory operations. Some folding on the pointer operand is done to help
1027 // match the constant offsets in the addressing modes.
1028 setTargetDAGCombine({ISD::LOAD,
1029 ISD::STORE,
1030 ISD::ATOMIC_LOAD,
1031 ISD::ATOMIC_STORE,
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1034 ISD::ATOMIC_SWAP,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1052
1053 // FIXME: In other contexts we pretend this is a per-function property.
1055
1057}
1058
1059const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1060
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1063 return RCRegs;
1064}
1065
1066//===----------------------------------------------------------------------===//
1067// TargetLowering queries
1068//===----------------------------------------------------------------------===//
1069
1070// v_mad_mix* support a conversion from f16 to f32.
1071//
1072// There is only one special case where this is OK to use when denormals
1073// are enabled, and we don't currently handle it.
1074bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1075 EVT DestVT, EVT SrcVT) const {
1076 return DestVT.getScalarType() == MVT::f32 &&
1077 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1079 SrcVT.getScalarType() == MVT::f16) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1081 SrcVT.getScalarType() == MVT::bf16)) &&
1082 // TODO: This probably only requires no input flushing?
1084}
1085
1087 LLT DestTy, LLT SrcTy) const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1090 DestTy.getScalarSizeInBits() == 32 &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1092 // TODO: This probably only requires no input flushing?
1093 denormalModeIsFlushAllF32(*MI.getMF());
1094}
1095
1097 // SI has some legal vector types, but no legal vector operations. Say no
1098 // shuffles are legal in order to prefer scalarizing some vector operations.
1099 return false;
1100}
1101
1103 CallingConv::ID CC,
1104 EVT VT) const {
1106 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1107
1108 if (VT.isVector()) {
1109 EVT ScalarVT = VT.getScalarType();
1110 unsigned Size = ScalarVT.getSizeInBits();
1111 if (Size == 16) {
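      // 16-bit elements are passed packed in pairs: v2i16/v2f16 directly, and
      // bf16 pairs in a plain i32 register.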
1112 if (Subtarget->has16BitInsts()) {
1113 if (VT.isInteger())
1114 return MVT::v2i16;
1115 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1116 }
1117 return VT.isInteger() ? MVT::i32 : MVT::f32;
1118 }
1119
1120 if (Size < 16)
1121 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1122 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1123 }
1124
1125 if (VT.getSizeInBits() > 32)
1126 return MVT::i32;
1127
1128 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1129}
1130
1132 CallingConv::ID CC,
1133 EVT VT) const {
1135 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1136
1137 if (VT.isVector()) {
1138 unsigned NumElts = VT.getVectorNumElements();
1139 EVT ScalarVT = VT.getScalarType();
1140 unsigned Size = ScalarVT.getSizeInBits();
1141
1142 // FIXME: Should probably promote 8-bit vectors to i16.
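    // In the check below, two 16-bit elements share one 32-bit register, so
    // e.g. a v3f16 argument takes (3 + 1) / 2 = 2 registers.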
1143 if (Size == 16 && Subtarget->has16BitInsts())
1144 return (NumElts + 1) / 2;
1145
1146 if (Size <= 32)
1147 return NumElts;
1148
1149 if (Size > 32)
1150 return NumElts * ((Size + 31) / 32);
1151 } else if (VT.getSizeInBits() > 32)
1152 return (VT.getSizeInBits() + 31) / 32;
1153
1154 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155}
1156
1158 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1159 unsigned &NumIntermediates, MVT &RegisterVT) const {
1160 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1161 unsigned NumElts = VT.getVectorNumElements();
1162 EVT ScalarVT = VT.getScalarType();
1163 unsigned Size = ScalarVT.getSizeInBits();
1164 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1165 // support, but unless we can properly handle 3-vectors, it will still be
1166 // inconsistent.
1167 if (Size == 16 && Subtarget->has16BitInsts()) {
1168 if (ScalarVT == MVT::bf16) {
1169 RegisterVT = MVT::i32;
1170 IntermediateVT = MVT::v2bf16;
1171 } else {
1172 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1173 IntermediateVT = RegisterVT;
1174 }
1175 NumIntermediates = (NumElts + 1) / 2;
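      // e.g. a <4 x bfloat> argument breaks down into 2 v2bf16 intermediates,
      // each carried in an i32 register.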
1176 return NumIntermediates;
1177 }
1178
1179 if (Size == 32) {
1180 RegisterVT = ScalarVT.getSimpleVT();
1181 IntermediateVT = RegisterVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size < 16 && Subtarget->has16BitInsts()) {
1187 // FIXME: Should probably form v2i16 pieces
1188 RegisterVT = MVT::i16;
1189 IntermediateVT = ScalarVT;
1190 NumIntermediates = NumElts;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size != 16 && Size <= 32) {
1195 RegisterVT = MVT::i32;
1196 IntermediateVT = ScalarVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1199 }
1200
1201 if (Size > 32) {
1202 RegisterVT = MVT::i32;
1203 IntermediateVT = RegisterVT;
1204 NumIntermediates = NumElts * ((Size + 31) / 32);
1205 return NumIntermediates;
1206 }
1207 }
1208
1210 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1211}
1212
1214 const DataLayout &DL, Type *Ty,
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1217
1218 LLVMContext &Ctx = Ty->getContext();
1219 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1221 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1222 NumElts);
1223 }
1224
1225 return TLI.getValueType(DL, Ty);
1226}
1227
1228// Peek through TFE struct returns to only use the data size.
1230 const DataLayout &DL, Type *Ty,
1231 unsigned MaxNumLanes) {
1232 auto *ST = dyn_cast<StructType>(Ty);
1233 if (!ST)
1234 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1235
1236 // TFE intrinsics return an aggregate type.
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1239 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1240}
1241
1242/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1243/// in-memory representation. This return value is a custom type because there
1244/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1245/// could cause issues during codegen, these address space 7 pointers will be
1246/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1247/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1248/// for cost modeling, to work. (This also sets us up decently for doing the
1249/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1251 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1252 return MVT::amdgpuBufferFatPointer;
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1257}
1258/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1259/// v8i32 when padding is added.
1260/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1261/// also v8i32 with padding.
1263 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1264 DL.getPointerSizeInBits(AS) == 160) ||
1266 DL.getPointerSizeInBits(AS) == 192))
1267 return MVT::v8i32;
1269}
1270
1271static unsigned getIntrMemWidth(unsigned IntrID) {
1272 switch (IntrID) {
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1276 return 8;
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1282 return 32;
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1288 return 64;
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1294 return 128;
1295 default:
1296 llvm_unreachable("Unknown width");
1297 }
1298}
1299
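// Translate the ordering operand (a C ABI atomic-ordering integer) and the
// syncscope metadata string carried by the cooperative atomic intrinsics into
// the corresponding MachineMemOperand info: ordering, flags, and SyncScope ID.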
1300static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1302 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1303 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1304 switch (AtomicOrderingCABI(Ord)) {
1307 break;
1310 break;
1313 break;
1314 default:
1316 break;
1317 }
1318
1319 Info.flags =
1321 Info.flags |= MOCooperative;
1322
1323 MDNode *ScopeMD = cast<MDNode>(
1324 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1325 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1326 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1327}
1328
1330 const CallInst &CI,
1331 MachineFunction &MF,
1332 unsigned IntrID) const {
1333 Info.flags = MachineMemOperand::MONone;
1334 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1335 Info.flags |= MachineMemOperand::MOInvariant;
1336 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1338 Info.flags |= getTargetMMOFlags(CI);
1339
1340 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1342 AttributeSet Attr =
1344 MemoryEffects ME = Attr.getMemoryEffects();
1345 if (ME.doesNotAccessMemory())
1346 return false;
1347
1348 // TODO: Should images get their own address space?
1349 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1350
1351 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1352 if (RsrcIntr->IsImage) {
1353 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1355 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1356 Info.align.reset();
1357 }
1358
1359 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1360 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1361 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1362 // We conservatively set the memory operand of a buffer intrinsic to the
1363 // base resource pointer, so that we can access alias information about
1364 // those pointers. Cases like "this points at the same value
1365 // but with a different offset" are handled in
1366 // areMemAccessesTriviallyDisjoint.
1367 Info.ptrVal = RsrcArg;
1368 }
1369
1370 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1371 if (!IsSPrefetch) {
1372 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1373 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1374 Info.flags |= MachineMemOperand::MOVolatile;
1375 }
1376
1378 if (ME.onlyReadsMemory()) {
1379 if (RsrcIntr->IsImage) {
1380 unsigned MaxNumLanes = 4;
1381
1382 if (!BaseOpcode->Gather4) {
1383 // If this isn't a gather, we may have excess loaded elements in the
1384 // IR type. Check the dmask for the real number of elements loaded.
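        // For example, a dmask of 0b0011 loads only two components, so a
        // call declared to return <4 x float> gets a memory VT of v2f32.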
1385 unsigned DMask =
1386 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1387 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1388 }
1389
1390 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1391 CI.getType(), MaxNumLanes);
1392 } else {
1393 Info.memVT =
1395 std::numeric_limits<unsigned>::max());
1396 }
1397
1398 // FIXME: What does alignment mean for an image?
1399 Info.opc = ISD::INTRINSIC_W_CHAIN;
1400 Info.flags |= MachineMemOperand::MOLoad;
1401 } else if (ME.onlyWritesMemory()) {
1402 Info.opc = ISD::INTRINSIC_VOID;
1403
1404 Type *DataTy = CI.getArgOperand(0)->getType();
1405 if (RsrcIntr->IsImage) {
1406 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1407 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1408 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1409 DMaskLanes);
1410 } else
1411 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1412
1413 Info.flags |= MachineMemOperand::MOStore;
1414 } else {
1415 // Atomic, NoReturn Sampler or prefetch
1416 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1418 Info.flags |=
1420
1421 if (!IsSPrefetch)
1422 Info.flags |= MachineMemOperand::MOStore;
1423
1424 switch (IntrID) {
1425 default:
1426 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1427 // Fake memory access type for no return sampler intrinsics
1428 Info.memVT = MVT::i32;
1429 } else {
1430 // XXX - Should this be volatile without known ordering?
1431 Info.flags |= MachineMemOperand::MOVolatile;
1432 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1433 }
1434 break;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1439 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1440 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1441 Info.ptrVal = CI.getArgOperand(1);
1442 return true;
1443 }
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1448 Info.memVT =
1450 std::numeric_limits<unsigned>::max());
1451 Info.flags &= ~MachineMemOperand::MOStore;
1452 return true;
1453 }
1454 }
1455 }
1456 return true;
1457 }
1458
1459 switch (IntrID) {
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1462 Info.opc = ISD::INTRINSIC_W_CHAIN;
1463 Info.memVT = MVT::getVT(CI.getType());
1464 Info.ptrVal = CI.getOperand(0);
1465 Info.align.reset();
1467
1468 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1469 if (!Vol->isZero())
1470 Info.flags |= MachineMemOperand::MOVolatile;
1471
1472 return true;
1473 }
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1476 Info.opc = ISD::INTRINSIC_W_CHAIN;
1477 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1478 Info.ptrVal = nullptr;
1479 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1481 return true;
1482 }
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1485 Info.opc = ISD::INTRINSIC_W_CHAIN;
1486 Info.memVT = MVT::getVT(CI.getType());
1487 Info.ptrVal = CI.getOperand(0);
1488 Info.align.reset();
1490
1491 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1492 if (!Vol->isZero())
1493 Info.flags |= MachineMemOperand::MOVolatile;
1494
1495 return true;
1496 }
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1502 Info.memVT = MVT::getVT(CI.getType());
1503 Info.ptrVal = CI.getOperand(0);
1504 Info.memVT = MVT::i64;
1505 Info.size = 8;
1506 Info.align.reset();
1508 return true;
1509 }
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1511 Info.opc = ISD::INTRINSIC_W_CHAIN;
1512 Info.memVT = MVT::getVT(CI.getType());
1513 Info.ptrVal = CI.getOperand(0);
1514 Info.align.reset();
1517 return true;
1518 }
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1522 Info.opc = ISD::INTRINSIC_W_CHAIN;
1523 Info.memVT =
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1525 ? CI.getType()
1527 ->getElementType(0)); // XXX: what is correct VT?
1528
1529 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1530 Info.align.reset();
1531 Info.flags |=
1533 return true;
1534 }
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1541 Info.opc = ISD::INTRINSIC_W_CHAIN;
1542 Info.memVT = MVT::getVT(CI.getType());
1543 Info.ptrVal = CI.getOperand(0);
1544 Info.align.reset();
1548 return true;
1549 }
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1571 Info.opc = ISD::INTRINSIC_W_CHAIN;
1572 Info.memVT = MVT::getVT(CI.getType());
1573 Info.ptrVal = CI.getOperand(0);
1574 Info.align.reset();
1575 Info.flags |= MachineMemOperand::MOLoad;
1576 return true;
1577 }
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1581 Info.opc = ISD::INTRINSIC_W_CHAIN;
1582 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1583 Info.ptrVal = CI.getOperand(0);
1584 Info.align.reset();
1585 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1586 return true;
1587 }
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1591 Info.opc = ISD::INTRINSIC_VOID;
1592 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1593 Info.ptrVal = CI.getArgOperand(0);
1594 Info.align.reset();
1595 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1596 return true;
1597 }
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1604 Info.opc = ISD::INTRINSIC_VOID;
1605
1606 const GCNTargetMachine &TM =
1607 static_cast<const GCNTargetMachine &>(getTargetMachine());
1608
1610 Info.ptrVal = MFI->getGWSPSV(TM);
1611
1612 // This is an abstract access, but we need to specify a type and size.
1613 Info.memVT = MVT::i32;
1614 Info.size = 4;
1615 Info.align = Align(4);
1616
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1618 Info.flags |= MachineMemOperand::MOLoad;
1619 else
1620 Info.flags |= MachineMemOperand::MOStore;
1621 return true;
1622 }
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1631 Info.opc = ISD::INTRINSIC_VOID;
1632 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1633 Info.ptrVal = CI.getArgOperand(1);
1635 return true;
1636 }
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1643 Info.ptrVal = CI.getArgOperand(0);
1645 return true;
1646 }
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1649 Info.opc = ISD::INTRINSIC_VOID;
1650 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1651 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1652 Info.ptrVal = CI.getArgOperand(1);
1654 return true;
1655 }
1656 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1657 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1660 Info.opc = ISD::INTRINSIC_W_CHAIN;
1661
1662 const GCNTargetMachine &TM =
1663 static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
1666 Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668 // This is an abstract access, but we need to specify a type and size.
1669 Info.memVT = MVT::i32;
1670 Info.size = 4;
1671 Info.align = Align(4);
1672
1674 return true;
1675 }
1676 case Intrinsic::amdgcn_s_prefetch_data:
1677 case Intrinsic::amdgcn_flat_prefetch:
1678 case Intrinsic::amdgcn_global_prefetch: {
1679 Info.opc = ISD::INTRINSIC_VOID;
1680 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1681 Info.ptrVal = CI.getArgOperand(0);
1682 Info.flags |= MachineMemOperand::MOLoad;
1683 return true;
1684 }
1685 default:
1686 return false;
1687 }
1688}
1689
1691 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1693 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1694 // The DAG's ValueType loses the addrspaces.
1695 // Add them as 2 extra Constant operands "from" and "to".
1696 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1697 unsigned DstAS = I.getType()->getPointerAddressSpace();
1698 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1699 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1700 break;
1701 }
1702 default:
1703 break;
1704 }
1705}
1706
1709 Type *&AccessTy) const {
1710 Value *Ptr = nullptr;
1711 switch (II->getIntrinsicID()) {
1712 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1713 case Intrinsic::amdgcn_cluster_load_b128:
1714 case Intrinsic::amdgcn_cluster_load_b64:
1715 case Intrinsic::amdgcn_cluster_load_b32:
1716 case Intrinsic::amdgcn_ds_append:
1717 case Intrinsic::amdgcn_ds_consume:
1718 case Intrinsic::amdgcn_ds_load_tr8_b64:
1719 case Intrinsic::amdgcn_ds_load_tr16_b128:
1720 case Intrinsic::amdgcn_ds_load_tr4_b64:
1721 case Intrinsic::amdgcn_ds_load_tr6_b96:
1722 case Intrinsic::amdgcn_ds_read_tr4_b64:
1723 case Intrinsic::amdgcn_ds_read_tr6_b96:
1724 case Intrinsic::amdgcn_ds_read_tr8_b64:
1725 case Intrinsic::amdgcn_ds_read_tr16_b64:
1726 case Intrinsic::amdgcn_ds_ordered_add:
1727 case Intrinsic::amdgcn_ds_ordered_swap:
1728 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1729 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1730 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1732 case Intrinsic::amdgcn_flat_load_monitor_b128:
1733 case Intrinsic::amdgcn_flat_load_monitor_b32:
1734 case Intrinsic::amdgcn_flat_load_monitor_b64:
1735 case Intrinsic::amdgcn_global_atomic_csub:
1736 case Intrinsic::amdgcn_global_atomic_fmax_num:
1737 case Intrinsic::amdgcn_global_atomic_fmin_num:
1738 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1739 case Intrinsic::amdgcn_global_load_monitor_b128:
1740 case Intrinsic::amdgcn_global_load_monitor_b32:
1741 case Intrinsic::amdgcn_global_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_load_tr_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b128:
1744 case Intrinsic::amdgcn_global_load_tr4_b64:
1745 case Intrinsic::amdgcn_global_load_tr6_b96:
1746 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1750 Ptr = II->getArgOperand(0);
1751 break;
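  // These load-to-LDS / async copy intrinsics take the pointer of interest as
  // their second operand instead.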
1752 case Intrinsic::amdgcn_load_to_lds:
1753 case Intrinsic::amdgcn_global_load_lds:
1754 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1758 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1762 Ptr = II->getArgOperand(1);
1763 break;
1764 default:
1765 return false;
1766 }
1767 AccessTy = II->getType();
1768 Ops.push_back(Ptr);
1769 return true;
1770}
1771
1773 unsigned AddrSpace) const {
1774 if (!Subtarget->hasFlatInstOffsets()) {
1775 // Flat instructions do not have offsets, and only have the register
1776 // address.
1777 return AM.BaseOffs == 0 && AM.Scale == 0;
1778 }
1779
1780 decltype(SIInstrFlags::FLAT) FlatVariant =
1784
1785 return AM.Scale == 0 &&
1786 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1787 AM.BaseOffs, AddrSpace, FlatVariant));
1788}
1789
1791 if (Subtarget->hasFlatGlobalInsts())
1793
1794 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
 1795    // Assume that we will use FLAT for all global memory accesses
 1796    // on VI.
1797 // FIXME: This assumption is currently wrong. On VI we still use
1798 // MUBUF instructions for the r + i addressing mode. As currently
 1799    // implemented, the MUBUF instructions only work on buffers < 4GB.
1800 // It may be possible to support > 4GB buffers with MUBUF instructions,
1801 // by setting the stride value in the resource descriptor which would
1802 // increase the size limit to (stride * 4GB). However, this is risky,
1803 // because it has never been validated.
1805 }
1806
1807 return isLegalMUBUFAddressingMode(AM);
1808}
1809
1810bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1811 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1812 // additionally can do r + r + i with addr64. 32-bit has more addressing
1813 // mode options. Depending on the resource constant, it can also do
1814 // (i64 r0) + (i32 r1) * (i14 i).
1815 //
1816 // Private arrays end up using a scratch buffer most of the time, so also
1817 // assume those use MUBUF instructions. Scratch loads / stores are currently
 1818  // implemented as mubuf instructions with the offen bit set, so they are
 1819  // slightly different from the normal addr64 form.
1820 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1821 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1822 return false;
1823
1824 // FIXME: Since we can split immediate into soffset and immediate offset,
1825 // would it make sense to allow any immediate?
1826
1827 switch (AM.Scale) {
1828 case 0: // r + i or just i, depending on HasBaseReg.
1829 return true;
1830 case 1:
1831 return true; // We have r + r or r + i.
1832 case 2:
1833 if (AM.HasBaseReg) {
1834 // Reject 2 * r + r.
1835 return false;
1836 }
1837
 1838    // Allow 2 * r as r + r,
 1839    // or 2 * r + i as r + r + i.
1840 return true;
1841 default: // Don't allow n * r
1842 return false;
1843 }
1844}
1845
1847 const AddrMode &AM, Type *Ty,
1848 unsigned AS,
1849 Instruction *I) const {
1850 // No global is ever allowed as a base.
1851 if (AM.BaseGV)
1852 return false;
1853
1854 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1855 return isLegalGlobalAddressingMode(AM);
1856
1857 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1861 // If the offset isn't a multiple of 4, it probably isn't going to be
1862 // correctly aligned.
1863 // FIXME: Can we get the real alignment here?
1864 if (AM.BaseOffs % 4 != 0)
1865 return isLegalMUBUFAddressingMode(AM);
1866
1867 if (!Subtarget->hasScalarSubwordLoads()) {
1868 // There are no SMRD extloads, so if we have to do a small type access we
1869 // will use a MUBUF load.
1870 // FIXME?: We also need to do this if unaligned, but we don't know the
1871 // alignment here.
1872 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1873 return isLegalGlobalAddressingMode(AM);
1874 }
1875
1876 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1877 // SMRD instructions have an 8-bit, dword offset on SI.
1878 if (!isUInt<8>(AM.BaseOffs / 4))
1879 return false;
1880 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1881 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1882 // in 8-bits, it can use a smaller encoding.
1883 if (!isUInt<32>(AM.BaseOffs / 4))
1884 return false;
1885 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1886 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1887 if (!isUInt<20>(AM.BaseOffs))
1888 return false;
1889 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1890 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1891 // for S_BUFFER_* instructions).
1892 if (!isInt<21>(AM.BaseOffs))
1893 return false;
1894 } else {
1895 // On GFX12, all offsets are signed 24-bit in bytes.
1896 if (!isInt<24>(AM.BaseOffs))
1897 return false;
1898 }
1899
1900 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1902 AM.BaseOffs < 0) {
1903 // Scalar (non-buffer) loads can only use a negative offset if
1904 // soffset+offset is non-negative. Since the compiler can only prove that
1905 // in a few special cases, it is safer to claim that negative offsets are
1906 // not supported.
1907 return false;
1908 }
1909
1910 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1911 return true;
1912
1913 if (AM.Scale == 1 && AM.HasBaseReg)
1914 return true;
1915
1916 return false;
1917 }
1918
1919 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1920 return Subtarget->enableFlatScratch()
1922 : isLegalMUBUFAddressingMode(AM);
1923
1924 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1925 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1926 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1927 // field.
1928 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1929 // an 8-bit dword offset but we don't know the alignment here.
1930 if (!isUInt<16>(AM.BaseOffs))
1931 return false;
1932
1933 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1934 return true;
1935
1936 if (AM.Scale == 1 && AM.HasBaseReg)
1937 return true;
1938
1939 return false;
1940 }
1941
1943 // For an unknown address space, this usually means that this is for some
1944 // reason being used for pure arithmetic, and not based on some addressing
1945 // computation. We don't have instructions that compute pointers with any
1946 // addressing modes, so treat them as having no offset like flat
1947 // instructions.
1949 }
1950
1951 // Assume a user alias of global for unknown address spaces.
1952 return isLegalGlobalAddressingMode(AM);
1953}
1954
1956 const MachineFunction &MF) const {
1958 return (MemVT.getSizeInBits() <= 4 * 32);
1959 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
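    // Private (scratch) accesses are limited by the subtarget's maximum
    // private element size.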
1960 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1961 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1962 }
1964 return (MemVT.getSizeInBits() <= 2 * 32);
1965 return true;
1966}
1967
1969 unsigned Size, unsigned AddrSpace, Align Alignment,
1970 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1971 if (IsFast)
1972 *IsFast = 0;
1973
1974 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1975 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1976 // Check if alignment requirements for ds_read/write instructions are
1977 // disabled.
1978 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1979 return false;
1980
1981 Align RequiredAlignment(
1982 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1983 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1984 Alignment < RequiredAlignment)
1985 return false;
1986
 1987    // Either the alignment requirements are "enabled", or there is an
 1988    // unaligned LDS access related hardware bug even though the alignment
 1989    // requirements are "disabled". In either case, we need to check for proper
 1990    // alignment requirements.
1991 //
1992 switch (Size) {
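    // 64-, 96- and 128-bit LDS accesses get dedicated handling below; any
    // other size wider than 32 bits is rejected.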
1993 case 64:
1994 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1995 // address is negative, then the instruction is incorrectly treated as
1996 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1997 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1998 // load later in the SILoadStoreOptimizer.
1999 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2000 return false;
2001
 2002      // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
 2003      // can do a 4-byte aligned, 8-byte access in a single operation using
2004 // ds_read2/write2_b32 with adjacent offsets.
2005 RequiredAlignment = Align(4);
2006
2007 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2008 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2009 // ds_write2_b32 depending on the alignment. In either case with either
2010 // alignment there is no faster way of doing this.
2011
 2012        // The numbers returned here and below are not additive; they form a 'speed
 2013        // rank'. They are only meant to be compared to decide whether one way
 2014        // of lowering an operation is faster than another. For that purpose a
 2015        // naturally aligned operation gets its bitsize to indicate that "it
 2016        // operates with a speed comparable to an N-bit wide load". With full
 2017        // alignment ds128 is slower than ds96, for example. If underaligned, it
 2018        // is comparable to the speed of a single dword access, which would then
 2019        // mean 32 < 128 and it is faster to issue a wide load regardless.
 2020        // 1 simply means "slow, don't do it", i.e. when comparing an aligned load
 2021        // to a wider load that will no longer be aligned, the latter is slower.
2022 if (IsFast)
2023 *IsFast = (Alignment >= RequiredAlignment) ? 64
2024 : (Alignment < Align(4)) ? 32
2025 : 1;
2026 return true;
2027 }
2028
2029 break;
2030 case 96:
2031 if (!Subtarget->hasDS96AndDS128())
2032 return false;
2033
 2034      // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
 2035      // gfx8 and older.
2036
2037 if (Subtarget->hasUnalignedDSAccessEnabled()) {
 2038        // Naturally aligned access is fastest. However, also report it as Fast
 2039        // if memory is aligned to less than a DWORD. A narrow load or store will
 2040        // be as slow as a single ds_read_b96/ds_write_b96, but there will
 2041        // be more of them, so overall we pay less of a penalty by issuing a single
 2042        // instruction.
2043
2044 // See comment on the values above.
2045 if (IsFast)
2046 *IsFast = (Alignment >= RequiredAlignment) ? 96
2047 : (Alignment < Align(4)) ? 32
2048 : 1;
2049 return true;
2050 }
2051
2052 break;
2053 case 128:
2054 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2055 return false;
2056
 2057      // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
 2058      // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
 2059      // single operation using ds_read2/write2_b64.
2060 RequiredAlignment = Align(8);
2061
2062 if (Subtarget->hasUnalignedDSAccessEnabled()) {
 2063        // Naturally aligned access is fastest. However, also report it as Fast
 2064        // if memory is aligned to less than a DWORD. A narrow load or store will
 2065        // be as slow as a single ds_read_b128/ds_write_b128, but there
 2066        // will be more of them, so overall we pay less of a penalty by issuing a
 2067        // single instruction.
2068
2069 // See comment on the values above.
2070 if (IsFast)
2071 *IsFast = (Alignment >= RequiredAlignment) ? 128
2072 : (Alignment < Align(4)) ? 32
2073 : 1;
2074 return true;
2075 }
2076
2077 break;
2078 default:
2079 if (Size > 32)
2080 return false;
2081
2082 break;
2083 }
2084
2085 // See comment on the values above.
 2086    // Note that we have a single-dword or sub-dword access here, so if it is
 2087    // underaligned it is the slowest possible access, hence the returned value is 0.
2088 if (IsFast)
2089 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2090
2091 return Alignment >= RequiredAlignment ||
2092 Subtarget->hasUnalignedDSAccessEnabled();
2093 }
2094
2095 // FIXME: We have to be conservative here and assume that flat operations
2096 // will access scratch. If we had access to the IR function, then we
2097 // could determine if any private memory was used in the function.
2098 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2099 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2100 bool AlignedBy4 = Alignment >= Align(4);
2101 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2102 if (IsFast)
2103 *IsFast = AlignedBy4 ? Size : 1;
2104 return true;
2105 }
2106
2107 if (IsFast)
2108 *IsFast = AlignedBy4;
2109
2110 return AlignedBy4;
2111 }
2112
2113 // So long as they are correct, wide global memory operations perform better
 2114  // than multiple smaller memory ops -- even when misaligned.
2115 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2116 if (IsFast)
2117 *IsFast = Size;
2118
2119 return Alignment >= Align(4) ||
2120 Subtarget->hasUnalignedBufferAccessEnabled();
2121 }
2122
2123 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2124 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2125 // out-of-bounds behavior, but in the edge case where an access starts
 2126  // out-of-bounds and then enters in-bounds, the entire access would be treated
2127 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2128 // natural alignment of buffer accesses.
2129 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2130 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2131 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2132 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2133 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2134 return false;
2135 }
2136
2137 // Smaller than dword value must be aligned.
2138 if (Size < 32)
2139 return false;
2140
2141 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2142 // byte-address are ignored, thus forcing Dword alignment.
2143 // This applies to private, global, and constant memory.
2144 if (IsFast)
2145 *IsFast = 1;
2146
2147 return Size >= 32 && Alignment >= Align(4);
2148}
2149
2151 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2152 unsigned *IsFast) const {
2154 Alignment, Flags, IsFast);
2155}
2156
2158 LLVMContext &Context, const MemOp &Op,
2159 const AttributeList &FuncAttributes) const {
2160 // FIXME: Should account for address space here.
2161
2162 // The default fallback uses the private pointer size as a guess for a type to
2163 // use. Make sure we switch these to 64-bit accesses.
2164
2165 if (Op.size() >= 16 &&
2166 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2167 return MVT::v4i32;
2168
2169 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2170 return MVT::v2i32;
2171
2172 // Use the default.
2173 return MVT::Other;
2174}
2175
2177 const MemSDNode *MemNode = cast<MemSDNode>(N);
2178 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2179}
2180
2185
2187 unsigned DestAS) const {
2188 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2189 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2190 Subtarget->hasGloballyAddressableScratch()) {
2191 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2192 return false;
2193 }
2194
2195 // Flat -> private/local is a simple truncate.
 2196    // Flat -> global is a no-op.
2197 return true;
2198 }
2199
2200 const GCNTargetMachine &TM =
2201 static_cast<const GCNTargetMachine &>(getTargetMachine());
2202 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2203}
2204
2212
2214 Type *Ty) const {
2215 // FIXME: Could be smarter if called for vector constants.
2216 return true;
2217}
2218
2220 unsigned Index) const {
2222 return false;
2223
2224 // TODO: Add more cases that are cheap.
2225 return Index == 0;
2226}
2227
2228bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
 2229  // TODO: This should be more aggressive, particularly for 16-bit element
 2230  // vectors. However, there are some mixed improvements and regressions.
2231 EVT EltTy = VT.getVectorElementType();
2232 return EltTy.getSizeInBits() % 32 == 0;
2233}
2234
2236 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2237 switch (Op) {
2238 case ISD::LOAD:
2239 case ISD::STORE:
2240 return true;
2241 default:
2242 return false;
2243 }
2244 }
2245
2246 // SimplifySetCC uses this function to determine whether or not it should
2247 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2248 if (VT == MVT::i1 && Op == ISD::SETCC)
2249 return false;
2250
2252}
2253
2254SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2255 const SDLoc &SL,
2256 SDValue Chain,
2257 uint64_t Offset) const {
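  // Form a pointer into the kernarg segment: the preloaded kernarg segment
  // base pointer plus a constant byte offset.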
2258 const DataLayout &DL = DAG.getDataLayout();
2262
2263 auto [InputPtrReg, RC, ArgTy] =
2264 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2265
2266 // We may not have the kernarg segment argument if we have no kernel
2267 // arguments.
2268 if (!InputPtrReg)
2269 return DAG.getConstant(Offset, SL, PtrVT);
2270
2272 SDValue BasePtr = DAG.getCopyFromReg(
2273 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2274
2275 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2276}
2277
2278SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2279 const SDLoc &SL) const {
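  // Implicit kernel arguments live in the kernarg segment immediately after
  // the explicit kernel arguments; compute a pointer to their start.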
2282 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2283}
2284
2285SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2286 const SDLoc &SL) const {
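  // Fold the LDS kernel ID to a constant when it is known at compile time;
  // otherwise return an empty SDValue.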
2287
2289 std::optional<uint32_t> KnownSize =
2291 if (KnownSize.has_value())
2292 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2293 return SDValue();
2294}
2295
2296SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2297 const SDLoc &SL, SDValue Val,
2298 bool Signed,
2299 const ISD::InputArg *Arg) const {
2300 // First, if it is a widened vector, narrow it.
2301 if (VT.isVector() &&
2303 EVT NarrowedVT =
2306 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2307 DAG.getConstant(0, SL, MVT::i32));
2308 }
2309
2310 // Then convert the vector elements or scalar value.
2311 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2312 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2313 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2314 }
2315
2316 if (MemVT.isFloatingPoint())
2317 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2318 else if (Signed)
2319 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2320 else
2321 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2322
2323 return Val;
2324}
2325
2326SDValue SITargetLowering::lowerKernargMemParameter(
2327 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2328 uint64_t Offset, Align Alignment, bool Signed,
2329 const ISD::InputArg *Arg) const {
2330 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2331
2332 // Try to avoid using an extload by loading earlier than the argument address,
2333 // and extracting the relevant bits. The load should hopefully be merged with
 2334  // the load for the previous argument.
2335 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2336 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2337 int64_t AlignDownOffset = alignDown(Offset, 4);
2338 int64_t OffsetDiff = Offset - AlignDownOffset;
2339
2340 EVT IntVT = MemVT.changeTypeToInteger();
2341
2342 // TODO: If we passed in the base kernel offset we could have a better
2343 // alignment than 4, but we don't really need it.
2344 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2345 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2348
2349 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2350 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2351
2352 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2353 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2354 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2355
2356 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2357 }
2358
2359 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2360 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2363
2364 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2365 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2366}
2367
2368/// Coerce an argument which was passed in a different ABI type to the original
2369/// expected value type.
2370SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2371 SDValue Val,
2372 CCValAssign &VA,
2373 const SDLoc &SL) const {
2374 EVT ValVT = VA.getValVT();
2375
2376 // If this is an 8 or 16-bit value, it is really passed promoted
2377 // to 32 bits. Insert an assert[sz]ext to capture this, then
2378 // truncate to the right size.
2379 switch (VA.getLocInfo()) {
2380 case CCValAssign::Full:
2381 return Val;
2382 case CCValAssign::BCvt:
2383 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2384 case CCValAssign::SExt:
2385 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2386 DAG.getValueType(ValVT));
2387 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2388 case CCValAssign::ZExt:
2389 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2390 DAG.getValueType(ValVT));
2391 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2392 case CCValAssign::AExt:
2393 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2394 default:
2395 llvm_unreachable("Unknown loc info!");
2396 }
2397}
2398
2399SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2400 CCValAssign &VA, const SDLoc &SL,
2401 SDValue Chain,
2402 const ISD::InputArg &Arg) const {
2403 MachineFunction &MF = DAG.getMachineFunction();
2404 MachineFrameInfo &MFI = MF.getFrameInfo();
2405
2406 if (Arg.Flags.isByVal()) {
2407 unsigned Size = Arg.Flags.getByValSize();
2408 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2409 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2410 }
2411
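  // Non-byval arguments are loaded from a fixed stack object at the assigned
  // offset, using an extending load when the location info requires it.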
2412 unsigned ArgOffset = VA.getLocMemOffset();
2413 unsigned ArgSize = VA.getValVT().getStoreSize();
2414
2415 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2416
2417 // Create load nodes to retrieve arguments from the stack.
2418 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2419
 2420  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2422 MVT MemVT = VA.getValVT();
2423
2424 switch (VA.getLocInfo()) {
2425 default:
2426 break;
2427 case CCValAssign::BCvt:
2428 MemVT = VA.getLocVT();
2429 break;
2430 case CCValAssign::SExt:
2431 ExtType = ISD::SEXTLOAD;
2432 break;
2433 case CCValAssign::ZExt:
2434 ExtType = ISD::ZEXTLOAD;
2435 break;
2436 case CCValAssign::AExt:
2437 ExtType = ISD::EXTLOAD;
2438 break;
2439 }
2440
2441 SDValue ArgValue = DAG.getExtLoad(
2442 ExtType, SL, VA.getLocVT(), Chain, FIN,
2444
2445 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2446 if (ConvertedVal == ArgValue)
2447 return ConvertedVal;
2448
2449 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2450}
2451
2452SDValue SITargetLowering::lowerWorkGroupId(
2453 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2456 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2457 if (!Subtarget->hasClusters())
2458 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2459
2460 // Clusters are supported. Return the global position in the grid. If clusters
 2461  // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2462
2463 // WorkGroupIdXYZ = ClusterId == 0 ?
2464 // ClusterIdXYZ :
2465 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2466 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2467 SDLoc SL(ClusterIdXYZ);
2468 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2469 SDValue One = DAG.getConstant(1, SL, VT);
2470 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2471 SDValue ClusterWorkGroupIdXYZ =
2472 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2473 SDValue GlobalIdXYZ =
2474 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2475 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2476
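  // When the cluster configuration cannot be determined statically, read the
  // cluster ID from the IB_STS2 hardware register and select between the two
  // forms at run time.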
2477 switch (MFI.getClusterDims().getKind()) {
2480 return GlobalIdXYZ;
2482 return ClusterIdXYZ;
2484 using namespace AMDGPU::Hwreg;
2485 SDValue ClusterIdField =
2486 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2487 SDNode *GetReg =
2488 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2489 SDValue ClusterId(GetReg, 0);
2490 SDValue Zero = DAG.getConstant(0, SL, VT);
2491 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2492 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2493 }
2494 }
2495
2496 llvm_unreachable("nothing should reach here");
2497}
2498
2499SDValue SITargetLowering::getPreloadedValue(
2500 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2502 const ArgDescriptor *Reg = nullptr;
2503 const TargetRegisterClass *RC;
2504 LLT Ty;
2505
2507 const ArgDescriptor WorkGroupIDX =
2508 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2509 // If GridZ is not programmed in an entry function then the hardware will set
2510 // it to all zeros, so there is no need to mask the GridY value in the low
2511 // order bits.
2512 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2513 AMDGPU::TTMP7,
2514 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2515 const ArgDescriptor WorkGroupIDZ =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
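  // With architected SGPRs, TTMP6 packs the cluster-relative workgroup IDs
  // and their maximum values into consecutive 4-bit fields, selected by the
  // masks below.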
2517 const ArgDescriptor ClusterWorkGroupIDX =
2518 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2519 const ArgDescriptor ClusterWorkGroupIDY =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2521 const ArgDescriptor ClusterWorkGroupIDZ =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2523 const ArgDescriptor ClusterWorkGroupMaxIDX =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2525 const ArgDescriptor ClusterWorkGroupMaxIDY =
2526 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2527 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2528 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2529 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2530 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2531
2532 auto LoadConstant = [&](unsigned N) {
2533 return DAG.getConstant(N, SDLoc(), VT);
2534 };
2535
2536 if (Subtarget->hasArchitectedSGPRs() &&
2538 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2539 bool HasFixedDims = ClusterDims.isFixedDims();
2540
2541 switch (PVID) {
2543 Reg = &WorkGroupIDX;
2544 RC = &AMDGPU::SReg_32RegClass;
2545 Ty = LLT::scalar(32);
2546 break;
2548 Reg = &WorkGroupIDY;
2549 RC = &AMDGPU::SReg_32RegClass;
2550 Ty = LLT::scalar(32);
2551 break;
2553 Reg = &WorkGroupIDZ;
2554 RC = &AMDGPU::SReg_32RegClass;
2555 Ty = LLT::scalar(32);
2556 break;
2558 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2559 return LoadConstant(0);
2560 Reg = &ClusterWorkGroupIDX;
2561 RC = &AMDGPU::SReg_32RegClass;
2562 Ty = LLT::scalar(32);
2563 break;
2565 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2566 return LoadConstant(0);
2567 Reg = &ClusterWorkGroupIDY;
2568 RC = &AMDGPU::SReg_32RegClass;
2569 Ty = LLT::scalar(32);
2570 break;
2572 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2573 return LoadConstant(0);
2574 Reg = &ClusterWorkGroupIDZ;
2575 RC = &AMDGPU::SReg_32RegClass;
2576 Ty = LLT::scalar(32);
2577 break;
2579 if (HasFixedDims)
2580 return LoadConstant(ClusterDims.getDims()[0] - 1);
2581 Reg = &ClusterWorkGroupMaxIDX;
2582 RC = &AMDGPU::SReg_32RegClass;
2583 Ty = LLT::scalar(32);
2584 break;
2586 if (HasFixedDims)
2587 return LoadConstant(ClusterDims.getDims()[1] - 1);
2588 Reg = &ClusterWorkGroupMaxIDY;
2589 RC = &AMDGPU::SReg_32RegClass;
2590 Ty = LLT::scalar(32);
2591 break;
2593 if (HasFixedDims)
2594 return LoadConstant(ClusterDims.getDims()[2] - 1);
2595 Reg = &ClusterWorkGroupMaxIDZ;
2596 RC = &AMDGPU::SReg_32RegClass;
2597 Ty = LLT::scalar(32);
2598 break;
2600 Reg = &ClusterWorkGroupMaxFlatID;
2601 RC = &AMDGPU::SReg_32RegClass;
2602 Ty = LLT::scalar(32);
2603 break;
2604 default:
2605 break;
2606 }
2607 }
2608
2609 if (!Reg)
2610 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2611 if (!Reg) {
2613 // It's possible for a kernarg intrinsic call to appear in a kernel with
 2614      // no allocated segment, in which case we do not add the user SGPR
2615 // argument, so just return null.
2616 return DAG.getConstant(0, SDLoc(), VT);
2617 }
2618
2619 // It's undefined behavior if a function marked with the amdgpu-no-*
2620 // attributes uses the corresponding intrinsic.
2621 return DAG.getPOISON(VT);
2622 }
2623
2624 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2625}
2626
2628 CallingConv::ID CallConv,
2629 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2630 FunctionType *FType,
2632 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2633 const ISD::InputArg *Arg = &Ins[I];
2634
2635 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2636 "vector type argument should have been split");
2637
2638 // First check if it's a PS input addr.
2639 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2640 PSInputNum <= 15) {
2641 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2642
2643 // Inconveniently only the first part of the split is marked as isSplit,
2644 // so skip to the end. We only want to increment PSInputNum once for the
2645 // entire split argument.
2646 if (Arg->Flags.isSplit()) {
2647 while (!Arg->Flags.isSplitEnd()) {
2648 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2649 "unexpected vector split in ps argument type");
2650 if (!SkipArg)
2651 Splits.push_back(*Arg);
2652 Arg = &Ins[++I];
2653 }
2654 }
2655
2656 if (SkipArg) {
2657 // We can safely skip PS inputs.
2658 Skipped.set(Arg->getOrigArgIndex());
2659 ++PSInputNum;
2660 continue;
2661 }
2662
2663 Info->markPSInputAllocated(PSInputNum);
2664 if (Arg->Used)
2665 Info->markPSInputEnabled(PSInputNum);
2666
2667 ++PSInputNum;
2668 }
2669
2670 Splits.push_back(*Arg);
2671 }
2672}
2673
2674// Allocate special inputs passed in VGPRs.
2676 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2677 SIMachineFunctionInfo &Info) const {
2678 const LLT S32 = LLT::scalar(32);
2680
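  // Workitem IDs are passed in up to three VGPRs. With packed TIDs, X, Y and
  // Z share VGPR0 in 10-bit fields; otherwise Y and Z are passed in VGPR1 and
  // VGPR2.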
2681 if (Info.hasWorkItemIDX()) {
2682 Register Reg = AMDGPU::VGPR0;
2683 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2684
2685 CCInfo.AllocateReg(Reg);
2686 unsigned Mask =
2687 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2688 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2689 }
2690
2691 if (Info.hasWorkItemIDY()) {
2692 assert(Info.hasWorkItemIDX());
2693 if (Subtarget->hasPackedTID()) {
2694 Info.setWorkItemIDY(
2695 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2696 } else {
2697 unsigned Reg = AMDGPU::VGPR1;
2698 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2699
2700 CCInfo.AllocateReg(Reg);
2701 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2702 }
2703 }
2704
2705 if (Info.hasWorkItemIDZ()) {
2706 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2707 if (Subtarget->hasPackedTID()) {
2708 Info.setWorkItemIDZ(
2709 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2710 } else {
2711 unsigned Reg = AMDGPU::VGPR2;
2712 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2713
2714 CCInfo.AllocateReg(Reg);
2715 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2716 }
2717 }
2718}
2719
 2720// Try to allocate a VGPR at the end of the argument list, or, if no argument
 2721// VGPRs are left, allocate a stack slot.
 2722// If \p Mask is given, it indicates the bitfield position in the register.
 2723// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2724static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2725 ArgDescriptor Arg = ArgDescriptor()) {
2726 if (Arg.isSet())
2727 return ArgDescriptor::createArg(Arg, Mask);
2728
2729 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2730 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2731 if (RegIdx == ArgVGPRs.size()) {
2732 // Spill to stack required.
2733 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2734
2735 return ArgDescriptor::createStack(Offset, Mask);
2736 }
2737
2738 unsigned Reg = ArgVGPRs[RegIdx];
2739 Reg = CCInfo.AllocateReg(Reg);
2740 assert(Reg != AMDGPU::NoRegister);
2741
2742 MachineFunction &MF = CCInfo.getMachineFunction();
2743 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2744 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2745 return ArgDescriptor::createRegister(Reg, Mask);
2746}
2747
2749 const TargetRegisterClass *RC,
2750 unsigned NumArgRegs) {
2751 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2752 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2753 if (RegIdx == ArgSGPRs.size())
2754 report_fatal_error("ran out of SGPRs for arguments");
2755
2756 unsigned Reg = ArgSGPRs[RegIdx];
2757 Reg = CCInfo.AllocateReg(Reg);
2758 assert(Reg != AMDGPU::NoRegister);
2759
2760 MachineFunction &MF = CCInfo.getMachineFunction();
2761 MF.addLiveIn(Reg, RC);
2763}
2764
2765// If this has a fixed position, we still should allocate the register in the
2766// CCInfo state. Technically we could get away with this for values passed
2767// outside of the normal argument range.
2769 const TargetRegisterClass *RC,
2770 MCRegister Reg) {
2771 Reg = CCInfo.AllocateReg(Reg);
2772 assert(Reg != AMDGPU::NoRegister);
2773 MachineFunction &MF = CCInfo.getMachineFunction();
2774 MF.addLiveIn(Reg, RC);
2775}
2776
2777static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2778 if (Arg) {
2779 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2780 Arg.getRegister());
2781 } else
2782 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2783}
2784
2785static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2786 if (Arg) {
2787 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2788 Arg.getRegister());
2789 } else
2790 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2791}
2792
2793/// Allocate implicit function VGPR arguments at the end of allocated user
2794/// arguments.
2796 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2797 SIMachineFunctionInfo &Info) const {
2798 const unsigned Mask = 0x3ff;
2799 ArgDescriptor Arg;
2800
2801 if (Info.hasWorkItemIDX()) {
2802 Arg = allocateVGPR32Input(CCInfo, Mask);
2803 Info.setWorkItemIDX(Arg);
2804 }
2805
2806 if (Info.hasWorkItemIDY()) {
2807 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2808 Info.setWorkItemIDY(Arg);
2809 }
2810
2811 if (Info.hasWorkItemIDZ())
2812 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2813}
2814
2815/// Allocate implicit function VGPR arguments in fixed registers.
2817 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2818 SIMachineFunctionInfo &Info) const {
2819 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2820 if (!Reg)
2821 report_fatal_error("failed to allocate VGPR for implicit arguments");
2822
2823 const unsigned Mask = 0x3ff;
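  // All three workitem IDs are packed into 10-bit fields of the single VGPR
  // allocated above.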
2824 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2825 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2826 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2827}
2828
2830 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2831 SIMachineFunctionInfo &Info) const {
2832 auto &ArgInfo = Info.getArgInfo();
2833 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2834
2835 // TODO: Unify handling with private memory pointers.
2836 if (UserSGPRInfo.hasDispatchPtr())
2837 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2838
2839 if (UserSGPRInfo.hasQueuePtr())
2840 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2841
2842 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2843 // constant offset from the kernarg segment.
2844 if (Info.hasImplicitArgPtr())
2845 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2846
2847 if (UserSGPRInfo.hasDispatchID())
2848 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2849
2850 // flat_scratch_init is not applicable for non-kernel functions.
2851
2852 if (Info.hasWorkGroupIDX())
2853 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2854
2855 if (Info.hasWorkGroupIDY())
2856 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2857
2858 if (Info.hasWorkGroupIDZ())
2859 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2860
2861 if (Info.hasLDSKernelId())
2862 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2863}
2864
2865// Allocate special inputs passed in user SGPRs.
2867 MachineFunction &MF,
2868 const SIRegisterInfo &TRI,
2869 SIMachineFunctionInfo &Info) const {
2870 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2871 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2872 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2873 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2874 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2875 }
2876
2877 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2878 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2879 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2880 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2881 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2882 }
2883
2884 if (UserSGPRInfo.hasDispatchPtr()) {
2885 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2886 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2887 CCInfo.AllocateReg(DispatchPtrReg);
2888 }
2889
2890 if (UserSGPRInfo.hasQueuePtr()) {
2891 Register QueuePtrReg = Info.addQueuePtr(TRI);
2892 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2893 CCInfo.AllocateReg(QueuePtrReg);
2894 }
2895
2896 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2898 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2899 CCInfo.AllocateReg(InputPtrReg);
2900
2901 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2902 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2903 }
2904
2905 if (UserSGPRInfo.hasDispatchID()) {
2906 Register DispatchIDReg = Info.addDispatchID(TRI);
2907 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2908 CCInfo.AllocateReg(DispatchIDReg);
2909 }
2910
2911 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2912 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2913 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2914 CCInfo.AllocateReg(FlatScratchInitReg);
2915 }
2916
2917 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2918 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2919 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2920 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2921 }
2922
2923 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2924 // these from the dispatch pointer.
2925}
2926
 2927// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
 2928// sequential, starting from the first argument.
2930 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2932 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2933 Function &F = MF.getFunction();
2934 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2935 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2936 bool InPreloadSequence = true;
2937 unsigned InIdx = 0;
2938 bool AlignedForImplictArgs = false;
2939 unsigned ImplicitArgOffset = 0;
2940 for (auto &Arg : F.args()) {
2941 if (!InPreloadSequence || !Arg.hasInRegAttr())
2942 break;
2943
2944 unsigned ArgIdx = Arg.getArgNo();
2945 // Don't preload non-original args or parts not in the current preload
2946 // sequence.
2947 if (InIdx < Ins.size() &&
2948 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2949 break;
2950
2951 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2952 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2953 InIdx++) {
2954 assert(ArgLocs[ArgIdx].isMemLoc());
2955 auto &ArgLoc = ArgLocs[InIdx];
2956 const Align KernelArgBaseAlign = Align(16);
2957 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2958 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2959 unsigned NumAllocSGPRs =
2960 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2961
2962 // Fix alignment for hidden arguments.
2963 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2964 if (!AlignedForImplictArgs) {
2965 ImplicitArgOffset =
2966 alignTo(LastExplicitArgOffset,
2967 Subtarget->getAlignmentForImplicitArgPtr()) -
2968 LastExplicitArgOffset;
2969 AlignedForImplictArgs = true;
2970 }
2971 ArgOffset += ImplicitArgOffset;
2972 }
2973
2974 // Arg is preloaded into the previous SGPR.
2975 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2976 assert(InIdx >= 1 && "No previous SGPR");
2977 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2978 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2979 continue;
2980 }
2981
2982 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2983 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2984 // Check for free user SGPRs for preloading.
2985 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2986 InPreloadSequence = false;
2987 break;
2988 }
2989
2990 // Preload this argument.
2991 const TargetRegisterClass *RC =
2992 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2993 SmallVectorImpl<MCRegister> *PreloadRegs =
2994 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2995
2996 if (PreloadRegs->size() > 1)
2997 RC = &AMDGPU::SGPR_32RegClass;
2998 for (auto &Reg : *PreloadRegs) {
2999 assert(Reg);
3000 MF.addLiveIn(Reg, RC);
3001 CCInfo.AllocateReg(Reg);
3002 }
3003
3004 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3005 }
3006 }
3007}
3008
3010 const SIRegisterInfo &TRI,
3011 SIMachineFunctionInfo &Info) const {
3012 // Always allocate this last since it is a synthetic preload.
3013 if (Info.hasLDSKernelId()) {
3014 Register Reg = Info.addLDSKernelId();
3015 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3016 CCInfo.AllocateReg(Reg);
3017 }
3018}
3019
3020// Allocate special input registers that are initialized per-wave.
3023 CallingConv::ID CallConv,
3024 bool IsShader) const {
3025 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3026 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3027 // Note: user SGPRs are handled by the front-end for graphics shaders
3028 // Pad up the used user SGPRs with dead inputs.
3029
3030 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3031 // before enabling architected SGPRs for workgroup IDs.
3032 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3033
3034 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3035 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3036 // rely on it to reach 16 since if we end up having no stack usage, it will
3037 // not really be added.
3038 unsigned NumRequiredSystemSGPRs =
3039 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3040 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3041 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3042 Register Reg = Info.addReservedUserSGPR();
3043 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3044 CCInfo.AllocateReg(Reg);
3045 }
3046 }
3047
3048 if (!HasArchitectedSGPRs) {
3049 if (Info.hasWorkGroupIDX()) {
3050 Register Reg = Info.addWorkGroupIDX();
3051 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(Reg);
3053 }
3054
3055 if (Info.hasWorkGroupIDY()) {
3056 Register Reg = Info.addWorkGroupIDY();
3057 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3058 CCInfo.AllocateReg(Reg);
3059 }
3060
3061 if (Info.hasWorkGroupIDZ()) {
3062 Register Reg = Info.addWorkGroupIDZ();
3063 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3064 CCInfo.AllocateReg(Reg);
3065 }
3066 }
3067
3068 if (Info.hasWorkGroupInfo()) {
3069 Register Reg = Info.addWorkGroupInfo();
3070 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3071 CCInfo.AllocateReg(Reg);
3072 }
3073
3074 if (Info.hasPrivateSegmentWaveByteOffset()) {
3075 // Scratch wave offset passed in system SGPR.
3076 unsigned PrivateSegmentWaveByteOffsetReg;
3077
3078 if (IsShader) {
3079 PrivateSegmentWaveByteOffsetReg =
3080 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3081
3082 // This is true if the scratch wave byte offset doesn't have a fixed
3083 // location.
3084 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3085 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3086 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3087 }
3088 } else
3089 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3090
3091 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3092 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3093 }
3094
3095 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3096 Info.getNumPreloadedSGPRs() >= 16);
3097}
3098
3100 MachineFunction &MF,
3101 const SIRegisterInfo &TRI,
3103 // Now that we've figured out where the scratch register inputs are, see if
 3104  // we should reserve the arguments and use them directly.
3105 MachineFrameInfo &MFI = MF.getFrameInfo();
3106 bool HasStackObjects = MFI.hasStackObjects();
3107 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3108
3109 // Record that we know we have non-spill stack objects so we don't need to
3110 // check all stack objects later.
3111 if (HasStackObjects)
3112 Info.setHasNonSpillStackObjects(true);
3113
3114 // Everything live out of a block is spilled with fast regalloc, so it's
3115 // almost certain that spilling will be required.
3116 if (TM.getOptLevel() == CodeGenOptLevel::None)
3117 HasStackObjects = true;
3118
 3119  // For now assume stack access is needed in any callee functions, so we
 3120  // need to pass in the scratch registers.
3121 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3122
3123 if (!ST.enableFlatScratch()) {
3124 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3125 // If we have stack objects, we unquestionably need the private buffer
3126 // resource. For the Code Object V2 ABI, this will be the first 4 user
3127 // SGPR inputs. We can reserve those and use them directly.
3128
3129 Register PrivateSegmentBufferReg =
3131 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3132 } else {
3133 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 3134      // We tentatively reserve the highest available registers (skipping those
 3135      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3136 // we'll replace these with the ones immediately after those which were
3137 // really allocated. In the prologue copies will be inserted from the
3138 // argument to these reserved registers.
3139
3140 // Without HSA, relocations are used for the scratch pointer and the
3141 // buffer resource setup is always inserted in the prologue. Scratch wave
3142 // offset is still in an input SGPR.
3143 Info.setScratchRSrcReg(ReservedBufferReg);
3144 }
3145 }
3146
3148
3149 // For entry functions we have to set up the stack pointer if we use it,
3150 // whereas non-entry functions get this "for free". This means there is no
3151 // intrinsic advantage to using S32 over S34 in cases where we do not have
3152 // calls but do need a frame pointer (i.e. if we are requested to have one
3153 // because frame pointer elimination is disabled). To keep things simple we
3154 // only ever use S32 as the call ABI stack pointer, and so using it does not
3155 // imply we need a separate frame pointer.
3156 //
3157 // Try to use s32 as the SP, but move it if it would interfere with input
3158 // arguments. This won't work with calls though.
3159 //
3160 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3161 // registers.
3162 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3163 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3164 } else {
3166
3167 if (MFI.hasCalls())
3168 report_fatal_error("call in graphics shader with too many input SGPRs");
3169
3170 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3171 if (!MRI.isLiveIn(Reg)) {
3172 Info.setStackPtrOffsetReg(Reg);
3173 break;
3174 }
3175 }
3176
3177 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3178 report_fatal_error("failed to find register for SP");
3179 }
3180
3181 // hasFP should be accurate for entry functions even before the frame is
3182 // finalized, because it does not rely on the known stack size, only
3183 // properties like whether variable sized objects are present.
3184 if (ST.getFrameLowering()->hasFP(MF)) {
3185 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3186 }
3187}
3188
3191 return !Info->isEntryFunction();
3192}
3193
3195
3197 MachineBasicBlock *Entry,
3198 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3200
3201 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3202 if (!IStart)
3203 return;
3204
3205 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3206 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3207 MachineBasicBlock::iterator MBBI = Entry->begin();
3208 for (const MCPhysReg *I = IStart; *I; ++I) {
3209 const TargetRegisterClass *RC = nullptr;
3210 if (AMDGPU::SReg_64RegClass.contains(*I))
3211 RC = &AMDGPU::SGPR_64RegClass;
3212 else if (AMDGPU::SReg_32RegClass.contains(*I))
3213 RC = &AMDGPU::SGPR_32RegClass;
3214 else
3215 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3216
3217 Register NewVR = MRI->createVirtualRegister(RC);
3218 // Create copy from CSR to a virtual register.
3219 Entry->addLiveIn(*I);
3220 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3221 .addReg(*I);
3222
3223 // Insert the copy-back instructions right before the terminator.
3224 for (auto *Exit : Exits)
3225 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3226 TII->get(TargetOpcode::COPY), *I)
3227 .addReg(NewVR);
3228 }
3229}
3230
3232 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3233 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3234 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3236
3238 const Function &Fn = MF.getFunction();
3241 bool IsError = false;
3242
3243 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3245 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3246 IsError = true;
3247 }
3248
3251 BitVector Skipped(Ins.size());
3252 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3253 *DAG.getContext());
3254
3255 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3256 bool IsKernel = AMDGPU::isKernel(CallConv);
3257 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3258
3259 if (IsGraphics) {
3260 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3261 assert(!UserSGPRInfo.hasDispatchPtr() &&
3262 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3263 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3264 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3265 (void)UserSGPRInfo;
3266 if (!Subtarget->enableFlatScratch())
3267 assert(!UserSGPRInfo.hasFlatScratchInit());
3268 if ((CallConv != CallingConv::AMDGPU_CS &&
3269 CallConv != CallingConv::AMDGPU_Gfx &&
3270 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3271 !Subtarget->hasArchitectedSGPRs())
3272 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3273 !Info->hasWorkGroupIDZ());
3274 }
3275
3276 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3277
3278 if (CallConv == CallingConv::AMDGPU_PS) {
3279 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3280
3281 // At least one interpolation mode must be enabled or else the GPU will
3282 // hang.
3283 //
3284 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3285 // set PSInputAddr, the user wants to enable some bits after the compilation
3286 // based on run-time states. Since we can't know what the final PSInputEna
 3287    // will look like, we shouldn't do anything here and the user should take
3288 // responsibility for the correct programming.
3289 //
3290 // Otherwise, the following restrictions apply:
3291 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3292 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3293 // enabled too.
3294 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3295 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3296 CCInfo.AllocateReg(AMDGPU::VGPR0);
3297 CCInfo.AllocateReg(AMDGPU::VGPR1);
3298 Info->markPSInputAllocated(0);
3299 Info->markPSInputEnabled(0);
3300 }
3301 if (Subtarget->isAmdPalOS()) {
3302 // For isAmdPalOS, the user does not enable some bits after compilation
3303 // based on run-time states; the register values being generated here are
3304 // the final ones set in hardware. Therefore we need to apply the
3305 // workaround to PSInputAddr and PSInputEnable together. (The case where
3306 // a bit is set in PSInputAddr but not PSInputEnable is where the
3307 // frontend set up an input arg for a particular interpolation mode, but
3308 // nothing uses that input arg. Really we should have an earlier pass
3309 // that removes such an arg.)
3310 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3311 if ((PsInputBits & 0x7F) == 0 ||
3312 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3313 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3314 }
3315 } else if (IsKernel) {
3316 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3317 } else {
3318 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3319 Ins.end());
3320 }
3321
3322 if (IsKernel)
3323 analyzeFormalArgumentsCompute(CCInfo, Ins);
3324
3325 if (IsEntryFunc) {
3326 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3327 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3328 if (IsKernel && Subtarget->hasKernargPreload())
3329 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3330
3331 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3332 } else if (!IsGraphics) {
3333 // For the fixed ABI, pass workitem IDs in the last argument register.
3334 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3335
3336 // FIXME: Sink this into allocateSpecialInputSGPRs
3337 if (!Subtarget->enableFlatScratch())
3338 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3339
3340 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3341 }
3342
3343 if (!IsKernel) {
3344 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3345 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3346
3347 // This assumes the registers are allocated by CCInfo in ascending order
3348 // with no gaps.
3349 Info->setNumWaveDispatchSGPRs(
3350 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3351 Info->setNumWaveDispatchVGPRs(
3352 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3353 } else if (Info->getNumKernargPreloadedSGPRs()) {
3354 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3355 }
3356
3358
3359 if (IsWholeWaveFunc) {
3361 {MVT::i1, MVT::Other}, Chain);
3362 InVals.push_back(Setup.getValue(0));
3363 Chains.push_back(Setup.getValue(1));
3364 }
3365
3366 // FIXME: This is the minimum kernel argument alignment. We should improve
3367 // this to the maximum alignment of the arguments.
3368 //
3369 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3370 // kern arg offset.
3371 const Align KernelArgBaseAlign = Align(16);
3372
3373 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3374 ++i) {
3375 const ISD::InputArg &Arg = Ins[i];
3376 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3377 InVals.push_back(DAG.getPOISON(Arg.VT));
3378 continue;
3379 }
3380
3381 CCValAssign &VA = ArgLocs[ArgIdx++];
3382 MVT VT = VA.getLocVT();
3383
3384 if (IsEntryFunc && VA.isMemLoc()) {
3385 VT = Ins[i].VT;
3386 EVT MemVT = VA.getLocVT();
3387
3388 const uint64_t Offset = VA.getLocMemOffset();
3389 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3390
3391 if (Arg.Flags.isByRef()) {
3392 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3393
3394 const GCNTargetMachine &TM =
3395 static_cast<const GCNTargetMachine &>(getTargetMachine());
3396 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3397 Arg.Flags.getPointerAddrSpace())) {
3398 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3399 Arg.Flags.getPointerAddrSpace());
3400 }
3401
3402 InVals.push_back(Ptr);
3403 continue;
3404 }
3405
3406 SDValue NewArg;
3407 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3408 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3409 // In this case the argument is packed into the previous preload SGPR.
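 // For example, an i16 argument at kernarg offset 2 shares the SGPR holding
 // bytes 0-3: AlignDownOffset is 0, OffsetDiff is 2, and the value is
 // recovered by shifting the preloaded 32-bit value right by 16 bits before
 // truncating.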
3410 int64_t AlignDownOffset = alignDown(Offset, 4);
3411 int64_t OffsetDiff = Offset - AlignDownOffset;
3412 EVT IntVT = MemVT.changeTypeToInteger();
3413
3414 const SIMachineFunctionInfo *Info =
3417 Register Reg =
3418 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3419
3420 assert(Reg);
3421 Register VReg = MRI.getLiveInVirtReg(Reg);
3422 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3423
3424 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3425 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3426
3427 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3428 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3429 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3430 Ins[i].Flags.isSExt(), &Ins[i]);
3431
3432 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3433 } else {
3434 const SIMachineFunctionInfo *Info =
3437 const SmallVectorImpl<MCRegister> &PreloadRegs =
3438 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3439
3440 SDValue Copy;
3441 if (PreloadRegs.size() == 1) {
3442 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3443 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3444 NewArg = DAG.getCopyFromReg(
3445 Chain, DL, VReg,
3446 EVT::getIntegerVT(*DAG.getContext(),
3447 TRI->getRegSizeInBits(*RC)));
3448
3449 } else {
3450 // If the kernarg alignment does not match the alignment of the SGPR
3451 // tuple RC that can accommodate this argument, it will be built up
3452 // via copies from the individual SGPRs that the argument was
3453 // preloaded to.
3454 SmallVector<SDValue, 4> Elts;
3455 for (auto Reg : PreloadRegs) {
3456 Register VReg = MRI.getLiveInVirtReg(Reg);
3457 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3458 Elts.push_back(Copy);
3459 }
3460 NewArg =
3461 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3462 PreloadRegs.size()),
3463 DL, Elts);
3464 }
3465
3466 // If the argument was preloaded to multiple consecutive 32-bit
3467 // registers because of misalignment between addressable SGPR tuples
3468 // and the argument size, we can still assume, because of kernarg
3469 // segment alignment restrictions, that NewArg's size is the same as
3470 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3471 // truncate since we cannot preload to less than a single SGPR and the
3472 // MemVT may be smaller.
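 // For instance, an i96 argument preloaded into three consecutive SGPRs is
 // reassembled as a v3i32 build_vector and bitcast back to MemVT; a narrower
 // MemVT would first be truncated to MemVTInt.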
3473 EVT MemVTInt =
3474 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3475 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3476 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3477
3478 NewArg = DAG.getBitcast(MemVT, NewArg);
3479 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3480 Ins[i].Flags.isSExt(), &Ins[i]);
3481 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3482 }
3483 } else {
3484 // Hidden arguments that are in the kernel signature must be preloaded
3485 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3486 // the argument list and is not preloaded.
3487 if (Arg.isOrigArg()) {
3488 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3489 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3490 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3491 *OrigArg->getParent(),
3492 "hidden argument in kernel signature was not preloaded",
3493 DL.getDebugLoc()));
3494 }
3495 }
3496
3497 NewArg =
3498 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3499 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3500 }
3501 Chains.push_back(NewArg.getValue(1));
3502
3503 auto *ParamTy =
3504 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3505 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3506 ParamTy &&
3507 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3508 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3509 // On SI, local pointers are just offsets into LDS, so they always fit in
3510 // 16 bits. On CI and newer they could potentially be
3511 // real pointers, so we can't guarantee their size.
3512 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3513 DAG.getValueType(MVT::i16));
3514 }
3515
3516 InVals.push_back(NewArg);
3517 continue;
3518 }
3519 if (!IsEntryFunc && VA.isMemLoc()) {
3520 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3521 InVals.push_back(Val);
3522 if (!Arg.Flags.isByVal())
3523 Chains.push_back(Val.getValue(1));
3524 continue;
3525 }
3526
3527 assert(VA.isRegLoc() && "Parameter must be in a register!");
3528
3529 Register Reg = VA.getLocReg();
3530 const TargetRegisterClass *RC = nullptr;
3531 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3532 RC = &AMDGPU::VGPR_32RegClass;
3533 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3534 RC = &AMDGPU::SGPR_32RegClass;
3535 else
3536 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3537
3538 Reg = MF.addLiveIn(Reg, RC);
3539 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3540
3541 if (Arg.Flags.isSRet()) {
3542 // The return object should be reasonably addressable.
3543
3544 // FIXME: This helps when the return is a real sret. If it is an
3545 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3546 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3547 unsigned NumBits =
3548 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3549 Val = DAG.getNode(
3550 ISD::AssertZext, DL, VT, Val,
3551 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3552 }
3553
3554 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3555 InVals.push_back(Val);
3556 }
3557
3558 // Start adding system SGPRs.
3559 if (IsEntryFunc)
3560 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3561
3562 // DAG.getPass() returns nullptr when using new pass manager.
3563 // TODO: Use DAG.getMFAM() to access analysis result.
3564 if (DAG.getPass()) {
3565 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3566 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3567 }
3568
3569 unsigned StackArgSize = CCInfo.getStackSize();
3570 Info->setBytesInStackArgArea(StackArgSize);
3571
3572 return Chains.empty() ? Chain
3573 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3574}
3575
3576// TODO: If return values can't fit in registers, we should return as many as
3577// possible in registers before passing on stack.
3578bool SITargetLowering::CanLowerReturn(
3579 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3580 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3581 const Type *RetTy) const {
3582 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3583 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3584 // for shaders. Vector types should be explicitly handled by CC.
3585 if (AMDGPU::isEntryFunctionCC(CallConv))
3586 return true;
3587
3588 SmallVector<CCValAssign, 16> RVLocs;
3589 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3590 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3591 return false;
3592
3593 // We must use the stack if return would require unavailable registers.
3594 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3595 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3596 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3597 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3598 return false;
3599
3600 return true;
3601}
3602
3603SDValue
3604SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3605 bool isVarArg,
3606 const SmallVectorImpl<ISD::OutputArg> &Outs,
3607 const SmallVectorImpl<SDValue> &OutVals,
3608 const SDLoc &DL, SelectionDAG &DAG) const {
3612
3613 if (AMDGPU::isKernel(CallConv)) {
3614 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3615 OutVals, DL, DAG);
3616 }
3617
3618 bool IsShader = AMDGPU::isShader(CallConv);
3619
3620 Info->setIfReturnsVoid(Outs.empty());
3621 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3622
3623 // CCValAssign - represent the assignment of the return value to a location.
3624 SmallVector<CCValAssign, 48> RVLocs;
3625
3626 // CCState - Info about the registers and stack slots.
3627 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3628 *DAG.getContext());
3629
3630 // Analyze outgoing return values.
3631 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3632
3633 SDValue Glue;
3634 SmallVector<SDValue, 48> RetOps;
3635 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3636
3637 SDValue ReadFirstLane =
3638 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3639 // Copy the result values into the output registers.
3640 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3641 ++I, ++RealRVLocIdx) {
3642 CCValAssign &VA = RVLocs[I];
3643 assert(VA.isRegLoc() && "Can only return in registers!");
3644 // TODO: Partially return in registers if return values don't fit.
3645 SDValue Arg = OutVals[RealRVLocIdx];
3646
3647 // Copied from other backends.
3648 switch (VA.getLocInfo()) {
3649 case CCValAssign::Full:
3650 break;
3651 case CCValAssign::BCvt:
3652 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3653 break;
3654 case CCValAssign::SExt:
3655 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3656 break;
3657 case CCValAssign::ZExt:
3658 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3659 break;
3660 case CCValAssign::AExt:
3661 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3662 break;
3663 default:
3664 llvm_unreachable("Unknown loc info!");
3665 }
3666 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3667 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3668 ReadFirstLane, Arg);
3669 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3670 Glue = Chain.getValue(1);
3671 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3672 }
3673
3674 // FIXME: Does sret work properly?
3675 if (!Info->isEntryFunction()) {
3676 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3677 const MCPhysReg *I =
3678 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3679 if (I) {
3680 for (; *I; ++I) {
3681 if (AMDGPU::SReg_64RegClass.contains(*I))
3682 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3683 else if (AMDGPU::SReg_32RegClass.contains(*I))
3684 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3685 else
3686 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3687 }
3688 }
3689 }
3690
3691 // Update chain and glue.
3692 RetOps[0] = Chain;
3693 if (Glue.getNode())
3694 RetOps.push_back(Glue);
3695
3696 unsigned Opc = AMDGPUISD::ENDPGM;
3697 if (!IsWaveEnd)
3698 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3699 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3700 : AMDGPUISD::RET_GLUE;
3701 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3702}
3703
3705 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3706 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3707 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3708 SDValue ThisVal) const {
3709 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3710
3711 // Assign locations to each value returned by this call.
3712 SmallVector<CCValAssign, 16> RVLocs;
3713 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3714 *DAG.getContext());
3715 CCInfo.AnalyzeCallResult(Ins, RetCC);
3716
3717 // Copy all of the result registers out of their specified physreg.
3718 for (CCValAssign VA : RVLocs) {
3719 SDValue Val;
3720
3721 if (VA.isRegLoc()) {
3722 Val =
3723 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3724 Chain = Val.getValue(1);
3725 InGlue = Val.getValue(2);
3726 } else if (VA.isMemLoc()) {
3727 report_fatal_error("TODO: return values in memory");
3728 } else
3729 llvm_unreachable("unknown argument location type");
3730
3731 switch (VA.getLocInfo()) {
3732 case CCValAssign::Full:
3733 break;
3734 case CCValAssign::BCvt:
3735 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3736 break;
3737 case CCValAssign::ZExt:
3738 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3739 DAG.getValueType(VA.getValVT()));
3740 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3741 break;
3742 case CCValAssign::SExt:
3743 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3744 DAG.getValueType(VA.getValVT()));
3745 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3746 break;
3747 case CCValAssign::AExt:
3748 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3749 break;
3750 default:
3751 llvm_unreachable("Unknown loc info!");
3752 }
3753
3754 InVals.push_back(Val);
3755 }
3756
3757 return Chain;
3758}
3759
3760// Add code to pass special inputs required depending on used features separate
3761// from the explicit user arguments present in the IR.
3763 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3764 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3765 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3766 // If we don't have a call site, this was a call inserted by
3767 // legalization. These can never use special inputs.
3768 if (!CLI.CB)
3769 return;
3770
3771 SelectionDAG &DAG = CLI.DAG;
3772 const SDLoc &DL = CLI.DL;
3773 const Function &F = DAG.getMachineFunction().getFunction();
3774
3775 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3776 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3777
3778 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3779 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3780 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3781 // DAG.getPass() returns nullptr when using new pass manager.
3782 // TODO: Use DAG.getMFAM() to access analysis result.
3783 if (DAG.getPass()) {
3784 auto &ArgUsageInfo =
3785 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3786 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3787 }
3788 }
3789
3790 // TODO: Unify with private memory register handling. This is complicated by
3791 // the fact that at least in kernels, the input argument is not necessarily
3792 // in the same location as the input.
3793 // clang-format off
3794 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3795 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3796 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3797 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3798 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3799 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3800 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3803 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3804 };
3805 // clang-format on
3806
3807 for (auto [InputID, Attrs] : ImplicitAttrs) {
3808 // If the callee does not use the attribute value, skip copying the value.
3809 if (all_of(Attrs, [&](StringRef Attr) {
3810 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3811 }))
3812 continue;
3813
3814 const auto [OutgoingArg, ArgRC, ArgTy] =
3815 CalleeArgInfo->getPreloadedValue(InputID);
3816 if (!OutgoingArg)
3817 continue;
3818
3819 const auto [IncomingArg, IncomingArgRC, Ty] =
3820 CallerArgInfo.getPreloadedValue(InputID);
3821 assert(IncomingArgRC == ArgRC);
3822
3823 // All special arguments are ints for now.
3824 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3825 SDValue InputReg;
3826
3827 if (IncomingArg) {
3828 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3829 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3830 // The implicit arg ptr is special because it doesn't have a corresponding
3831 // input for kernels, and is computed from the kernarg segment pointer.
3832 InputReg = getImplicitArgPtr(DAG, DL);
3833 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3834 std::optional<uint32_t> Id =
3835 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3836 if (Id.has_value()) {
3837 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3838 } else {
3839 InputReg = DAG.getPOISON(ArgVT);
3840 }
3841 } else {
3842 // We may have proven the input wasn't needed, although the ABI is
3843 // requiring it. We just need to allocate the register appropriately.
3844 InputReg = DAG.getPOISON(ArgVT);
3845 }
3846
3847 if (OutgoingArg->isRegister()) {
3848 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3849 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3850 report_fatal_error("failed to allocate implicit input argument");
3851 } else {
3852 unsigned SpecialArgOffset =
3853 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3854 SDValue ArgStore =
3855 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3856 MemOpChains.push_back(ArgStore);
3857 }
3858 }
3859
3860 // Pack workitem IDs into a single register or pass it as is if already
3861 // packed.
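 // Packed workitem IDs use 10 bits per component: X in bits 9:0, Y in bits
 // 19:10 and Z in bits 29:20, which is why Y and Z are shifted by 10 and 20
 // below before being OR'd in.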
3862
3863 auto [OutgoingArg, ArgRC, Ty] =
3864 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3865 if (!OutgoingArg)
3866 std::tie(OutgoingArg, ArgRC, Ty) =
3867 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3868 if (!OutgoingArg)
3869 std::tie(OutgoingArg, ArgRC, Ty) =
3870 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3871 if (!OutgoingArg)
3872 return;
3873
3874 const ArgDescriptor *IncomingArgX = std::get<0>(
3875 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3876 const ArgDescriptor *IncomingArgY = std::get<0>(
3877 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3878 const ArgDescriptor *IncomingArgZ = std::get<0>(
3879 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3880
3881 SDValue InputReg;
3882 SDLoc SL;
3883
3884 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3885 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3886 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3887
3888 // If the incoming IDs are not packed, we need to pack them.
3889 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3890 NeedWorkItemIDX) {
3891 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3892 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3893 } else {
3894 InputReg = DAG.getConstant(0, DL, MVT::i32);
3895 }
3896 }
3897
3898 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3899 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3900 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3901 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3902 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3903 InputReg = InputReg.getNode()
3904 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3905 : Y;
3906 }
3907
3908 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3909 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3910 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3911 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3912 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3913 InputReg = InputReg.getNode()
3914 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3915 : Z;
3916 }
3917
3918 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3919 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3920 // We're in a situation where the outgoing function requires the workitem
3921 // ID, but the calling function does not have it (e.g. a graphics function
3922 // calling a C calling convention function). This is illegal, but we need
3923 // to produce something.
3924 InputReg = DAG.getPOISON(MVT::i32);
3925 } else {
3926 // Workitem IDs are already packed; any of the present incoming arguments
3927 // will carry all the required fields.
3928 ArgDescriptor IncomingArg =
3929 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3930 : IncomingArgY ? *IncomingArgY
3931 : *IncomingArgZ,
3932 ~0u);
3933 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3934 }
3935 }
3936
3937 if (OutgoingArg->isRegister()) {
3938 if (InputReg)
3939 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3940
3941 CCInfo.AllocateReg(OutgoingArg->getRegister());
3942 } else {
3943 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3944 if (InputReg) {
3945 SDValue ArgStore =
3946 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3947 MemOpChains.push_back(ArgStore);
3948 }
3949 }
3950}
3951
3952bool SITargetLowering::isEligibleForTailCallOptimization(
3953 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3954 const SmallVectorImpl<ISD::OutputArg> &Outs,
3955 const SmallVectorImpl<SDValue> &OutVals,
3956 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3957 if (AMDGPU::isChainCC(CalleeCC))
3958 return true;
3959
3960 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3961 return false;
3962
3963 // For a divergent call target, we need to do a waterfall loop over the
3964 // possible callees which precludes us from using a simple jump.
3965 if (Callee->isDivergent())
3966 return false;
3967
3968 MachineFunction &MF = DAG.getMachineFunction();
3969 const Function &CallerF = MF.getFunction();
3970 CallingConv::ID CallerCC = CallerF.getCallingConv();
3972 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3973
3974 // Kernels aren't callable, and don't have a live in return address so it
3975 // doesn't make sense to do a tail call with entry functions.
3976 if (!CallerPreserved)
3977 return false;
3978
3979 bool CCMatch = CallerCC == CalleeCC;
3980
3982 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3983 return true;
3984 return false;
3985 }
3986
3987 // TODO: Can we handle var args?
3988 if (IsVarArg)
3989 return false;
3990
3991 for (const Argument &Arg : CallerF.args()) {
3992 if (Arg.hasByValAttr())
3993 return false;
3994 }
3995
3996 LLVMContext &Ctx = *DAG.getContext();
3997
3998 // Check that the call results are passed in the same way.
3999 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4000 CCAssignFnForCall(CalleeCC, IsVarArg),
4001 CCAssignFnForCall(CallerCC, IsVarArg)))
4002 return false;
4003
4004 // The callee has to preserve all registers the caller needs to preserve.
4005 if (!CCMatch) {
4006 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4007 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4008 return false;
4009 }
4010
4011 // Nothing more to check if the callee is taking no arguments.
4012 if (Outs.empty())
4013 return true;
4014
4015 SmallVector<CCValAssign, 16> ArgLocs;
4016 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4017
4018 // FIXME: We are not allocating special input registers, so we will be
4019 // deciding based on incorrect register assignments.
4020 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4021
4022 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4023 // If the stack arguments for this call do not fit into our own save area then
4024 // the call cannot be made tail.
4025 // TODO: Is this really necessary?
4026 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4027 return false;
4028
4029 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4030 // FIXME: What about inreg arguments that end up passed in memory?
4031 if (!CCVA.isRegLoc())
4032 continue;
4033
4034 // If we are passing an argument in an SGPR, and the value is divergent,
4035 // this call requires a waterfall loop.
4036 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4037 LLVM_DEBUG(
4038 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4039 << printReg(CCVA.getLocReg(), TRI) << '\n');
4040 return false;
4041 }
4042 }
4043
4044 const MachineRegisterInfo &MRI = MF.getRegInfo();
4045 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4046}
4047
4048bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4049 if (!CI->isTailCall())
4050 return false;
4051
4052 const Function *ParentFn = CI->getParent()->getParent();
4053 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4054 return false;
4055 return true;
4056}
4057
4058namespace {
4059// Chain calls have special arguments that we need to handle. These are
4060// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4061// arguments (index 0 and 1 respectively).
4062enum ChainCallArgIdx {
4063 Exec = 2,
4064 Flags,
4065 NumVGPRs,
4066 FallbackExec,
4067 FallbackCallee
4068};
4069} // anonymous namespace
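// With the conventions above, the outgoing argument list of an
// llvm.amdgcn.cs.chain call is laid out as: [0] the SGPR arguments, [1] the
// VGPR arguments, [2] the EXEC mask, [3] the flags, and, when bit 0 of the
// flags is set (dynamic VGPR mode), [4] the number of VGPRs, [5] the fallback
// EXEC and [6] the fallback callee.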
4070
4071// The wave scratch offset register is used as the global base pointer.
4072SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4073 SmallVectorImpl<SDValue> &InVals) const {
4074 CallingConv::ID CallConv = CLI.CallConv;
4075 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4076
4077 SelectionDAG &DAG = CLI.DAG;
4078
4079 const SDLoc &DL = CLI.DL;
4080 SDValue Chain = CLI.Chain;
4081 SDValue Callee = CLI.Callee;
4082
4083 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4084 bool UsesDynamicVGPRs = false;
4085 if (IsChainCallConv) {
4086 // The last arguments should be the value that we need to put in EXEC,
4087 // followed by the flags and any other arguments with special meanings.
4088 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4089 // we don't treat them like the "real" arguments.
4090 auto RequestedExecIt =
4091 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4092 return Arg.OrigArgIndex == 2;
4093 });
4094 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4095
4096 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4097 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4098 CLI.OutVals.end());
4099 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4100
4101 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4102 "Haven't popped all the special args");
4103
4104 TargetLowering::ArgListEntry RequestedExecArg =
4105 CLI.Args[ChainCallArgIdx::Exec];
4106 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4107 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4108
4109 // Convert constants into TargetConstants, so they become immediate operands
4110 // instead of being selected into S_MOV.
4111 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4112 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4113 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4114 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4115 } else
4116 ChainCallSpecialArgs.push_back(Arg.Node);
4117 };
4118
4119 PushNodeOrTargetConstant(RequestedExecArg);
4120
4121 // Process any other special arguments depending on the value of the flags.
4122 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4123
4124 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4125 if (FlagsValue.isZero()) {
4126 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4127 return lowerUnhandledCall(CLI, InVals,
4128 "no additional args allowed if flags == 0");
4129 } else if (FlagsValue.isOneBitSet(0)) {
4130 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4131 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4132 }
4133
4134 if (!Subtarget->isWave32()) {
4135 return lowerUnhandledCall(
4136 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4137 }
4138
4139 UsesDynamicVGPRs = true;
4140 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4141 CLI.Args.end(), PushNodeOrTargetConstant);
4142 }
4143 }
4144
4145 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4146 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4147 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4148 bool &IsTailCall = CLI.IsTailCall;
4149 bool IsVarArg = CLI.IsVarArg;
4150 bool IsSibCall = false;
4151 MachineFunction &MF = DAG.getMachineFunction();
4152
4153 if (Callee.isUndef() || isNullConstant(Callee)) {
4154 if (!CLI.IsTailCall) {
4155 for (ISD::InputArg &Arg : CLI.Ins)
4156 InVals.push_back(DAG.getPOISON(Arg.VT));
4157 }
4158
4159 return Chain;
4160 }
4161
4162 if (IsVarArg) {
4163 return lowerUnhandledCall(CLI, InVals,
4164 "unsupported call to variadic function ");
4165 }
4166
4167 if (!CLI.CB)
4168 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4169
4170 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4171 return lowerUnhandledCall(CLI, InVals,
4172 "unsupported required tail call to function ");
4173 }
4174
4175 if (IsTailCall) {
4176 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4177 Outs, OutVals, Ins, DAG);
4178 if (!IsTailCall &&
4179 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4180 report_fatal_error("failed to perform tail call elimination on a call "
4181 "site marked musttail or on llvm.amdgcn.cs.chain");
4182 }
4183
4184 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4185
4186 // A sibling call is one where we're under the usual C ABI and not planning
4187 // to change that but can still do a tail call:
4188 if (!TailCallOpt && IsTailCall)
4189 IsSibCall = true;
4190
4191 if (IsTailCall)
4192 ++NumTailCalls;
4193 }
4194
4197 SmallVector<SDValue, 8> MemOpChains;
4198
4199 // Analyze operands of the call, assigning locations to each operand.
4200 SmallVector<CCValAssign, 16> ArgLocs;
4201 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4202 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4203
4204 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4206 // With a fixed ABI, allocate fixed registers before user arguments.
4207 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4208 }
4209
4210 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4211
4212 // Get a count of how many bytes are to be pushed on the stack.
4213 unsigned NumBytes = CCInfo.getStackSize();
4214
4215 if (IsSibCall) {
4216 // Since we're not changing the ABI to make this a tail call, the memory
4217 // operands are already available in the caller's incoming argument space.
4218 NumBytes = 0;
4219 }
4220
4221 // FPDiff is the byte offset of the call's argument area from the callee's.
4222 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4223 // by this amount for a tail call. In a sibling call it must be 0 because the
4224 // caller will deallocate the entire stack and the callee still expects its
4225 // arguments to begin at SP+0. Completely unused for non-tail calls.
4226 int32_t FPDiff = 0;
4227 MachineFrameInfo &MFI = MF.getFrameInfo();
4228 auto *TRI = Subtarget->getRegisterInfo();
4229
4230 // Adjust the stack pointer for the new arguments...
4231 // These operations are automatically eliminated by the prolog/epilog pass
4232 if (!IsSibCall)
4233 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4234
4235 if (!IsSibCall || IsChainCallConv) {
4236 if (!Subtarget->enableFlatScratch()) {
4237 SmallVector<SDValue, 4> CopyFromChains;
4238
4239 // In the HSA case, this should be an identity copy.
4240 SDValue ScratchRSrcReg =
4241 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4242 RegsToPass.emplace_back(IsChainCallConv
4243 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4244 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4245 ScratchRSrcReg);
4246 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4247 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4248 }
4249 }
4250
4251 const unsigned NumSpecialInputs = RegsToPass.size();
4252
4253 MVT PtrVT = MVT::i32;
4254
4255 // Walk the register/memloc assignments, inserting copies/loads.
4256 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4257 CCValAssign &VA = ArgLocs[i];
4258 SDValue Arg = OutVals[i];
4259
4260 // Promote the value if needed.
4261 switch (VA.getLocInfo()) {
4262 case CCValAssign::Full:
4263 break;
4264 case CCValAssign::BCvt:
4265 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4266 break;
4267 case CCValAssign::ZExt:
4268 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4269 break;
4270 case CCValAssign::SExt:
4271 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4272 break;
4273 case CCValAssign::AExt:
4274 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4275 break;
4276 case CCValAssign::FPExt:
4277 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4278 break;
4279 default:
4280 llvm_unreachable("Unknown loc info!");
4281 }
4282
4283 if (VA.isRegLoc()) {
4284 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4285 } else {
4286 assert(VA.isMemLoc());
4287
4288 SDValue DstAddr;
4289 MachinePointerInfo DstInfo;
4290
4291 unsigned LocMemOffset = VA.getLocMemOffset();
4292 int32_t Offset = LocMemOffset;
4293
4294 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4295 MaybeAlign Alignment;
4296
4297 if (IsTailCall) {
4298 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4299 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4300 : VA.getValVT().getStoreSize();
4301
4302 // FIXME: We can have better than the minimum byval required alignment.
4303 Alignment =
4304 Flags.isByVal()
4305 ? Flags.getNonZeroByValAlign()
4306 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4307
4308 Offset = Offset + FPDiff;
4309 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4310
4311 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4312 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4313
4314 // Make sure any stack arguments overlapping with where we're storing
4315 // are loaded before this eventual operation. Otherwise they'll be
4316 // clobbered.
4317
4318 // FIXME: Why is this really necessary? This seems to just result in a
4319 // lot of code to copy the stack arguments and write them back to the same
4320 // locations, which are supposed to be immutable?
4321 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4322 } else {
4323 // Stores to the argument stack area are relative to the stack pointer.
4324 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4325 MVT::i32);
4326 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4327 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4328 Alignment =
4329 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4330 }
4331
4332 if (Outs[i].Flags.isByVal()) {
4333 SDValue SizeNode =
4334 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4335 SDValue Cpy =
4336 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4337 Outs[i].Flags.getNonZeroByValAlign(),
4338 /*isVol = */ false, /*AlwaysInline = */ true,
4339 /*CI=*/nullptr, std::nullopt, DstInfo,
4341
4342 MemOpChains.push_back(Cpy);
4343 } else {
4344 SDValue Store =
4345 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4346 MemOpChains.push_back(Store);
4347 }
4348 }
4349 }
4350
4351 if (!MemOpChains.empty())
4352 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4353
4354 SDValue ReadFirstLaneID =
4355 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4356
4357 SDValue TokenGlue;
4358 if (CLI.ConvergenceControlToken) {
4359 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4360 CLI.ConvergenceControlToken);
4361 }
4362
4363 // Build a sequence of copy-to-reg nodes chained together with token chain
4364 // and flag operands which copy the outgoing args into the appropriate regs.
4365 SDValue InGlue;
4366
4367 unsigned ArgIdx = 0;
4368 for (auto [Reg, Val] : RegsToPass) {
4369 if (ArgIdx++ >= NumSpecialInputs &&
4370 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4371 // For chain calls, the inreg arguments are required to be
4372 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4373 // they are uniform.
4374 //
4375 // For other calls, if an inreg argument is known to be uniform,
4376 // speculatively insert a readfirstlane in case it is in a VGPR.
4377 //
4378 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4379 // value, so let that continue to produce invalid code.
4380
4381 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4382 if (TokenGlue)
4383 ReadfirstlaneArgs.push_back(TokenGlue);
4384 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4385 ReadfirstlaneArgs);
4386 }
4387
4388 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4389 InGlue = Chain.getValue(1);
4390 }
4391
4392 // We don't usually want to end the call-sequence here because we would tidy
4393 // the frame up *after* the call, however in the ABI-changing tail-call case
4394 // we've carefully laid out the parameters so that when sp is reset they'll be
4395 // in the correct location.
4396 if (IsTailCall && !IsSibCall) {
4397 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4398 InGlue = Chain.getValue(1);
4399 }
4400
4401 std::vector<SDValue> Ops({Chain});
4402
4403 // Add a redundant copy of the callee global which will not be legalized, as
4404 // we need direct access to the callee later.
4405 if (const GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4406 const GlobalValue *GV = GSD->getGlobal();
4407 Ops.push_back(Callee);
4408 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4409 } else {
4410 if (IsTailCall) {
4411 // isEligibleForTailCallOptimization considered whether the call target is
4412 // divergent, but we may still end up with a uniform value in a VGPR.
4413 // Insert a readfirstlane just in case.
4414 SDValue ReadFirstLaneID =
4415 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4416
4417 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4418 if (TokenGlue)
4419 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4420 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4421 ReadfirstlaneArgs);
4422 }
4423
4424 Ops.push_back(Callee);
4425 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4426 }
4427
4428 if (IsTailCall) {
4429 // Each tail call may have to adjust the stack by a different amount, so
4430 // this information must travel along with the operation for eventual
4431 // consumption by emitEpilogue.
4432 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4433 }
4434
4435 if (IsChainCallConv)
4436 llvm::append_range(Ops, ChainCallSpecialArgs);
4437
4438 // Add argument registers to the end of the list so that they are known live
4439 // into the call.
4440 for (auto &[Reg, Val] : RegsToPass)
4441 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4442
4443 // Add a register mask operand representing the call-preserved registers.
4444 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4445 assert(Mask && "Missing call preserved mask for calling convention");
4446 Ops.push_back(DAG.getRegisterMask(Mask));
4447
4448 if (SDValue Token = CLI.ConvergenceControlToken) {
4450 GlueOps.push_back(Token);
4451 if (InGlue)
4452 GlueOps.push_back(InGlue);
4453
4454 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4455 MVT::Glue, GlueOps),
4456 0);
4457 }
4458
4459 if (InGlue)
4460 Ops.push_back(InGlue);
4461
4462 // If we're doing a tail call, use a TC_RETURN here rather than an
4463 // actual call instruction.
4464 if (IsTailCall) {
4465 MFI.setHasTailCall();
4466 unsigned OPC = AMDGPUISD::TC_RETURN;
4467 switch (CallConv) {
4470 break;
4473 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4475 break;
4476 }
4477
4478 // If the caller is a whole wave function, we need to use a special opcode
4479 // so we can patch up EXEC.
4480 if (Info->isWholeWaveFunction())
4482
4483 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4484 }
4485
4486 // Returns a chain and a flag for retval copy to use.
4487 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4488 Chain = Call.getValue(0);
4489 InGlue = Call.getValue(1);
4490
4491 uint64_t CalleePopBytes = NumBytes;
4492 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4493 if (!Ins.empty())
4494 InGlue = Chain.getValue(1);
4495
4496 // Handle result values, copying them out of physregs into vregs that we
4497 // return.
4498 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4499 InVals, /*IsThisReturn=*/false, SDValue());
4500}
4501
4502// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4503// except for:
4504// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4505// 2. Size scaling, where scaled size = wave-reduction(alloca-size) * wave-size
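// For example, with 64 lanes per wave a 16-byte per-lane alloca advances the
// scratch stack pointer by 16 << 6 = 1024 bytes.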
4507 SelectionDAG &DAG) const {
4508 const MachineFunction &MF = DAG.getMachineFunction();
4509 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4510
4511 SDLoc dl(Op);
4512 EVT VT = Op.getValueType();
4513 SDValue Chain = Op.getOperand(0);
4514 Register SPReg = Info->getStackPtrOffsetReg();
4515
4516 // Chain the dynamic stack allocation so that it doesn't modify the stack
4517 // pointer when other instructions are using the stack.
4518 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4519
4520 SDValue Size = Op.getOperand(1);
4521 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4522 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4523
4524 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4525 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4526 "Stack grows upwards for AMDGPU");
4527
4528 Chain = BaseAddr.getValue(1);
4529 Align StackAlign = TFL->getStackAlign();
4530 if (Alignment > StackAlign) {
4531 uint64_t ScaledAlignment = Alignment.value()
4532 << Subtarget->getWavefrontSizeLog2();
4533 uint64_t StackAlignMask = ScaledAlignment - 1;
4534 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4535 DAG.getConstant(StackAlignMask, dl, VT));
4536 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4537 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4538 }
4539
4540 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4541 SDValue NewSP;
4542 if (isa<ConstantSDNode>(Size)) {
4543 // For a constant-sized alloca, scale the alloca size by the wave size.
4544 SDValue ScaledSize = DAG.getNode(
4545 ISD::SHL, dl, VT, Size,
4546 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4547 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4548 } else {
4549 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4550 // max of the (divergent) alloca size, then scale it by the wave size.
4551 SDValue WaveReduction =
4552 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4553 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4554 Size, DAG.getConstant(0, dl, MVT::i32));
4555 SDValue ScaledSize = DAG.getNode(
4556 ISD::SHL, dl, VT, Size,
4557 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4558 NewSP =
4559 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4560 SDValue ReadFirstLaneID =
4561 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4562 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4563 NewSP);
4564 }
4565
4566 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4567 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4568
4569 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4570}
4571
4573 if (Op.getValueType() != MVT::i32)
4574 return Op; // Defer to cannot select error.
4575
4577 SDLoc SL(Op);
4578
4579 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4580
4581 // Convert from wave uniform to swizzled vector address. This should protect
4582 // from any edge cases where the stacksave result isn't directly used with
4583 // stackrestore.
4584 SDValue VectorAddress =
4585 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4586 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4587}
4588
4590 SelectionDAG &DAG) const {
4591 SDLoc SL(Op);
4592 assert(Op.getValueType() == MVT::i32);
4593
4594 uint32_t BothRoundHwReg =
4596 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4597
4598 SDValue IntrinID =
4599 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4600 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4601 Op.getOperand(0), IntrinID, GetRoundBothImm);
4602
4603 // There are two rounding modes, one for f32 and one for f64/f16. We only
4604 // report in the standard value range if both are the same.
4605 //
4606 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4607 // ties away from zero is not supported, and the other values are rotated by
4608 // 1.
4609 //
4610 // If the two rounding modes are not the same, report a target defined value.
4611
4612 // Mode register rounding mode fields:
4613 //
4614 // [1:0] Single-precision round mode.
4615 // [3:2] Double/Half-precision round mode.
4616 //
4617 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4618 //
4619 // Hardware Spec
4620 // Toward-0 3 0
4621 // Nearest Even 0 1
4622 // +Inf 1 2
4623 // -Inf 2 3
4624 // NearestAway0 N/A 4
4625 //
4626 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4627 // table we can index by the raw hardware mode.
4628 //
4629 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4630
4631 SDValue BitTable =
4632 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4633
4634 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4635 SDValue RoundModeTimesNumBits =
4636 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4637
4638 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4639 // knew only one mode was demanded.
4640 SDValue TableValue =
4641 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4642 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4643
4644 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4645 SDValue TableEntry =
4646 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4647
4648 // There's a gap between the 4-bit encoded table values and the actual enum
4649 // values, so offset by 4 if it's an extended value.
4650 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4651 SDValue IsStandardValue =
4652 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4653 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4654 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4655 TableEntry, EnumOffset);
4656
4657 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4658}
4659
4661 SelectionDAG &DAG) const {
4662 SDLoc SL(Op);
4663
4664 SDValue NewMode = Op.getOperand(1);
4665 assert(NewMode.getValueType() == MVT::i32);
4666
4667 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4668 // hardware MODE.fp_round values.
4669 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4670 uint32_t ClampedVal = std::min(
4671 static_cast<uint32_t>(ConstMode->getZExtValue()),
4673 NewMode = DAG.getConstant(
4674 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4675 } else {
4676 // If we know the input can only be one of the supported standard modes in
4677 // the range 0-3, we can use a simplified mapping to hardware values.
4678 KnownBits KB = DAG.computeKnownBits(NewMode);
4679 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4680 // The supported standard values are 0-3. The extended values start at 8. We
4681 // need to offset by 4 if the value is in the extended range.
4682
4683 if (UseReducedTable) {
4684 // Truncate to the low 32-bits.
4685 SDValue BitTable = DAG.getConstant(
4686 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4687
4688 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4689 SDValue RoundModeTimesNumBits =
4690 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4691
4692 NewMode =
4693 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4694
4695 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4696 // the table extracted bits into inline immediates.
4697 } else {
4698 // table_index = umin(value, value - 4)
4699 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
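 // For example, the standard value 2 maps to umin(2, 2 - 4) = 2 because the
 // subtraction wraps around as unsigned, while the first extended value 8
 // maps to umin(8, 4) = 4, squeezing the unused range 4-7 out of the table
 // index.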
4700 SDValue BitTable =
4701 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4702
4703 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4704 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4705 SDValue IndexVal =
4706 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4707
4708 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4709 SDValue RoundModeTimesNumBits =
4710 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4711
4712 SDValue TableValue =
4713 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4714 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4715
4716 // No need to mask out the high bits since the setreg will ignore them
4717 // anyway.
4718 NewMode = TruncTable;
4719 }
4720
4721 // Insert a readfirstlane in case the value is a VGPR. We could do this
4722 // earlier and keep more operations scalar, but that interferes with
4723 // combining the source.
4724 SDValue ReadFirstLaneID =
4725 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4726 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4727 ReadFirstLaneID, NewMode);
4728 }
4729
4730 // N.B. The setreg will be later folded into s_round_mode on supported
4731 // targets.
4732 SDValue IntrinID =
4733 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4734 uint32_t BothRoundHwReg =
4736 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4737
4738 SDValue SetReg =
4739 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4740 IntrinID, RoundBothImm, NewMode);
4741
4742 return SetReg;
4743}
4744
4746 if (Op->isDivergent() &&
4747 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4748 // Cannot do I$ prefetch with divergent pointer.
4749 return SDValue();
4750
4751 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4755 break;
4757 if (Subtarget->hasSafeSmemPrefetch())
4758 break;
4759 [[fallthrough]];
4760 default:
4761 return SDValue();
4762 }
4763
4764 // I$ prefetch
4765 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4766 return SDValue();
4767
4768 return Op;
4769}
4770
4771// Work around DAG legality rules only based on the result type.
4773 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4774 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4775 EVT SrcVT = Src.getValueType();
4776
4777 if (SrcVT.getScalarType() != MVT::bf16)
4778 return Op;
4779
4780 SDLoc SL(Op);
4781 SDValue BitCast =
4782 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4783
4784 EVT DstVT = Op.getValueType();
4785 if (IsStrict)
4786 llvm_unreachable("Need STRICT_BF16_TO_FP");
4787
4788 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4789}
4790
4792 SDLoc SL(Op);
4793 if (Op.getValueType() != MVT::i64)
4794 return Op;
4795
4796 uint32_t ModeHwReg =
4798 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4799 uint32_t TrapHwReg =
4801 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4802
4803 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4804 SDValue IntrinID =
4805 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4806 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4807 Op.getOperand(0), IntrinID, ModeHwRegImm);
4808 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4809 Op.getOperand(0), IntrinID, TrapHwRegImm);
4810 SDValue TokenReg =
4811 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4812 GetTrapReg.getValue(1));
4813
4814 SDValue CvtPtr =
4815 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4816 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4817
4818 return DAG.getMergeValues({Result, TokenReg}, SL);
4819}
4820
4822 SDLoc SL(Op);
4823 if (Op.getOperand(1).getValueType() != MVT::i64)
4824 return Op;
4825
4826 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4827 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4828 DAG.getConstant(0, SL, MVT::i32));
4829 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4830 DAG.getConstant(1, SL, MVT::i32));
4831
4832 SDValue ReadFirstLaneID =
4833 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4834 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4835 ReadFirstLaneID, NewModeReg);
4836 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4837 ReadFirstLaneID, NewTrapReg);
4838
4839 unsigned ModeHwReg =
4841 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4842 unsigned TrapHwReg =
4844 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4845
4846 SDValue IntrinID =
4847 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4848 SDValue SetModeReg =
4849 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4850 IntrinID, ModeHwRegImm, NewModeReg);
4851 SDValue SetTrapReg =
4852 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4853 IntrinID, TrapHwRegImm, NewTrapReg);
4854 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4855}
4856
4858 const MachineFunction &MF) const {
4859 const Function &Fn = MF.getFunction();
4860
4861 Register Reg = StringSwitch<Register>(RegName)
4862 .Case("m0", AMDGPU::M0)
4863 .Case("exec", AMDGPU::EXEC)
4864 .Case("exec_lo", AMDGPU::EXEC_LO)
4865 .Case("exec_hi", AMDGPU::EXEC_HI)
4866 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4867 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4868 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4869 .Default(Register());
4870 if (!Reg)
4871 return Reg;
4872
4873 if (!Subtarget->hasFlatScrRegister() &&
4874 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4875 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4876 "\" for subtarget."));
4877 }
4878
4879 switch (Reg) {
4880 case AMDGPU::M0:
4881 case AMDGPU::EXEC_LO:
4882 case AMDGPU::EXEC_HI:
4883 case AMDGPU::FLAT_SCR_LO:
4884 case AMDGPU::FLAT_SCR_HI:
4885 if (VT.getSizeInBits() == 32)
4886 return Reg;
4887 break;
4888 case AMDGPU::EXEC:
4889 case AMDGPU::FLAT_SCR:
4890 if (VT.getSizeInBits() == 64)
4891 return Reg;
4892 break;
4893 default:
4894 llvm_unreachable("missing register type checking");
4895 }
4896
4898 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4899}
4900
4901// If kill is not the last instruction, split the block so kill is always a
4902// proper terminator.
4905 MachineBasicBlock *BB) const {
4906 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4908 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4909 return SplitBB;
4910}
4911
4912// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4913// \p MI will be the only instruction in the loop body block. Otherwise, it will
4914// be the first instruction in the remainder block.
4915//
4916/// \returns { LoopBody, Remainder }
4917static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4918splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4919 MachineFunction *MF = MBB.getParent();
4921
4922 // To insert the loop we need to split the block. Move everything after this
4923 // point to a new block, and insert a new empty block between the two.
4925 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4927 ++MBBI;
4928
4929 MF->insert(MBBI, LoopBB);
4930 MF->insert(MBBI, RemainderBB);
4931
4932 LoopBB->addSuccessor(LoopBB);
4933 LoopBB->addSuccessor(RemainderBB);
4934
4935 // Move the rest of the block into a new block.
4936 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4937
4938 if (InstInLoop) {
4939 auto Next = std::next(I);
4940
4941 // Move instruction to loop body.
4942 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4943
4944 // Move the rest of the block.
4945 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4946 } else {
4947 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4948 }
4949
4950 MBB.addSuccessor(LoopBB);
4951
4952 return std::pair(LoopBB, RemainderBB);
4953}
4954
4955/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4957 MachineBasicBlock *MBB = MI.getParent();
4959 auto I = MI.getIterator();
4960 auto E = std::next(I);
4961
4962 // clang-format off
4963 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4964 .addImm(0);
4965 // clang-format on
4966
4967 MIBundleBuilder Bundler(*MBB, I, E);
4968 finalizeBundle(*MBB, Bundler.begin());
4969}
4970
4973 MachineBasicBlock *BB) const {
4974 const DebugLoc &DL = MI.getDebugLoc();
4975
4977
4979
4980 // Apparently kill flags are only valid if the def is in the same block?
4981 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4982 Src->setIsKill(false);
4983
4984 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4985
4986 MachineBasicBlock::iterator I = LoopBB->end();
4987
4988 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4990
4991 // Clear TRAP_STS.MEM_VIOL
4992 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4993 .addImm(0)
4994 .addImm(EncodedReg);
4995
4997
4998 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4999
5000 // Load and check TRAP_STS.MEM_VIOL
5001 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5002 .addImm(EncodedReg);
5003
5004 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5005 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5006 .addReg(Reg, RegState::Kill)
5007 .addImm(0);
5008 // clang-format off
5009 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5010 .addMBB(LoopBB);
5011 // clang-format on
5012
5013 return RemainderBB;
5014}
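
// A minimal stand-alone model of the loop emitted above (illustrative only,
// not part of the lowering): clear TRAP_STS.MEM_VIOL, re-read it, and retry
// while it is still set. readMemViol() and clearMemViol() are hypothetical
// stand-ins for the S_GETREG_B32 / S_SETREG_IMM32_B32 pair, and the GWS
// operation that the lowering keeps in the same loop body is omitted here.
[[maybe_unused]] static void
sketchGWSMemViolTestLoop(uint32_t (*readMemViol)(), void (*clearMemViol)()) {
  uint32_t MemViol;
  do {
    clearMemViol();          // S_SETREG_IMM32_B32 0 -> TRAP_STS.MEM_VIOL
    MemViol = readMemViol(); // S_GETREG_B32 TRAP_STS.MEM_VIOL
  } while (MemViol != 0);    // S_CMP_LG_U32 + S_CBRANCH_SCC1 back to the loop
}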
5015
5016// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5017// wavefront. If the value is uniform and just happens to be in a VGPR, this
5018// will only do one iteration. In the worst case, this will loop 64 times.
5019//
5020// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5023 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5024 const DebugLoc &DL, const MachineOperand &Idx,
5025 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5026 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5027 Register &SGPRIdxReg) {
5028
5029 MachineFunction *MF = OrigBB.getParent();
5030 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5031 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5034
5035 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5036 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5037 Register NewExec = MRI.createVirtualRegister(BoolRC);
5038 Register CurrentIdxReg =
5039 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5040 Register CondReg = MRI.createVirtualRegister(BoolRC);
5041
5042 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5043 .addReg(InitReg)
5044 .addMBB(&OrigBB)
5045 .addReg(ResultReg)
5046 .addMBB(&LoopBB);
5047
5048 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5049 .addReg(InitSaveExecReg)
5050 .addMBB(&OrigBB)
5051 .addReg(NewExec)
5052 .addMBB(&LoopBB);
5053
5054 // Read the next variant <- also loop target.
5055 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5056 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5057
5058 // Compare the just read M0 value to all possible Idx values.
5059 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5060 .addReg(CurrentIdxReg)
5061 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5062
5063 // Update EXEC, save the original EXEC value to VCC.
5064 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5065 .addReg(CondReg, RegState::Kill);
5066
5067 MRI.setSimpleHint(NewExec, CondReg);
5068
5069 if (UseGPRIdxMode) {
5070 if (Offset == 0) {
5071 SGPRIdxReg = CurrentIdxReg;
5072 } else {
5073 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5074 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5075 .addReg(CurrentIdxReg, RegState::Kill)
5076 .addImm(Offset);
5077 }
5078 } else {
5079 // Move the index from the SGPR into M0.
5080 if (Offset == 0) {
5081 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5082 .addReg(CurrentIdxReg, RegState::Kill);
5083 } else {
5084 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5085 .addReg(CurrentIdxReg, RegState::Kill)
5086 .addImm(Offset);
5087 }
5088 }
5089
5090 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5091 MachineInstr *InsertPt =
5092 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5093 .addReg(LMC.ExecReg)
5094 .addReg(NewExec);
5095
5096 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5097 // s_cbranch_scc0?
5098
5099 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5100 // clang-format off
5101 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5102 .addMBB(&LoopBB);
5103 // clang-format on
5104
5105 return InsertPt->getIterator();
5106}
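
// A stand-alone scalar model of the waterfall loop above (illustrative only,
// not part of the lowering), assuming a wave64 mask: each iteration takes the
// index from the first remaining active lane (V_READFIRSTLANE_B32) and then
// retires every lane holding that same index (V_CMP_EQ_U32 + and-save-exec +
// xor), so the loop body runs once per unique index value.
[[maybe_unused]] static unsigned
sketchCountWaterfallIterations(const uint32_t LaneIdx[64], uint64_t ExecMask) {
  unsigned Iterations = 0;
  while (ExecMask != 0) {
    // Find the first active lane and read its index.
    unsigned FirstLane = 0;
    while (((ExecMask >> FirstLane) & 1) == 0)
      ++FirstLane;
    uint32_t CurrentIdx = LaneIdx[FirstLane];
    // Retire every active lane whose index matches the one just read.
    for (unsigned Lane = 0; Lane != 64; ++Lane)
      if (((ExecMask >> Lane) & 1) != 0 && LaneIdx[Lane] == CurrentIdx)
        ExecMask &= ~(uint64_t(1) << Lane);
    ++Iterations;
  }
  return Iterations;
}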
5107
5108// This has slightly sub-optimal regalloc when the source vector is killed by
5109// the read. The register allocator does not understand that the kill is
5110// per-workitem, so the vector is kept alive for the whole loop and we end up
5111// not re-using a subregister from it, using one more VGPR than necessary. This
5112// extra register was avoided when this was expanded after register allocation.
5115 unsigned InitResultReg, unsigned PhiReg, int Offset,
5116 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5117 MachineFunction *MF = MBB.getParent();
5118 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5119 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5121 const DebugLoc &DL = MI.getDebugLoc();
5123
5124 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5125 Register DstReg = MI.getOperand(0).getReg();
5126 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5127 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5129
5130 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5131
5132 // Save the EXEC mask
5133 // clang-format off
5134 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5135 .addReg(LMC.ExecReg);
5136 // clang-format on
5137
5138 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5139
5140 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5141
5142 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5143 InitResultReg, DstReg, PhiReg, TmpExec,
5144 Offset, UseGPRIdxMode, SGPRIdxReg);
5145
5146 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5148 ++MBBI;
5149 MF->insert(MBBI, LandingPad);
5150 LoopBB->removeSuccessor(RemainderBB);
5151 LandingPad->addSuccessor(RemainderBB);
5152 LoopBB->addSuccessor(LandingPad);
5153 MachineBasicBlock::iterator First = LandingPad->begin();
5154 // clang-format off
5155 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5156 .addReg(SaveExec);
5157 // clang-format on
5158
5159 return InsPt;
5160}
5161
5162// Returns subreg index, offset
5163static std::pair<unsigned, int>
5165 const TargetRegisterClass *SuperRC, unsigned VecReg,
5166 int Offset) {
5167 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5168
5169 // Skip out of bounds offsets, or else we would end up using an undefined
5170 // register.
5171 if (Offset >= NumElts || Offset < 0)
5172 return std::pair(AMDGPU::sub0, Offset);
5173
5174 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5175}
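
// A stand-alone sketch of the mapping computed above (illustrative only, not
// part of the lowering), with plain channel numbers standing in for the
// AMDGPU::subN indices of a superclass made of 32-bit elements. For example,
// with NumElts = 4: Offset 2 maps to {channel 2, 0}, while Offset 5 maps to
// {channel 0, 5} and is applied dynamically instead.
[[maybe_unused]] static std::pair<unsigned, int>
sketchIndirectRegAndOffset(int NumElts, int Offset) {
  // Out-of-bounds offsets keep channel 0 and leave the whole offset to be
  // applied dynamically, mirroring the early return above.
  if (Offset >= NumElts || Offset < 0)
    return std::pair(0u, Offset);
  // In-bounds offsets are folded entirely into the subregister channel.
  return std::pair(static_cast<unsigned>(Offset), 0);
}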
5176
5179 int Offset) {
5180 MachineBasicBlock *MBB = MI.getParent();
5181 const DebugLoc &DL = MI.getDebugLoc();
5183
5184 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5185
5186 assert(Idx->getReg() != AMDGPU::NoRegister);
5187
5188 if (Offset == 0) {
5189 // clang-format off
5190 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5191 .add(*Idx);
5192 // clang-format on
5193 } else {
5194 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5195 .add(*Idx)
5196 .addImm(Offset);
5197 }
5198}
5199
5202 int Offset) {
5203 MachineBasicBlock *MBB = MI.getParent();
5204 const DebugLoc &DL = MI.getDebugLoc();
5206
5207 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5208
5209 if (Offset == 0)
5210 return Idx->getReg();
5211
5212 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5213 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5214 .add(*Idx)
5215 .addImm(Offset);
5216 return Tmp;
5217}
5218
5221 const GCNSubtarget &ST) {
5222 const SIInstrInfo *TII = ST.getInstrInfo();
5223 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5224 MachineFunction *MF = MBB.getParent();
5226
5227 Register Dst = MI.getOperand(0).getReg();
5228 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5229 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5230 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5231
5232 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5233 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5234
5235 unsigned SubReg;
5236 std::tie(SubReg, Offset) =
5237 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5238
5239 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5240
5241 // Check for a SGPR index.
5242 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5244 const DebugLoc &DL = MI.getDebugLoc();
5245
5246 if (UseGPRIdxMode) {
5247 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5248 // to avoid interfering with other uses, so probably requires a new
5249 // optimization pass.
5251
5252 const MCInstrDesc &GPRIDXDesc =
5253 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5254 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5255 .addReg(SrcReg)
5256 .addReg(Idx)
5257 .addImm(SubReg);
5258 } else {
5260
5261 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5262 .addReg(SrcReg, 0, SubReg)
5263 .addReg(SrcReg, RegState::Implicit);
5264 }
5265
5266 MI.eraseFromParent();
5267
5268 return &MBB;
5269 }
5270
5271 // Control flow needs to be inserted if indexing with a VGPR.
5272 const DebugLoc &DL = MI.getDebugLoc();
5274
5275 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5276 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5277
5278 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5279
5280 Register SGPRIdxReg;
5281 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5282 UseGPRIdxMode, SGPRIdxReg);
5283
5284 MachineBasicBlock *LoopBB = InsPt->getParent();
5285
5286 if (UseGPRIdxMode) {
5287 const MCInstrDesc &GPRIDXDesc =
5288 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5289
5290 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5291 .addReg(SrcReg)
5292 .addReg(SGPRIdxReg)
5293 .addImm(SubReg);
5294 } else {
5295 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5296 .addReg(SrcReg, 0, SubReg)
5297 .addReg(SrcReg, RegState::Implicit);
5298 }
5299
5300 MI.eraseFromParent();
5301
5302 return LoopBB;
5303}
5304
5307 const GCNSubtarget &ST) {
5308 const SIInstrInfo *TII = ST.getInstrInfo();
5309 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5310 MachineFunction *MF = MBB.getParent();
5312
5313 Register Dst = MI.getOperand(0).getReg();
5314 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5315 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5316 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5317 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5318 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5319 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5320
5321 // This can be an immediate, but will be folded later.
5322 assert(Val->getReg());
5323
5324 unsigned SubReg;
5325 std::tie(SubReg, Offset) =
5326 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5327 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5328
5329 if (Idx->getReg() == AMDGPU::NoRegister) {
5331 const DebugLoc &DL = MI.getDebugLoc();
5332
5333 assert(Offset == 0);
5334
5335 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5336 .add(*SrcVec)
5337 .add(*Val)
5338 .addImm(SubReg);
5339
5340 MI.eraseFromParent();
5341 return &MBB;
5342 }
5343
5344 // Check for a SGPR index.
5345 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5347 const DebugLoc &DL = MI.getDebugLoc();
5348
5349 if (UseGPRIdxMode) {
5351
5352 const MCInstrDesc &GPRIDXDesc =
5353 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5354 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5355 .addReg(SrcVec->getReg())
5356 .add(*Val)
5357 .addReg(Idx)
5358 .addImm(SubReg);
5359 } else {
5361
5362 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5363 TRI.getRegSizeInBits(*VecRC), 32, false);
5364 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5365 .addReg(SrcVec->getReg())
5366 .add(*Val)
5367 .addImm(SubReg);
5368 }
5369 MI.eraseFromParent();
5370 return &MBB;
5371 }
5372
5373 // Control flow needs to be inserted if indexing with a VGPR.
5374 if (Val->isReg())
5375 MRI.clearKillFlags(Val->getReg());
5376
5377 const DebugLoc &DL = MI.getDebugLoc();
5378
5379 Register PhiReg = MRI.createVirtualRegister(VecRC);
5380
5381 Register SGPRIdxReg;
5382 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5383 UseGPRIdxMode, SGPRIdxReg);
5384 MachineBasicBlock *LoopBB = InsPt->getParent();
5385
5386 if (UseGPRIdxMode) {
5387 const MCInstrDesc &GPRIDXDesc =
5388 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5389
5390 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5391 .addReg(PhiReg)
5392 .add(*Val)
5393 .addReg(SGPRIdxReg)
5394 .addImm(SubReg);
5395 } else {
5396 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5397 TRI.getRegSizeInBits(*VecRC), 32, false);
5398 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5399 .addReg(PhiReg)
5400 .add(*Val)
5401 .addImm(SubReg);
5402 }
5403
5404 MI.eraseFromParent();
5405 return LoopBB;
5406}
5407
5409 MachineBasicBlock *BB) {
5410 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5411 // For GFX12, we emit s_add_u64 and s_sub_u64.
5412 MachineFunction *MF = BB->getParent();
5413 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5414 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5416 const DebugLoc &DL = MI.getDebugLoc();
5417 MachineOperand &Dest = MI.getOperand(0);
5418 MachineOperand &Src0 = MI.getOperand(1);
5419 MachineOperand &Src1 = MI.getOperand(2);
5420 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5421 if (ST.hasScalarAddSub64()) {
5422 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5423 // clang-format off
5424 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5425 .add(Src0)
5426 .add(Src1);
5427 // clang-format on
5428 } else {
5429 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5430 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5431
5432 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5433 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434
5435 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5436 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5437 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5438 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5439
5440 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5441 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5442 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5443 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5444
5445 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5446 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5447 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5448 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5449 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5450 .addReg(DestSub0)
5451 .addImm(AMDGPU::sub0)
5452 .addReg(DestSub1)
5453 .addImm(AMDGPU::sub1);
5454 }
5455 MI.eraseFromParent();
5456 return BB;
5457}
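
// On subtargets without s_add_u64/s_sub_u64 the expansion above splits the
// operation into a lo/hi pair with a carry. A stand-alone sketch of the add
// case (illustrative only, not part of the lowering):
[[maybe_unused]] static uint64_t sketchAddU64ViaU32(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo = ALo + BLo;           // S_ADD_U32: low halves, produces a carry.
  uint32_t Carry = Lo < ALo ? 1 : 0; // Carry out of the low addition.
  uint32_t Hi = AHi + BHi + Carry;   // S_ADDC_U32: high halves plus carry-in.
  return (uint64_t(Hi) << 32) | Lo;  // REG_SEQUENCE of the two halves.
}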
5458
5460 switch (Opc) {
5461 case AMDGPU::S_MIN_U32:
5462 return std::numeric_limits<uint32_t>::max();
5463 case AMDGPU::S_MIN_I32:
5464 return std::numeric_limits<int32_t>::max();
5465 case AMDGPU::S_MAX_U32:
5466 return std::numeric_limits<uint32_t>::min();
5467 case AMDGPU::S_MAX_I32:
5468 return std::numeric_limits<int32_t>::min();
5469 case AMDGPU::S_ADD_I32:
5470 case AMDGPU::S_SUB_I32:
5471 case AMDGPU::S_OR_B32:
5472 case AMDGPU::S_XOR_B32:
5473 return std::numeric_limits<uint32_t>::min();
5474 case AMDGPU::S_AND_B32:
5475 return std::numeric_limits<uint32_t>::max();
5476 default:
5478 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5479 }
5480}
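
// The constants above are the identity elements of the corresponding
// reductions: combining the identity with any value leaves that value
// unchanged, which is what makes it a safe seed for the accumulator in the
// reduction loop below. A stand-alone umin example (illustrative only, not
// part of the lowering):
[[maybe_unused]] static uint32_t sketchReduceUMin(const uint32_t *Vals,
                                                  unsigned N) {
  // Seed with the umin identity; an empty input yields the identity itself,
  // and any real element replaces it on the first combine.
  uint32_t Acc = std::numeric_limits<uint32_t>::max();
  for (unsigned I = 0; I != N; ++I)
    Acc = Vals[I] < Acc ? Vals[I] : Acc;
  return Acc;
}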
5481
5483 switch (Opc) {
5484 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5485 return std::numeric_limits<uint64_t>::max();
5486 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5487 return std::numeric_limits<int64_t>::max();
5488 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5489 return std::numeric_limits<uint64_t>::min();
5490 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5491 return std::numeric_limits<int64_t>::min();
5492 case AMDGPU::S_ADD_U64_PSEUDO:
5493 case AMDGPU::S_SUB_U64_PSEUDO:
5494 case AMDGPU::S_OR_B64:
5495 case AMDGPU::S_XOR_B64:
5496 return std::numeric_limits<uint64_t>::min();
5497 case AMDGPU::S_AND_B64:
5498 return std::numeric_limits<uint64_t>::max();
5499 default:
5501 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5502 }
5503}
5504
5505static bool is32bitWaveReduceOperation(unsigned Opc) {
5506 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5507 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5508 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5509 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5510 Opc == AMDGPU::S_XOR_B32;
5511}
5512
5515 const GCNSubtarget &ST,
5516 unsigned Opc) {
5518 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5519 const DebugLoc &DL = MI.getDebugLoc();
5520 const SIInstrInfo *TII = ST.getInstrInfo();
5521
5522 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5523 Register SrcReg = MI.getOperand(1).getReg();
5524 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5525 Register DstReg = MI.getOperand(0).getReg();
5526 MachineBasicBlock *RetBB = nullptr;
5527 if (isSGPR) {
5528 switch (Opc) {
5529 case AMDGPU::S_MIN_U32:
5530 case AMDGPU::S_MIN_I32:
5531 case AMDGPU::S_MAX_U32:
5532 case AMDGPU::S_MAX_I32:
5533 case AMDGPU::S_AND_B32:
5534 case AMDGPU::S_OR_B32: {
5535 // Idempotent operations.
5536 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5537 RetBB = &BB;
5538 break;
5539 }
5540 case AMDGPU::V_CMP_LT_U64_e64: // umin
5541 case AMDGPU::V_CMP_LT_I64_e64: // min
5542 case AMDGPU::V_CMP_GT_U64_e64: // umax
5543 case AMDGPU::V_CMP_GT_I64_e64: // max
5544 case AMDGPU::S_AND_B64:
5545 case AMDGPU::S_OR_B64: {
5546 // Idempotent operations.
5547 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5548 RetBB = &BB;
5549 break;
5550 }
5551 case AMDGPU::S_XOR_B32:
5552 case AMDGPU::S_XOR_B64:
5553 case AMDGPU::S_ADD_I32:
5554 case AMDGPU::S_ADD_U64_PSEUDO:
5555 case AMDGPU::S_SUB_I32:
5556 case AMDGPU::S_SUB_U64_PSEUDO: {
5557 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5558 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5559 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5560 Register NumActiveLanes =
5561 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562
5563 bool IsWave32 = ST.isWave32();
5564 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5565 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5566 unsigned BitCountOpc =
5567 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5568
5569 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5570
5571 auto NewAccumulator =
5572 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5573 .addReg(ExecMask);
5574
5575 switch (Opc) {
5576 case AMDGPU::S_XOR_B32:
5577 case AMDGPU::S_XOR_B64: {
5578 // A wave-wide XOR of a uniform value depends only on
5579 // the parity of the number of active lanes:
5580 // for even parity the result is 0, and for odd
5581 // parity it is the same as the input value.
5582 Register ParityRegister =
5583 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5584
5585 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5586 .addReg(NewAccumulator->getOperand(0).getReg())
5587 .addImm(1)
5588 .setOperandDead(3); // Dead scc
5589 if (Opc == AMDGPU::S_XOR_B32) {
5590 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5591 .addReg(SrcReg)
5592 .addReg(ParityRegister);
5593 } else {
5594 Register DestSub0 =
5595 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5596 Register DestSub1 =
5597 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5598
5599 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5600 const TargetRegisterClass *SrcSubRC =
5601 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5602
5603 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5604 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5605 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5606 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5607
5608 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5609 .add(Op1L)
5610 .addReg(ParityRegister);
5611
5612 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5613 .add(Op1H)
5614 .addReg(ParityRegister);
5615
5616 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5617 .addReg(DestSub0)
5618 .addImm(AMDGPU::sub0)
5619 .addReg(DestSub1)
5620 .addImm(AMDGPU::sub1);
5621 }
5622 break;
5623 }
5624 case AMDGPU::S_SUB_I32: {
5625 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5626
5627 // Take the negation of the source operand.
5628 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5629 .addImm(0)
5630 .addReg(SrcReg);
5631 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5632 .addReg(NegatedVal)
5633 .addReg(NewAccumulator->getOperand(0).getReg());
5634 break;
5635 }
5636 case AMDGPU::S_ADD_I32: {
5637 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5638 .addReg(SrcReg)
5639 .addReg(NewAccumulator->getOperand(0).getReg());
5640 break;
5641 }
5642 case AMDGPU::S_ADD_U64_PSEUDO:
5643 case AMDGPU::S_SUB_U64_PSEUDO: {
5644 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5645 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 Register Op1H_Op0L_Reg =
5647 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5648 Register Op1L_Op0H_Reg =
5649 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 Register NegatedValLo =
5653 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5654 Register NegatedValHi =
5655 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5656
5657 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5658 const TargetRegisterClass *Src1SubRC =
5659 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5660
5661 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5662 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5663 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5664 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5665
5666 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5667 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5668 .addImm(0)
5669 .addReg(NewAccumulator->getOperand(0).getReg())
5670 .setOperandDead(3); // Dead scc
5671 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5672 .addReg(NegatedValLo)
5673 .addImm(31)
5674 .setOperandDead(3); // Dead scc
5675 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5676 .add(Op1L)
5677 .addReg(NegatedValHi);
5678 }
5679 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5680 ? NegatedValLo
5681 : NewAccumulator->getOperand(0).getReg();
5682 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5683 .add(Op1L)
5684 .addReg(LowOpcode);
5685 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5686 .add(Op1L)
5687 .addReg(LowOpcode);
5688 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5689 .add(Op1H)
5690 .addReg(LowOpcode);
5691
5692 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5693 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5694 .addReg(CarryReg)
5695 .addReg(Op1H_Op0L_Reg)
5696 .setOperandDead(3); // Dead scc
5697
5698 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5699 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5700 .addReg(HiVal)
5701 .addReg(Op1L_Op0H_Reg)
5702 .setOperandDead(3); // Dead scc
5703 }
5704 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5705 .addReg(DestSub0)
5706 .addImm(AMDGPU::sub0)
5707 .addReg(DestSub1)
5708 .addImm(AMDGPU::sub1);
5709 break;
5710 }
5711 }
5712 RetBB = &BB;
5713 }
5714 }
5715 } else {
5716 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5717 // operand. For now, for all the cases (default, Iterative and DPP) we use the
5718 // iterative approach.
5719
5720 // To reduce the VGPR using the iterative approach, we need to iterate
5721 // over all the active lanes. The lowering consists of a ComputeLoop,
5722 // which iterates over only the active lanes. We use a copy of the EXEC
5723 // register as the induction variable, and each iteration clears its lane
5724 // with bitset0 so that the next iteration finds the next active lane.
5726 Register SrcReg = MI.getOperand(1).getReg();
5727 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5728
5729 // Create control flow for the loop:
5730 // split MI's machine basic block to form the loop.
5731 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5732
5733 // Create virtual registers required for lowering.
5734 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5735 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5736 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5737 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5738 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5739 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5740 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5741 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5742 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5743
5744 bool IsWave32 = ST.isWave32();
5745 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5746 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5747
5748 // Create the initial values of the induction variable (from EXEC) and the
5749 // accumulator (the identity value), and branch to the new ComputeLoop block.
5750 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5751 if (is32BitOpc) {
5753 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5754 .addImm(IdentityValue);
5755 } else {
5757 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5758 .addImm(IdentityValue);
5759 }
5760 // clang-format off
5761 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5762 .addMBB(ComputeLoop);
5763 // clang-format on
5764
5765 // Start constructing ComputeLoop
5766 I = ComputeLoop->begin();
5767 auto Accumulator =
5768 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5769 .addReg(IdentityValReg)
5770 .addMBB(&BB);
5771 auto ActiveBits =
5772 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5773 .addReg(LoopIterator)
5774 .addMBB(&BB);
5775
5776 I = ComputeLoop->end();
5777 MachineInstr *NewAccumulator;
5778 // Perform the computations
5779 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5780 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5781 .addReg(ActiveBitsReg);
5782 if (is32BitOpc) {
5783 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5784 LaneValueReg)
5785 .addReg(SrcReg)
5786 .addReg(FF1Reg);
5787 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5788 .addReg(Accumulator->getOperand(0).getReg())
5789 .addReg(LaneValueReg);
5790 } else {
5791 Register LaneValueLoReg =
5792 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5793 Register LaneValueHiReg =
5794 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5795 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5796 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5797 const TargetRegisterClass *SrcSubRC =
5798 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5799 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5800 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5801 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5802 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5803 // The lane value input should be in an SGPR.
5804 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5805 LaneValueLoReg)
5806 .add(Op1L)
5807 .addReg(FF1Reg);
5808 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5809 LaneValueHiReg)
5810 .add(Op1H)
5811 .addReg(FF1Reg);
5812 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5813 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5814 .addReg(LaneValueLoReg)
5815 .addImm(AMDGPU::sub0)
5816 .addReg(LaneValueHiReg)
5817 .addImm(AMDGPU::sub1);
5818 switch (Opc) {
5819 case AMDGPU::S_OR_B64:
5820 case AMDGPU::S_AND_B64:
5821 case AMDGPU::S_XOR_B64: {
5822 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5823 .addReg(Accumulator->getOperand(0).getReg())
5824 .addReg(LaneValue->getOperand(0).getReg())
5825 .setOperandDead(3); // Dead scc
5826 break;
5827 }
5828 case AMDGPU::V_CMP_GT_I64_e64:
5829 case AMDGPU::V_CMP_GT_U64_e64:
5830 case AMDGPU::V_CMP_LT_I64_e64:
5831 case AMDGPU::V_CMP_LT_U64_e64: {
5832 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5833 Register ComparisonResultReg =
5834 MRI.createVirtualRegister(WaveMaskRegClass);
5835 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5836 const TargetRegisterClass *VSubRegClass =
5837 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5838 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5839 MachineOperand SrcReg0Sub0 =
5840 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5841 VregClass, AMDGPU::sub0, VSubRegClass);
5842 MachineOperand SrcReg0Sub1 =
5843 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5844 VregClass, AMDGPU::sub1, VSubRegClass);
5845 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5846 AccumulatorVReg)
5847 .add(SrcReg0Sub0)
5848 .addImm(AMDGPU::sub0)
5849 .add(SrcReg0Sub1)
5850 .addImm(AMDGPU::sub1);
5851 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5852 .addReg(LaneValue->getOperand(0).getReg())
5853 .addReg(AccumulatorVReg);
5854
5855 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5856 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5857 .addReg(LaneMaskReg)
5858 .addReg(ActiveBitsReg);
5859
5860 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5861 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5862 .addReg(LaneValue->getOperand(0).getReg())
5863 .addReg(Accumulator->getOperand(0).getReg());
5864 break;
5865 }
5866 case AMDGPU::S_ADD_U64_PSEUDO:
5867 case AMDGPU::S_SUB_U64_PSEUDO: {
5868 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5869 .addReg(Accumulator->getOperand(0).getReg())
5870 .addReg(LaneValue->getOperand(0).getReg());
5871 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5872 break;
5873 }
5874 }
5875 }
5876 // Manipulate the iterator to get the next active lane
5877 unsigned BITSETOpc =
5878 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5879 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5880 .addReg(FF1Reg)
5881 .addReg(ActiveBitsReg);
5882
5883 // Add phi nodes
5884 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5885 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5886
5887 // Create the loop branch.
5888 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5889 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5890 .addReg(NewActiveBitsReg)
5891 .addImm(0);
5892 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5893 .addMBB(ComputeLoop);
5894
5895 RetBB = ComputeEnd;
5896 }
5897 MI.eraseFromParent();
5898 return RetBB;
5899}
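
// Two stand-alone scalar models of the lowering above (illustrative only, not
// part of the code), both assuming a wave64 exec mask and a 32-bit reduction:
// the uniform (SGPR) case folds a single value with the active-lane count,
// and the divergent (VGPR) case mirrors the ComputeLoop, which walks the
// active lanes with find-first-set and retires them with bitset0.
[[maybe_unused]] static uint32_t
sketchUniformWaveReduce(uint32_t Val, uint64_t ExecMask, bool IsXor) {
  // Count the active lanes (S_BCNT1_I32_B64).
  unsigned Count = 0;
  for (unsigned Lane = 0; Lane != 64; ++Lane)
    Count += (ExecMask >> Lane) & 1;
  // add: Val * Count; xor: Val if the lane count is odd, otherwise 0.
  return IsXor ? Val * (Count & 1) : Val * Count;
}

[[maybe_unused]] static uint32_t
sketchIterativeWaveReduceAdd(const uint32_t LaneValues[64], uint64_t ExecMask) {
  uint32_t Accumulator = 0; // Identity value for the add reduction.
  while (ExecMask != 0) {
    // S_FF1_I32_B64: find the next active lane.
    unsigned Lane = 0;
    while (((ExecMask >> Lane) & 1) == 0)
      ++Lane;
    Accumulator += LaneValues[Lane];    // V_READLANE_B32 + S_ADD_I32.
    ExecMask &= ~(uint64_t(1) << Lane); // S_BITSET0_B64: retire the lane.
  }
  return Accumulator;
}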
5900
5903 MachineBasicBlock *BB) const {
5904 MachineFunction *MF = BB->getParent();
5906 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5908 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5910 const DebugLoc &DL = MI.getDebugLoc();
5911
5912 switch (MI.getOpcode()) {
5913 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5914 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5915 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5916 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5917 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5918 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5919 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5920 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5921 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5923 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5925 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5927 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5929 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5931 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5933 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5935 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5937 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5939 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5941 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5943 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5945 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5946 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5947 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5948 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5949 case AMDGPU::S_UADDO_PSEUDO:
5950 case AMDGPU::S_USUBO_PSEUDO: {
5951 MachineOperand &Dest0 = MI.getOperand(0);
5952 MachineOperand &Dest1 = MI.getOperand(1);
5953 MachineOperand &Src0 = MI.getOperand(2);
5954 MachineOperand &Src1 = MI.getOperand(3);
5955
5956 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5957 ? AMDGPU::S_ADD_U32
5958 : AMDGPU::S_SUB_U32;
5959 // clang-format off
5960 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5961 .add(Src0)
5962 .add(Src1);
5963 // clang-format on
5964
5965 unsigned SelOpc =
5966 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5967 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
5968
5969 MI.eraseFromParent();
5970 return BB;
5971 }
5972 case AMDGPU::S_ADD_U64_PSEUDO:
5973 case AMDGPU::S_SUB_U64_PSEUDO: {
5974 return Expand64BitScalarArithmetic(MI, BB);
5975 }
5976 case AMDGPU::V_ADD_U64_PSEUDO:
5977 case AMDGPU::V_SUB_U64_PSEUDO: {
5978 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5979
5980 MachineOperand &Dest = MI.getOperand(0);
5981 MachineOperand &Src0 = MI.getOperand(1);
5982 MachineOperand &Src1 = MI.getOperand(2);
5983
5984 if (ST.hasAddSubU64Insts()) {
5985 auto I = BuildMI(*BB, MI, DL,
5986 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5987 : AMDGPU::V_SUB_U64_e64),
5988 Dest.getReg())
5989 .add(Src0)
5990 .add(Src1)
5991 .addImm(0); // clamp
5992 TII->legalizeOperands(*I);
5993 MI.eraseFromParent();
5994 return BB;
5995 }
5996
5997 if (IsAdd && ST.hasLshlAddU64Inst()) {
5998 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5999 Dest.getReg())
6000 .add(Src0)
6001 .addImm(0)
6002 .add(Src1);
6003 TII->legalizeOperands(*Add);
6004 MI.eraseFromParent();
6005 return BB;
6006 }
6007
6008 const auto *CarryRC = TRI->getWaveMaskRegClass();
6009
6010 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6011 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6012
6013 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6014 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6015
6016 const TargetRegisterClass *Src0RC = Src0.isReg()
6017 ? MRI.getRegClass(Src0.getReg())
6018 : &AMDGPU::VReg_64RegClass;
6019 const TargetRegisterClass *Src1RC = Src1.isReg()
6020 ? MRI.getRegClass(Src1.getReg())
6021 : &AMDGPU::VReg_64RegClass;
6022
6023 const TargetRegisterClass *Src0SubRC =
6024 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6025 const TargetRegisterClass *Src1SubRC =
6026 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6027
6028 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6029 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6030 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6031 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6032
6033 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6034 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6035 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6036 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6037
6038 unsigned LoOpc =
6039 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6040 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6041 .addReg(CarryReg, RegState::Define)
6042 .add(SrcReg0Sub0)
6043 .add(SrcReg1Sub0)
6044 .addImm(0); // clamp bit
6045
6046 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6047 MachineInstr *HiHalf =
6048 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6049 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6050 .add(SrcReg0Sub1)
6051 .add(SrcReg1Sub1)
6052 .addReg(CarryReg, RegState::Kill)
6053 .addImm(0); // clamp bit
6054
6055 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6056 .addReg(DestSub0)
6057 .addImm(AMDGPU::sub0)
6058 .addReg(DestSub1)
6059 .addImm(AMDGPU::sub1);
6060 TII->legalizeOperands(*LoHalf);
6061 TII->legalizeOperands(*HiHalf);
6062 MI.eraseFromParent();
6063 return BB;
6064 }
6065 case AMDGPU::S_ADD_CO_PSEUDO:
6066 case AMDGPU::S_SUB_CO_PSEUDO: {
6067 // This pseudo can only be selected
6068 // from a uniform add/subcarry node, so all of the VGPR operands
6069 // are therefore assumed to be splat vectors.
6071 MachineOperand &Dest = MI.getOperand(0);
6072 MachineOperand &CarryDest = MI.getOperand(1);
6073 MachineOperand &Src0 = MI.getOperand(2);
6074 MachineOperand &Src1 = MI.getOperand(3);
6075 MachineOperand &Src2 = MI.getOperand(4);
6076 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6077 ? AMDGPU::S_ADDC_U32
6078 : AMDGPU::S_SUBB_U32;
6079 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6080 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6081 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6082 .addReg(Src0.getReg());
6083 Src0.setReg(RegOp0);
6084 }
6085 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6086 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6087 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6088 .addReg(Src1.getReg());
6089 Src1.setReg(RegOp1);
6090 }
6091 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6092 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6093 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6094 .addReg(Src2.getReg());
6095 Src2.setReg(RegOp2);
6096 }
6097
6098 if (ST.isWave64()) {
6099 if (ST.hasScalarCompareEq64()) {
6100 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6101 .addReg(Src2.getReg())
6102 .addImm(0);
6103 } else {
6104 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6105 const TargetRegisterClass *SubRC =
6106 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6107 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6108 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6109 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6110 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6111 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6112
6113 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6114 .add(Src2Sub0)
6115 .add(Src2Sub1);
6116
6117 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6118 .addReg(Src2_32, RegState::Kill)
6119 .addImm(0);
6120 }
6121 } else {
6122 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6123 .addReg(Src2.getReg())
6124 .addImm(0);
6125 }
6126
6127 // clang-format off
6128 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6129 .add(Src0)
6130 .add(Src1);
6131 // clang-format on
6132
6133 unsigned SelOpc =
6134 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6135
6136 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6137 .addImm(-1)
6138 .addImm(0);
6139
6140 MI.eraseFromParent();
6141 return BB;
6142 }
6143 case AMDGPU::SI_INIT_M0: {
6144 MachineOperand &M0Init = MI.getOperand(0);
6145 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6146 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6147 AMDGPU::M0)
6148 .add(M0Init);
6149 MI.eraseFromParent();
6150 return BB;
6151 }
6152 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6153 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6154 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6155 TII->get(AMDGPU::S_CMP_EQ_U32))
6156 .addImm(0)
6157 .addImm(0);
6158 return BB;
6159 }
6160 case AMDGPU::GET_GROUPSTATICSIZE: {
6161 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6162 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6163 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6164 .add(MI.getOperand(0))
6165 .addImm(MFI->getLDSSize());
6166 MI.eraseFromParent();
6167 return BB;
6168 }
6169 case AMDGPU::GET_SHADERCYCLESHILO: {
6171 // The algorithm is:
6172 //
6173 // hi1 = getreg(SHADER_CYCLES_HI)
6174 // lo1 = getreg(SHADER_CYCLES_LO)
6175 // hi2 = getreg(SHADER_CYCLES_HI)
6176 //
6177 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6178 // Otherwise there was overflow and the result is hi2:0. In both cases the
6179 // result should represent the actual time at some point during the sequence
6180 // of three getregs.
6181 using namespace AMDGPU::Hwreg;
6182 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6183 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6184 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6185 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6186 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6187 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6188 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6189 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6190 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6191 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6192 .addReg(RegHi1)
6193 .addReg(RegHi2);
6194 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6195 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6196 .addReg(RegLo1)
6197 .addImm(0);
6198 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6199 .add(MI.getOperand(0))
6200 .addReg(RegLo)
6201 .addImm(AMDGPU::sub0)
6202 .addReg(RegHi2)
6203 .addImm(AMDGPU::sub1);
6204 MI.eraseFromParent();
6205 return BB;
6206 }
6207 case AMDGPU::SI_INDIRECT_SRC_V1:
6208 case AMDGPU::SI_INDIRECT_SRC_V2:
6209 case AMDGPU::SI_INDIRECT_SRC_V4:
6210 case AMDGPU::SI_INDIRECT_SRC_V8:
6211 case AMDGPU::SI_INDIRECT_SRC_V9:
6212 case AMDGPU::SI_INDIRECT_SRC_V10:
6213 case AMDGPU::SI_INDIRECT_SRC_V11:
6214 case AMDGPU::SI_INDIRECT_SRC_V12:
6215 case AMDGPU::SI_INDIRECT_SRC_V16:
6216 case AMDGPU::SI_INDIRECT_SRC_V32:
6217 return emitIndirectSrc(MI, *BB, *getSubtarget());
6218 case AMDGPU::SI_INDIRECT_DST_V1:
6219 case AMDGPU::SI_INDIRECT_DST_V2:
6220 case AMDGPU::SI_INDIRECT_DST_V4:
6221 case AMDGPU::SI_INDIRECT_DST_V8:
6222 case AMDGPU::SI_INDIRECT_DST_V9:
6223 case AMDGPU::SI_INDIRECT_DST_V10:
6224 case AMDGPU::SI_INDIRECT_DST_V11:
6225 case AMDGPU::SI_INDIRECT_DST_V12:
6226 case AMDGPU::SI_INDIRECT_DST_V16:
6227 case AMDGPU::SI_INDIRECT_DST_V32:
6228 return emitIndirectDst(MI, *BB, *getSubtarget());
6229 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6230 case AMDGPU::SI_KILL_I1_PSEUDO:
6231 return splitKillBlock(MI, BB);
6232 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6233 Register Dst = MI.getOperand(0).getReg();
6234 const MachineOperand &Src0 = MI.getOperand(1);
6235 const MachineOperand &Src1 = MI.getOperand(2);
6236 Register SrcCond = MI.getOperand(3).getReg();
6237
6238 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6239 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6240 const auto *CondRC = TRI->getWaveMaskRegClass();
6241 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6242
6243 const TargetRegisterClass *Src0RC = Src0.isReg()
6244 ? MRI.getRegClass(Src0.getReg())
6245 : &AMDGPU::VReg_64RegClass;
6246 const TargetRegisterClass *Src1RC = Src1.isReg()
6247 ? MRI.getRegClass(Src1.getReg())
6248 : &AMDGPU::VReg_64RegClass;
6249
6250 const TargetRegisterClass *Src0SubRC =
6251 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6252 const TargetRegisterClass *Src1SubRC =
6253 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6254
6255 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6256 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6257 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6258 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6259
6260 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6261 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6262 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6263 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6264
6265 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6266 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6267 .addImm(0)
6268 .add(Src0Sub0)
6269 .addImm(0)
6270 .add(Src1Sub0)
6271 .addReg(SrcCondCopy);
6272 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6273 .addImm(0)
6274 .add(Src0Sub1)
6275 .addImm(0)
6276 .add(Src1Sub1)
6277 .addReg(SrcCondCopy);
6278
6279 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6280 .addReg(DstLo)
6281 .addImm(AMDGPU::sub0)
6282 .addReg(DstHi)
6283 .addImm(AMDGPU::sub1);
6284 MI.eraseFromParent();
6285 return BB;
6286 }
6287 case AMDGPU::SI_BR_UNDEF: {
6288 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6289 .add(MI.getOperand(0));
6290 Br->getOperand(1).setIsUndef(); // read undef SCC
6291 MI.eraseFromParent();
6292 return BB;
6293 }
6294 case AMDGPU::ADJCALLSTACKUP:
6295 case AMDGPU::ADJCALLSTACKDOWN: {
6297 MachineInstrBuilder MIB(*MF, &MI);
6298 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6299 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6300 return BB;
6301 }
6302 case AMDGPU::SI_CALL_ISEL: {
6303 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6304
6306 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6307
6308 for (const MachineOperand &MO : MI.operands())
6309 MIB.add(MO);
6310
6311 MIB.cloneMemRefs(MI);
6312 MI.eraseFromParent();
6313 return BB;
6314 }
6315 case AMDGPU::V_ADD_CO_U32_e32:
6316 case AMDGPU::V_SUB_CO_U32_e32:
6317 case AMDGPU::V_SUBREV_CO_U32_e32: {
6318 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6319 unsigned Opc = MI.getOpcode();
6320
6321 bool NeedClampOperand = false;
6322 if (TII->pseudoToMCOpcode(Opc) == -1) {
6324 NeedClampOperand = true;
6325 }
6326
6327 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6328 if (TII->isVOP3(*I)) {
6329 I.addReg(TRI->getVCC(), RegState::Define);
6330 }
6331 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6332 if (NeedClampOperand)
6333 I.addImm(0); // clamp bit for e64 encoding
6334
6335 TII->legalizeOperands(*I);
6336
6337 MI.eraseFromParent();
6338 return BB;
6339 }
6340 case AMDGPU::V_ADDC_U32_e32:
6341 case AMDGPU::V_SUBB_U32_e32:
6342 case AMDGPU::V_SUBBREV_U32_e32:
6343 // These instructions have an implicit use of vcc which counts towards the
6344 // constant bus limit.
6345 TII->legalizeOperands(MI);
6346 return BB;
6347 case AMDGPU::DS_GWS_INIT:
6348 case AMDGPU::DS_GWS_SEMA_BR:
6349 case AMDGPU::DS_GWS_BARRIER:
6350 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6351 [[fallthrough]];
6352 case AMDGPU::DS_GWS_SEMA_V:
6353 case AMDGPU::DS_GWS_SEMA_P:
6354 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6355 // An s_waitcnt 0 is required to be the instruction immediately following.
6356 if (getSubtarget()->hasGWSAutoReplay()) {
6358 return BB;
6359 }
6360
6361 return emitGWSMemViolTestLoop(MI, BB);
6362 case AMDGPU::S_SETREG_B32: {
6363 // Try to optimize cases that only set the denormal mode or rounding mode.
6364 //
6365 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6366 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6367 // instead.
6368 //
6369 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6370 // allow you to have a no-side-effect instruction in the output of a
6371 // side-effecting pattern.
6372 auto [ID, Offset, Width] =
6373 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6375 return BB;
6376
6377 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6378 const unsigned SetMask = WidthMask << Offset;
6379
6380 if (getSubtarget()->hasDenormModeInst()) {
6381 unsigned SetDenormOp = 0;
6382 unsigned SetRoundOp = 0;
6383
6384 // The dedicated instructions can only set the whole denorm or round mode
6385 // at once, not a subset of bits in either.
6386 if (SetMask ==
6388 // If this fully sets both the round and denorm mode, emit the two
6389 // dedicated instructions for these.
6390 SetRoundOp = AMDGPU::S_ROUND_MODE;
6391 SetDenormOp = AMDGPU::S_DENORM_MODE;
6392 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6393 SetRoundOp = AMDGPU::S_ROUND_MODE;
6394 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6395 SetDenormOp = AMDGPU::S_DENORM_MODE;
6396 }
6397
6398 if (SetRoundOp || SetDenormOp) {
6399 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6400 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6401 unsigned ImmVal = Def->getOperand(1).getImm();
6402 if (SetRoundOp) {
6403 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6404 .addImm(ImmVal & 0xf);
6405
6406 // If we also have the denorm mode, get just the denorm mode bits.
6407 ImmVal >>= 4;
6408 }
6409
6410 if (SetDenormOp) {
6411 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6412 .addImm(ImmVal & 0xf);
6413 }
6414
6415 MI.eraseFromParent();
6416 return BB;
6417 }
6418 }
6419 }
6420
6421 // If only FP bits are touched, use the no-side-effects pseudo.
6422 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6423 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6424 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6425
6426 return BB;
6427 }
6428 case AMDGPU::S_INVERSE_BALLOT_U32:
6429 case AMDGPU::S_INVERSE_BALLOT_U64:
6430 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6431 // necessary. After that they are equivalent to a COPY.
6432 MI.setDesc(TII->get(AMDGPU::COPY));
6433 return BB;
6434 case AMDGPU::ENDPGM_TRAP: {
6435 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6436 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6437 MI.addOperand(MachineOperand::CreateImm(0));
6438 return BB;
6439 }
6440
6441 // We need a block split to make the real endpgm a terminator. We also don't
6442 // want to break phis in successor blocks, so we can't just delete to the
6443 // end of the block.
6444
6445 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6447 MF->push_back(TrapBB);
6448 // clang-format off
6449 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6450 .addImm(0);
6451 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6452 .addMBB(TrapBB);
6453 // clang-format on
6454
6455 BB->addSuccessor(TrapBB);
6456 MI.eraseFromParent();
6457 return SplitBB;
6458 }
6459 case AMDGPU::SIMULATED_TRAP: {
6460 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6461 MachineBasicBlock *SplitBB =
6462 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6463 MI.eraseFromParent();
6464 return SplitBB;
6465 }
6466 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6467 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6469
6470 // During ISel, it's difficult to propagate the original EXEC mask to use as
6471 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6472 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6473 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6474 Register OriginalExec = Setup->getOperand(0).getReg();
6475 MF->getRegInfo().clearKillFlags(OriginalExec);
6476 MI.getOperand(0).setReg(OriginalExec);
6477 return BB;
6478 }
6479 default:
6480 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6481 if (!MI.mayStore())
6483 return BB;
6484 }
6486 }
6487}
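
// A stand-alone model of the GET_SHADERCYCLESHILO expansion above
// (illustrative only, not part of the lowering); readHi() and readLo() are
// hypothetical stand-ins for the three S_GETREG_B32 reads.
[[maybe_unused]] static uint64_t sketchReadCyclesHiLo(uint32_t (*readHi)(),
                                                      uint32_t (*readLo)()) {
  uint32_t Hi1 = readHi();
  uint32_t Lo1 = readLo();
  uint32_t Hi2 = readHi();
  // If the two HI reads differ, LO wrapped somewhere in between; hi2:0 is
  // still a time that occurred during the sequence (the S_CSELECT_B32 above).
  uint32_t Lo = Hi1 == Hi2 ? Lo1 : 0;
  return (uint64_t(Hi2) << 32) | Lo;
}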
6488
6490 // This currently forces unfolding various combinations of fsub into fma with
6491 // free fneg'd operands. As long as we have fast FMA (controlled by
6492 // isFMAFasterThanFMulAndFAdd), we should perform these.
6493
6494 // When fma is quarter rate, as for f64 where add / sub are at best half rate,
6495 // most of these combines appear to be cycle neutral but save on instruction
6496 // count / code size.
6497 return true;
6498}
6499
6501
6503 EVT VT) const {
6504 if (!VT.isVector()) {
6505 return MVT::i1;
6506 }
6507 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6508}
6509
6511 // TODO: Should i16 be used always if legal? For now it would force VALU
6512 // shifts.
6513 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6514}
6515
6517 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6518 ? Ty.changeElementSize(16)
6519 : Ty.changeElementSize(32);
6520}
6521
6522// Answering this is somewhat tricky and depends on the specific device, since
6523// different devices have different rates for fma and for all f64 operations.
6524//
6525// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6526// regardless of which device (although the number of cycles differs between
6527// devices), so it is always profitable for f64.
6528//
6529// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6530// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6531// which we can always do even without fused FP ops since it returns the same
6532// result as the separate operations and since it is always full
6533// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32,
6534// however, does not support denormals, so we do report fma as faster if we have
6535// a fast fma device and require denormals.
6536//
6538 EVT VT) const {
6539 VT = VT.getScalarType();
6540
6541 switch (VT.getSimpleVT().SimpleTy) {
6542 case MVT::f32: {
6543 // If mad is not available this depends only on if f32 fma is full rate.
6544 if (!Subtarget->hasMadMacF32Insts())
6545 return Subtarget->hasFastFMAF32();
6546
6547 // Otherwise f32 mad is always full rate and returns the same result as
6548 // the separate operations, so it should be preferred over fma.
6549 // However, it does not support denormals.
6551 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6552
6553 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6554 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6555 }
6556 case MVT::f64:
6557 return true;
6558 case MVT::f16:
6559 case MVT::bf16:
6560 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6561 default:
6562 break;
6563 }
6564
6565 return false;
6566}
6567
6569 LLT Ty) const {
6570 switch (Ty.getScalarSizeInBits()) {
6571 case 16:
6572 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6573 case 32:
6574 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6575 case 64:
6576 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6577 default:
6578 break;
6579 }
6580
6581 return false;
6582}
6583
6585 if (!Ty.isScalar())
6586 return false;
6587
6588 if (Ty.getScalarSizeInBits() == 16)
6589 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6590 if (Ty.getScalarSizeInBits() == 32)
6591 return Subtarget->hasMadMacF32Insts() &&
6592 denormalModeIsFlushAllF32(*MI.getMF());
6593
6594 return false;
6595}
6596
6598 const SDNode *N) const {
6599 // TODO: Check future ftz flag
6600 // v_mad_f32/v_mac_f32 do not support denormals.
6601 EVT VT = N->getValueType(0);
6602 if (VT == MVT::f32)
6603 return Subtarget->hasMadMacF32Insts() &&
6604 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6605 if (VT == MVT::f16) {
6606 return Subtarget->hasMadF16() &&
6607 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6608 }
6609
6610 return false;
6611}
6612
6613//===----------------------------------------------------------------------===//
6614// Custom DAG Lowering Operations
6615//===----------------------------------------------------------------------===//
6616
6617// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6618// wider vector type is legal.
6619SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6620 SelectionDAG &DAG) const {
6621 unsigned Opc = Op.getOpcode();
6622 EVT VT = Op.getValueType();
6623 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6624 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6625 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6626 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6627 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6628 VT == MVT::v32bf16);
6629
6630 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6631
6632 SDLoc SL(Op);
6633 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6634 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6635
6636 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6637}
6638
6639// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6640// regression whereby extra unnecessary instructions were added to codegen
6641 // for rotr operations, caused by legalising v2i32 or. This resulted in extra
6642// instructions to extract the result from the vector.
6643SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6644 [[maybe_unused]] EVT VT = Op.getValueType();
6645
6646 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6647 VT == MVT::v16i32) &&
6648 "Unexpected ValueType.");
6649
6650 return DAG.UnrollVectorOp(Op.getNode());
6651}
6652
6653// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6654// wider vector type is legal.
6655SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6656 SelectionDAG &DAG) const {
6657 unsigned Opc = Op.getOpcode();
6658 EVT VT = Op.getValueType();
6659 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6660 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6661 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6662 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6663 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6664 VT == MVT::v32bf16);
6665
6666 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6667 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6668
6669 SDLoc SL(Op);
6670
6671 SDValue OpLo =
6672 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6673 SDValue OpHi =
6674 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6675
6676 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6677}
6678
6679SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6680 SelectionDAG &DAG) const {
6681 unsigned Opc = Op.getOpcode();
6682 EVT VT = Op.getValueType();
6683 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6684 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6685 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6686 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6687 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6688 VT == MVT::v32bf16);
6689
6690 SDValue Op0 = Op.getOperand(0);
6691 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6692 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6693 : std::pair(Op0, Op0);
6694
6695 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6696 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6697
6698 SDLoc SL(Op);
6699 auto ResVT = DAG.GetSplitDestVTs(VT);
6700
6701 SDValue OpLo =
6702 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6703 SDValue OpHi =
6704 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6705
6706 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6707}
6708
6709SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6710 switch (Op.getOpcode()) {
6711 default:
6712 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6713 case ISD::BRCOND:
6714 return LowerBRCOND(Op, DAG);
6715 case ISD::RETURNADDR:
6716 return LowerRETURNADDR(Op, DAG);
6717 case ISD::LOAD: {
6718 SDValue Result = LowerLOAD(Op, DAG);
6719 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6720 "Load should return a value and a chain");
6721 return Result;
6722 }
6723 case ISD::FSQRT: {
6724 EVT VT = Op.getValueType();
6725 if (VT == MVT::f32)
6726 return lowerFSQRTF32(Op, DAG);
6727 if (VT == MVT::f64)
6728 return lowerFSQRTF64(Op, DAG);
6729 return SDValue();
6730 }
6731 case ISD::FSIN:
6732 case ISD::FCOS:
6733 return LowerTrig(Op, DAG);
6734 case ISD::SELECT:
6735 return LowerSELECT(Op, DAG);
6736 case ISD::FDIV:
6737 return LowerFDIV(Op, DAG);
6738 case ISD::FFREXP:
6739 return LowerFFREXP(Op, DAG);
6740 case ISD::ATOMIC_CMP_SWAP:
6741 return LowerATOMIC_CMP_SWAP(Op, DAG);
6742 case ISD::STORE:
6743 return LowerSTORE(Op, DAG);
6744 case ISD::GlobalAddress: {
6745 MachineFunction &MF = DAG.getMachineFunction();
6746 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6747 return LowerGlobalAddress(MFI, Op, DAG);
6748 }
6749 case ISD::INTRINSIC_WO_CHAIN:
6750 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6751 case ISD::INTRINSIC_W_CHAIN:
6752 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6753 case ISD::INTRINSIC_VOID:
6754 return LowerINTRINSIC_VOID(Op, DAG);
6755 case ISD::ADDRSPACECAST:
6756 return lowerADDRSPACECAST(Op, DAG);
6757 case ISD::INSERT_SUBVECTOR:
6758 return lowerINSERT_SUBVECTOR(Op, DAG);
6759 case ISD::INSERT_VECTOR_ELT:
6760 return lowerINSERT_VECTOR_ELT(Op, DAG);
6761 case ISD::EXTRACT_VECTOR_ELT:
6762 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6763 case ISD::VECTOR_SHUFFLE:
6764 return lowerVECTOR_SHUFFLE(Op, DAG);
6765 case ISD::SCALAR_TO_VECTOR:
6766 return lowerSCALAR_TO_VECTOR(Op, DAG);
6767 case ISD::BUILD_VECTOR:
6768 return lowerBUILD_VECTOR(Op, DAG);
6769 case ISD::FP_ROUND:
6770 case ISD::STRICT_FP_ROUND:
6771 return lowerFP_ROUND(Op, DAG);
6772 case ISD::TRAP:
6773 return lowerTRAP(Op, DAG);
6774 case ISD::DEBUGTRAP:
6775 return lowerDEBUGTRAP(Op, DAG);
6776 case ISD::ABS:
6777 case ISD::FABS:
6778 case ISD::FNEG:
6779 case ISD::FCANONICALIZE:
6780 case ISD::BSWAP:
6781 return splitUnaryVectorOp(Op, DAG);
6782 case ISD::FMINNUM:
6783 case ISD::FMAXNUM:
6784 return lowerFMINNUM_FMAXNUM(Op, DAG);
6785 case ISD::FMINIMUMNUM:
6786 case ISD::FMAXIMUMNUM:
6787 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6788 case ISD::FMINIMUM:
6789 case ISD::FMAXIMUM:
6790 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6791 case ISD::FLDEXP:
6792 case ISD::STRICT_FLDEXP:
6793 return lowerFLDEXP(Op, DAG);
6794 case ISD::FMA:
6795 return splitTernaryVectorOp(Op, DAG);
6796 case ISD::FP_TO_SINT:
6797 case ISD::FP_TO_UINT:
6798 return LowerFP_TO_INT(Op, DAG);
6799 case ISD::SHL:
6800 case ISD::SRA:
6801 case ISD::SRL:
6802 case ISD::ADD:
6803 case ISD::SUB:
6804 case ISD::SMIN:
6805 case ISD::SMAX:
6806 case ISD::UMIN:
6807 case ISD::UMAX:
6808 case ISD::FADD:
6809 case ISD::FMUL:
6810 case ISD::FMINNUM_IEEE:
6811 case ISD::FMAXNUM_IEEE:
6812 case ISD::UADDSAT:
6813 case ISD::USUBSAT:
6814 case ISD::SADDSAT:
6815 case ISD::SSUBSAT:
6816 return splitBinaryVectorOp(Op, DAG);
6817 case ISD::FCOPYSIGN:
6818 return lowerFCOPYSIGN(Op, DAG);
6819 case ISD::MUL:
6820 return lowerMUL(Op, DAG);
6821 case ISD::SMULO:
6822 case ISD::UMULO:
6823 return lowerXMULO(Op, DAG);
6824 case ISD::SMUL_LOHI:
6825 case ISD::UMUL_LOHI:
6826 return lowerXMUL_LOHI(Op, DAG);
6827 case ISD::DYNAMIC_STACKALLOC:
6828 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6829 case ISD::STACKSAVE:
6830 return LowerSTACKSAVE(Op, DAG);
6831 case ISD::GET_ROUNDING:
6832 return lowerGET_ROUNDING(Op, DAG);
6833 case ISD::SET_ROUNDING:
6834 return lowerSET_ROUNDING(Op, DAG);
6835 case ISD::PREFETCH:
6836 return lowerPREFETCH(Op, DAG);
6837 case ISD::FP_EXTEND:
6838 case ISD::STRICT_FP_EXTEND:
6839 return lowerFP_EXTEND(Op, DAG);
6840 case ISD::GET_FPENV:
6841 return lowerGET_FPENV(Op, DAG);
6842 case ISD::SET_FPENV:
6843 return lowerSET_FPENV(Op, DAG);
6844 case ISD::ROTR:
6845 return lowerROTR(Op, DAG);
6846 }
6847 return SDValue();
6848}
6849
6850// Used for D16: Casts the result of an instruction into the right vector,
6851// packs values if loads return unpacked values.
6852static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6853 const SDLoc &DL, SelectionDAG &DAG,
6854 bool Unpacked) {
6855 if (!LoadVT.isVector())
6856 return Result;
6857
6858 // Cast back to the original packed type or to a larger type that is a
6859 // multiple of 32 bit for D16. Widening the return type is required for
6860 // legalization.
6861 EVT FittingLoadVT = LoadVT;
6862 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6863 FittingLoadVT =
6864 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6865 LoadVT.getVectorNumElements() + 1);
6866 }
6867
6868 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6869 // Truncate to v2i16/v4i16.
6870 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6871
6872 // Workaround legalizer not scalarizing truncate after vector op
6873 // legalization but not creating intermediate vector trunc.
6874 SmallVector<SDValue, 4> Elts;
6875 DAG.ExtractVectorElements(Result, Elts);
6876 for (SDValue &Elt : Elts)
6877 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6878
6879 // Pad illegal v1i16/v3f16 to v4i16
6880 if ((LoadVT.getVectorNumElements() % 2) == 1)
6881 Elts.push_back(DAG.getPOISON(MVT::i16));
6882
6883 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6884
6885 // Bitcast to original type (v2f16/v4f16).
6886 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6887 }
6888
6889 // Cast back to the original packed type.
6890 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6891}
6892
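// Emit the d16 memory intrinsic with a legal (possibly widened or unpacked)
// result type, then convert the loaded value back to the requested type via
// adjustLoadValueTypeImpl above.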
6893SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6894 SelectionDAG &DAG,
6895 ArrayRef<SDValue> Ops,
6896 bool IsIntrinsic) const {
6897 SDLoc DL(M);
6898
6899 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6900 EVT LoadVT = M->getValueType(0);
6901
6902 EVT EquivLoadVT = LoadVT;
6903 if (LoadVT.isVector()) {
6904 if (Unpacked) {
6905 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6906 LoadVT.getVectorNumElements());
6907 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6908 // Widen v3f16 to legal type
6909 EquivLoadVT =
6910 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6911 LoadVT.getVectorNumElements() + 1);
6912 }
6913 }
6914
6915 // Change from v4f16/v2f16 to EquivLoadVT.
6916 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6917
6918 SDValue Load = DAG.getMemIntrinsicNode(
6919 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6920 M->getMemoryVT(), M->getMemOperand());
6921
6922 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6923
6924 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6925}
6926
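// Common path for buffer load intrinsics: pick the BUFFER_LOAD /
// BUFFER_LOAD_FORMAT (optionally TFE) node and route d16 and sub-dword
// results through the dedicated helpers.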
6927SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6928 SelectionDAG &DAG,
6929 ArrayRef<SDValue> Ops) const {
6930 SDLoc DL(M);
6931 EVT LoadVT = M->getValueType(0);
6932 EVT EltType = LoadVT.getScalarType();
6933 EVT IntVT = LoadVT.changeTypeToInteger();
6934
6935 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6936
6937 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6938 bool IsTFE = M->getNumValues() == 3;
6939
6940 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6941 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6942 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6943 : AMDGPUISD::BUFFER_LOAD;
6944
6945 if (IsD16) {
6946 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6947 }
6948
6949 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6950 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6951 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6952 IsTFE);
6953
6954 if (isTypeLegal(LoadVT)) {
6955 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6956 M->getMemOperand(), DAG);
6957 }
6958
6959 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6960 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6961 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6962 M->getMemOperand(), DAG);
6963 return DAG.getMergeValues(
6964 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6965 DL);
6966}
6967
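// Lower llvm.amdgcn.icmp: compare the two sources with the requested integer
// predicate and return the resulting lane mask as a wavefront-sized integer.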
6968static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6969 SelectionDAG &DAG) {
6970 EVT VT = N->getValueType(0);
6971 unsigned CondCode = N->getConstantOperandVal(3);
6972 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6973 return DAG.getPOISON(VT);
6974
6975 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6976
6977 SDValue LHS = N->getOperand(1);
6978 SDValue RHS = N->getOperand(2);
6979
6980 SDLoc DL(N);
6981
6982 EVT CmpVT = LHS.getValueType();
6983 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6984 unsigned PromoteOp =
6985 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6986 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6987 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6988 }
6989
6990 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6991
6992 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6993 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6994
6995 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6996 DAG.getCondCode(CCOpcode));
6997 if (VT.bitsEq(CCVT))
6998 return SetCC;
6999 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7000}
7001
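// Lower llvm.amdgcn.fcmp: same idea as the icmp intrinsic above, but with a
// floating-point predicate; f16 sources are extended to f32 when f16 is not
// legal.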
7002static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7003 SelectionDAG &DAG) {
7004 EVT VT = N->getValueType(0);
7005
7006 unsigned CondCode = N->getConstantOperandVal(3);
7007 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7008 return DAG.getPOISON(VT);
7009
7010 SDValue Src0 = N->getOperand(1);
7011 SDValue Src1 = N->getOperand(2);
7012 EVT CmpVT = Src0.getValueType();
7013 SDLoc SL(N);
7014
7015 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7016 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7017 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7018 }
7019
7020 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7021 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7022 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7023 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7024 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7025 DAG.getCondCode(CCOpcode));
7026 if (VT.bitsEq(CCVT))
7027 return SetCC;
7028 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7029}
7030
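// Lower llvm.amdgcn.ballot: fold setcc sources and constant conditions where
// possible, otherwise compare the zero-extended condition against zero.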
7031static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7032 SelectionDAG &DAG) {
7033 EVT VT = N->getValueType(0);
7034 SDValue Src = N->getOperand(1);
7035 SDLoc SL(N);
7036
7037 if (Src.getOpcode() == ISD::SETCC) {
7038 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7039 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7040 Src.getOperand(1), Src.getOperand(2));
7041 }
7042 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7043 // (ballot 0) -> 0
7044 if (Arg->isZero())
7045 return DAG.getConstant(0, SL, VT);
7046
7047 // (ballot 1) -> EXEC/EXEC_LO
7048 if (Arg->isOne()) {
7049 Register Exec;
7050 if (VT.getScalarSizeInBits() == 32)
7051 Exec = AMDGPU::EXEC_LO;
7052 else if (VT.getScalarSizeInBits() == 64)
7053 Exec = AMDGPU::EXEC;
7054 else
7055 return SDValue();
7056
7057 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7058 }
7059 }
7060
7061 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7062 // ISD::SETNE)
7063 return DAG.getNode(
7064 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7065 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7066}
7067
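// Lower lane intrinsics (readlane, writelane, permlane*, update_dpp,
// set_inactive, mov_dpp8, readfirstlane, ...) whose value type is not a
// natively supported 32-bit (or, for some DPP ops, 64-bit) piece: narrow
// values are extended to i32 and wider values are split into pieces.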
7068static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7069 SelectionDAG &DAG) {
7070 EVT VT = N->getValueType(0);
7071 unsigned ValSize = VT.getSizeInBits();
7072 unsigned IID = N->getConstantOperandVal(0);
7073 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7074 IID == Intrinsic::amdgcn_permlanex16;
7075 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7076 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7077 SDLoc SL(N);
7078 MVT IntVT = MVT::getIntegerVT(ValSize);
7079 const GCNSubtarget *ST = TLI.getSubtarget();
7080 unsigned SplitSize = 32;
7081 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7082 ST->hasDPALU_DPP() &&
7083 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7084 SplitSize = 64;
7085
7086 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7087 SDValue Src2, MVT ValT) -> SDValue {
7088 SmallVector<SDValue, 6> Operands;
7089 switch (IID) {
7090 case Intrinsic::amdgcn_permlane16:
7091 case Intrinsic::amdgcn_permlanex16:
7092 case Intrinsic::amdgcn_update_dpp:
7093 Operands.push_back(N->getOperand(6));
7094 Operands.push_back(N->getOperand(5));
7095 Operands.push_back(N->getOperand(4));
7096 [[fallthrough]];
7097 case Intrinsic::amdgcn_writelane:
7098 Operands.push_back(Src2);
7099 [[fallthrough]];
7100 case Intrinsic::amdgcn_readlane:
7101 case Intrinsic::amdgcn_set_inactive:
7102 case Intrinsic::amdgcn_set_inactive_chain_arg:
7103 case Intrinsic::amdgcn_mov_dpp8:
7104 Operands.push_back(Src1);
7105 [[fallthrough]];
7106 case Intrinsic::amdgcn_readfirstlane:
7107 case Intrinsic::amdgcn_permlane64:
7108 Operands.push_back(Src0);
7109 break;
7110 default:
7111 llvm_unreachable("unhandled lane op");
7112 }
7113
7114 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7115 std::reverse(Operands.begin(), Operands.end());
7116
7117 if (SDNode *GL = N->getGluedNode()) {
7118 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7119 GL = GL->getOperand(0).getNode();
7120 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7121 SDValue(GL, 0)));
7122 }
7123
7124 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7125 };
7126
7127 SDValue Src0 = N->getOperand(1);
7128 SDValue Src1, Src2;
7129 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7130 IID == Intrinsic::amdgcn_mov_dpp8 ||
7131 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7132 Src1 = N->getOperand(2);
7133 if (IID == Intrinsic::amdgcn_writelane ||
7134 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7135 Src2 = N->getOperand(3);
7136 }
7137
7138 if (ValSize == SplitSize) {
7139 // Already legal
7140 return SDValue();
7141 }
7142
7143 if (ValSize < 32) {
7144 bool IsFloat = VT.isFloatingPoint();
7145 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7146 SL, MVT::i32);
7147
7148 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7149 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7150 SL, MVT::i32);
7151 }
7152
7153 if (IID == Intrinsic::amdgcn_writelane) {
7154 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7155 SL, MVT::i32);
7156 }
7157
7158 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7159 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7160 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7161 }
7162
7163 if (ValSize % SplitSize != 0)
7164 return SDValue();
7165
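// Unroll a vector-typed lane op into per-element scalar ops and rebuild the
// result vector, forwarding any convergence-control glue to each piece.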
7166 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7167 EVT VT = N->getValueType(0);
7168 unsigned NE = VT.getVectorNumElements();
7169 EVT EltVT = VT.getVectorElementType();
7170 SmallVector<SDValue, 8> Scalars;
7171 unsigned NumOperands = N->getNumOperands();
7172 SmallVector<SDValue, 4> Operands(NumOperands);
7173 SDNode *GL = N->getGluedNode();
7174
7175 // only handle convergencectrl_glue
7176 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7177
7178 for (unsigned i = 0; i != NE; ++i) {
7179 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7180 ++j) {
7181 SDValue Operand = N->getOperand(j);
7182 EVT OperandVT = Operand.getValueType();
7183 if (OperandVT.isVector()) {
7184 // A vector operand; extract a single element.
7185 EVT OperandEltVT = OperandVT.getVectorElementType();
7186 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7187 Operand, DAG.getVectorIdxConstant(i, SL));
7188 } else {
7189 // A scalar operand; just use it as is.
7190 Operands[j] = Operand;
7191 }
7192 }
7193
7194 if (GL)
7195 Operands[NumOperands - 1] =
7196 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7197 SDValue(GL->getOperand(0).getNode(), 0));
7198
7199 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7200 }
7201
7202 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7203 return DAG.getBuildVector(VecVT, SL, Scalars);
7204 };
7205
7206 if (VT.isVector()) {
7207 switch (MVT::SimpleValueType EltTy =
7208 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7209 case MVT::i32:
7210 case MVT::f32:
7211 if (SplitSize == 32) {
7212 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7213 return unrollLaneOp(LaneOp.getNode());
7214 }
7215 [[fallthrough]];
7216 case MVT::i16:
7217 case MVT::f16:
7218 case MVT::bf16: {
7219 unsigned SubVecNumElt =
7220 SplitSize / VT.getVectorElementType().getSizeInBits();
7221 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7222 SmallVector<SDValue, 4> Pieces;
7223 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7224 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7225 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7226 DAG.getConstant(EltIdx, SL, MVT::i32));
7227
7228 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7229 IsPermLane16)
7230 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7231 DAG.getConstant(EltIdx, SL, MVT::i32));
7232
7233 if (IID == Intrinsic::amdgcn_writelane)
7234 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7235 DAG.getConstant(EltIdx, SL, MVT::i32));
7236
7237 Pieces.push_back(
7238 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7239 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7240 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7241 EltIdx += SubVecNumElt;
7242 }
7243 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7244 }
7245 default:
7246 // Handle all other cases by bitcasting to i32 vectors
7247 break;
7248 }
7249 }
7250
7251 MVT VecVT =
7252 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7253 Src0 = DAG.getBitcast(VecVT, Src0);
7254
7255 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7256 Src1 = DAG.getBitcast(VecVT, Src1);
7257
7258 if (IID == Intrinsic::amdgcn_writelane)
7259 Src2 = DAG.getBitcast(VecVT, Src2);
7260
7261 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7262 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7263 return DAG.getBitcast(VT, UnrolledLaneOp);
7264}
7265
7266void SITargetLowering::ReplaceNodeResults(SDNode *N,
7267 SmallVectorImpl<SDValue> &Results,
7268 SelectionDAG &DAG) const {
7269 switch (N->getOpcode()) {
7270 case ISD::INSERT_VECTOR_ELT: {
7271 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7272 Results.push_back(Res);
7273 return;
7274 }
7275 case ISD::EXTRACT_VECTOR_ELT: {
7276 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7277 Results.push_back(Res);
7278 return;
7279 }
7280 case ISD::INTRINSIC_WO_CHAIN: {
7281 unsigned IID = N->getConstantOperandVal(0);
7282 switch (IID) {
7283 case Intrinsic::amdgcn_make_buffer_rsrc:
7284 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7285 return;
7286 case Intrinsic::amdgcn_cvt_pkrtz: {
7287 SDValue Src0 = N->getOperand(1);
7288 SDValue Src1 = N->getOperand(2);
7289 SDLoc SL(N);
7290 SDValue Cvt =
7291 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7292 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7293 return;
7294 }
7295 case Intrinsic::amdgcn_cvt_pknorm_i16:
7296 case Intrinsic::amdgcn_cvt_pknorm_u16:
7297 case Intrinsic::amdgcn_cvt_pk_i16:
7298 case Intrinsic::amdgcn_cvt_pk_u16: {
7299 SDValue Src0 = N->getOperand(1);
7300 SDValue Src1 = N->getOperand(2);
7301 SDLoc SL(N);
7302 unsigned Opcode;
7303
7304 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7305 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7306 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7307 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7308 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7309 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7310 else
7311 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7312
7313 EVT VT = N->getValueType(0);
7314 if (isTypeLegal(VT))
7315 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7316 else {
7317 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7318 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7319 }
7320 return;
7321 }
7322 case Intrinsic::amdgcn_s_buffer_load: {
7323 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7324 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7325 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7326 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7327 // s_buffer_load_i8.
7328 if (!Subtarget->hasScalarSubwordLoads())
7329 return;
7330 SDValue Op = SDValue(N, 0);
7331 SDValue Rsrc = Op.getOperand(1);
7332 SDValue Offset = Op.getOperand(2);
7333 SDValue CachePolicy = Op.getOperand(3);
7334 EVT VT = Op.getValueType();
7335 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7336 SDLoc DL(Op);
7337 MachineFunction &MF = DAG.getMachineFunction();
7338 const DataLayout &DataLayout = DAG.getDataLayout();
7339 Align Alignment =
7340 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7341 MachineMemOperand *MMO = MF.getMachineMemOperand(
7342 MachinePointerInfo(),
7343 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7344 MachineMemOperand::MOInvariant,
7345 VT.getStoreSize(), Alignment);
7346 SDValue LoadVal;
7347 if (!Offset->isDivergent()) {
7348 SDValue Ops[] = {Rsrc, // source register
7349 Offset, CachePolicy};
7350 SDValue BufferLoad =
7351 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7352 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7353 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7354 } else {
7355 SDValue Ops[] = {
7356 DAG.getEntryNode(), // Chain
7357 Rsrc, // rsrc
7358 DAG.getConstant(0, DL, MVT::i32), // vindex
7359 {}, // voffset
7360 {}, // soffset
7361 {}, // offset
7362 CachePolicy, // cachepolicy
7363 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7364 };
7365 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7366 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7367 }
7368 Results.push_back(LoadVal);
7369 return;
7370 }
7371 case Intrinsic::amdgcn_dead: {
7372 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7373 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7374 return;
7375 }
7376 }
7377 break;
7378 }
7379 case ISD::INTRINSIC_W_CHAIN: {
7380 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7381 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7382 // FIXME: Hacky
7383 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7384 Results.push_back(Res.getOperand(I));
7385 }
7386 } else {
7387 Results.push_back(Res);
7388 Results.push_back(Res.getValue(1));
7389 }
7390 return;
7391 }
7392
7393 break;
7394 }
7395 case ISD::SELECT: {
7396 SDLoc SL(N);
7397 EVT VT = N->getValueType(0);
7398 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7399 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7400 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7401
7402 EVT SelectVT = NewVT;
7403 if (NewVT.bitsLT(MVT::i32)) {
7404 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7405 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7406 SelectVT = MVT::i32;
7407 }
7408
7409 SDValue NewSelect =
7410 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7411
7412 if (NewVT != SelectVT)
7413 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7414 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7415 return;
7416 }
7417 case ISD::FNEG: {
7418 if (N->getValueType(0) != MVT::v2f16)
7419 break;
7420
7421 SDLoc SL(N);
7422 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7423
7424 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7425 DAG.getConstant(0x80008000, SL, MVT::i32));
7426 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7427 return;
7428 }
7429 case ISD::FABS: {
7430 if (N->getValueType(0) != MVT::v2f16)
7431 break;
7432
7433 SDLoc SL(N);
7434 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7435
7436 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7437 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7438 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7439 return;
7440 }
7441 case ISD::FSQRT: {
7442 if (N->getValueType(0) != MVT::f16)
7443 break;
7444 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7445 break;
7446 }
7447 default:
7448 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7449 break;
7450 }
7451}
7452
7453/// Helper function for LowerBRCOND
7454static SDNode *findUser(SDValue Value, unsigned Opcode) {
7455
7456 for (SDUse &U : Value->uses()) {
7457 if (U.get() != Value)
7458 continue;
7459
7460 if (U.getUser()->getOpcode() == Opcode)
7461 return U.getUser();
7462 }
7463 return nullptr;
7464}
7465
7466unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7467 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7468 switch (Intr->getConstantOperandVal(1)) {
7469 case Intrinsic::amdgcn_if:
7470 return AMDGPUISD::IF;
7471 case Intrinsic::amdgcn_else:
7472 return AMDGPUISD::ELSE;
7473 case Intrinsic::amdgcn_loop:
7474 return AMDGPUISD::LOOP;
7475 case Intrinsic::amdgcn_end_cf:
7476 llvm_unreachable("should not occur");
7477 default:
7478 return 0;
7479 }
7480 }
7481
7482 // break, if_break, else_break are all only used as inputs to loop, not
7483 // directly as branch conditions.
7484 return 0;
7485}
7486
7493
7494bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7495 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7496 return false;
7497
7498 // FIXME: Either avoid relying on address space here or change the default
7499 // address space for functions to avoid the explicit check.
7500 return (GV->getValueType()->isFunctionTy() ||
7503}
7504
7505bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7506 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7507}
7508
7509bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7510 if (!GV->hasExternalLinkage())
7511 return true;
7512
7513 const auto OS = getTargetMachine().getTargetTriple().getOS();
7514 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7515}
7516
7517/// This transforms the control flow intrinsics to get the branch destination as
7518/// the last parameter, and also switches the branch target with BR if the need arises.
7519SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7520 SDLoc DL(BRCOND);
7521
7522 SDNode *Intr = BRCOND.getOperand(1).getNode();
7523 SDValue Target = BRCOND.getOperand(2);
7524 SDNode *BR = nullptr;
7525 SDNode *SetCC = nullptr;
7526
7527 switch (Intr->getOpcode()) {
7528 case ISD::SETCC: {
7529 // As long as we negate the condition everything is fine
7530 SetCC = Intr;
7531 Intr = SetCC->getOperand(0).getNode();
7532 break;
7533 }
7534 case ISD::XOR: {
7535 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7536 SDValue LHS = Intr->getOperand(0);
7537 SDValue RHS = Intr->getOperand(1);
7538 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7539 Intr = LHS.getNode();
7540 break;
7541 }
7542 [[fallthrough]];
7543 }
7544 default: {
7545 // Get the target from BR if we don't negate the condition
7546 BR = findUser(BRCOND, ISD::BR);
7547 assert(BR && "brcond missing unconditional branch user");
7548 Target = BR->getOperand(1);
7549 }
7550 }
7551
7552 unsigned CFNode = isCFIntrinsic(Intr);
7553 if (CFNode == 0) {
7554 // This is a uniform branch so we don't need to legalize.
7555 return BRCOND;
7556 }
7557
7558 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7559 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7560
7561 assert(!SetCC ||
7562 (SetCC->getConstantOperandVal(1) == 1 &&
7563 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7564 ISD::SETNE));
7565
7566 // operands of the new intrinsic call
7567 SmallVector<SDValue, 8> Ops;
7568 if (HaveChain)
7569 Ops.push_back(BRCOND.getOperand(0));
7570
7571 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7572 Ops.push_back(Target);
7573
7574 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7575
7576 // build the new intrinsic call
7577 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7578
7579 if (!HaveChain) {
7580 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7581
7582 Result = DAG.getMergeValues(Ops, DL).getNode();
7583 }
7584
7585 if (BR) {
7586 // Give the branch instruction our target
7587 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7588 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7589 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7590 }
7591
7592 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7593
7594 // Copy the intrinsic results to registers
7595 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7596 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7597 if (!CopyToReg)
7598 continue;
7599
7600 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7601 SDValue(Result, i - 1), SDValue());
7602
7603 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7604 }
7605
7606 // Remove the old intrinsic from the chain
7607 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7608 Intr->getOperand(0));
7609
7610 return Chain;
7611}
7612
7613SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7614 MVT VT = Op.getSimpleValueType();
7615 SDLoc DL(Op);
7616 // Checking the depth
7617 if (Op.getConstantOperandVal(0) != 0)
7618 return DAG.getConstant(0, DL, VT);
7619
7620 MachineFunction &MF = DAG.getMachineFunction();
7621 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7622 // Check for kernel and shader functions
7623 if (Info->isEntryFunction())
7624 return DAG.getConstant(0, DL, VT);
7625
7626 MachineFrameInfo &MFI = MF.getFrameInfo();
7627 // There is a call to @llvm.returnaddress in this function
7628 MFI.setReturnAddressIsTaken(true);
7629
7630 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7631 // Get the return address reg and mark it as an implicit live-in
7632 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7633 getRegClassFor(VT, Op.getNode()->isDivergent()));
7634
7635 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7636}
7637
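// Return Op extended to VT with FP_EXTEND, or rounded to VT with FP_ROUND,
// depending on which of the two types is wider.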
7638SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7639 const SDLoc &DL, EVT VT) const {
7640 return Op.getValueType().bitsLE(VT)
7641 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7642 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7643 DAG.getTargetConstant(0, DL, MVT::i32));
7644}
7645
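// Split a vector FP_ROUND into two half-width FP_ROUNDs and concatenate the
// results.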
7646SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7647 SelectionDAG &DAG) const {
7648 EVT DstVT = Op.getValueType();
7649 unsigned NumElts = DstVT.getVectorNumElements();
7650 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7651
7652 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7653
7654 SDLoc DL(Op);
7655 unsigned Opc = Op.getOpcode();
7656 SDValue Flags = Op.getOperand(1);
7657 EVT HalfDstVT =
7658 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7659 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7660 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7661
7662 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7663}
7664
7665SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7666 SDValue Src = Op.getOperand(0);
7667 EVT SrcVT = Src.getValueType();
7668 EVT DstVT = Op.getValueType();
7669
7670 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7671 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7672 if (SrcVT.getScalarType() != MVT::f32)
7673 return SDValue();
7674 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7675 }
7676
7677 if (SrcVT.getScalarType() != MVT::f64)
7678 return Op;
7679
7680 SDLoc DL(Op);
7681 if (DstVT == MVT::f16) {
7682 // TODO: Handle strictfp
7683 if (Op.getOpcode() != ISD::FP_ROUND)
7684 return Op;
7685
7686 if (!Subtarget->has16BitInsts()) {
7687 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7688 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7689 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7690 }
7691 if (Op->getFlags().hasApproximateFuncs()) {
7692 SDValue Flags = Op.getOperand(1);
7693 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7694 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7695 }
7696 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7697 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7698 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7699 }
7700
7701 assert(DstVT.getScalarType() == MVT::bf16 &&
7702 "custom lower FP_ROUND for f16 or bf16");
7703 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7704
7705 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7706 // hardware f32 -> bf16 instruction.
7707 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7708 MVT::f32;
7709 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7710 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7711 DAG.getTargetConstant(0, DL, MVT::i32));
7712}
7713
7714SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7715 SelectionDAG &DAG) const {
7716 EVT VT = Op.getValueType();
7717 const MachineFunction &MF = DAG.getMachineFunction();
7718 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7719 bool IsIEEEMode = Info->getMode().IEEE;
7720
7721 // FIXME: Assert during selection that this is only selected for
7722 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7723 // mode functions, but this happens to be OK since it's only done in cases
7724 // where there is known no sNaN.
7725 if (IsIEEEMode)
7726 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7727
7728 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7729 VT == MVT::v16bf16)
7730 return splitBinaryVectorOp(Op, DAG);
7731 return Op;
7732}
7733
7734SDValue
7735SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7736 SelectionDAG &DAG) const {
7737 EVT VT = Op.getValueType();
7738 const MachineFunction &MF = DAG.getMachineFunction();
7739 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7740 bool IsIEEEMode = Info->getMode().IEEE;
7741
7742 if (IsIEEEMode)
7743 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7744
7745 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7746 VT == MVT::v16bf16)
7747 return splitBinaryVectorOp(Op, DAG);
7748 return Op;
7749}
7750
7751SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7752 SelectionDAG &DAG) const {
7753 EVT VT = Op.getValueType();
7754 if (VT.isVector())
7755 return splitBinaryVectorOp(Op, DAG);
7756
7757 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7758 !Subtarget->hasMinimum3Maximum3F16() &&
7759 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7760 "should not need to widen f16 minimum/maximum to v2f16");
7761
7762 // Widen f16 operation to v2f16
7763
7764 // fminimum f16:x, f16:y ->
7765 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7766 // (v2f16 (scalar_to_vector y))), 0
7767 SDLoc SL(Op);
7768 SDValue WideSrc0 =
7769 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7770 SDValue WideSrc1 =
7771 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7772
7773 SDValue Widened =
7774 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7775
7776 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7777 DAG.getConstant(0, SL, MVT::i32));
7778}
7779
7780SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7781 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7782 EVT VT = Op.getValueType();
7783 assert(VT == MVT::f16);
7784
7785 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7786 EVT ExpVT = Exp.getValueType();
7787 if (ExpVT == MVT::i16)
7788 return Op;
7789
7790 SDLoc DL(Op);
7791
7792 // Correct the exponent type for f16 to i16.
7793 // Clamp the range of the exponent to the instruction's range.
7794
7795 // TODO: This should be a generic narrowing legalization, and can easily be
7796 // done for GlobalISel.
7797
7798 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7799 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7800
7801 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7802 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7803
7804 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7805
7806 if (IsStrict) {
7807 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7808 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7809 }
7810
7811 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7812}
7813
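// Return the extension opcode needed to widen the operands of Op to i32
// without changing the result of the operation.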
7814static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7815 switch (Op->getOpcode()) {
7816 case ISD::SRA:
7817 case ISD::SMIN:
7818 case ISD::SMAX:
7819 return ISD::SIGN_EXTEND;
7820 case ISD::SRL:
7821 case ISD::UMIN:
7822 case ISD::UMAX:
7823 return ISD::ZERO_EXTEND;
7824 case ISD::ADD:
7825 case ISD::SUB:
7826 case ISD::AND:
7827 case ISD::OR:
7828 case ISD::XOR:
7829 case ISD::SHL:
7830 case ISD::SELECT:
7831 case ISD::MUL:
7832 // operation result won't be influenced by garbage high bits.
7833 // TODO: are all of those cases correct, and are there more?
7834 return ISD::ANY_EXTEND;
7835 case ISD::SETCC: {
7836 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7837 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7838 }
7839 default:
7840 llvm_unreachable("unexpected opcode!");
7841 }
7842}
7843
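// Promote a uniform operation on a narrow type (e.g. i16) to i32: operands are
// widened with getExtOpcodeForPromotedOp (shift amounts always zero-extended)
// and the result is truncated back to the original type.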
7844SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7845 DAGCombinerInfo &DCI) const {
7846 const unsigned Opc = Op.getOpcode();
7847 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7848 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7849 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7850 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7851 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7852
7853 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7854 : Op->getOperand(0).getValueType();
7855 auto ExtTy = OpTy.changeElementType(MVT::i32);
7856
7857 if (DCI.isBeforeLegalizeOps() ||
7858 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7859 return SDValue();
7860
7861 auto &DAG = DCI.DAG;
7862
7863 SDLoc DL(Op);
7864 SDValue LHS;
7865 SDValue RHS;
7866 if (Opc == ISD::SELECT) {
7867 LHS = Op->getOperand(1);
7868 RHS = Op->getOperand(2);
7869 } else {
7870 LHS = Op->getOperand(0);
7871 RHS = Op->getOperand(1);
7872 }
7873
7874 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7875 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7876
7877 // Special case: for shifts, the RHS always needs a zext.
7878 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7879 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7880 else
7881 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7882
7883 // setcc always returns i1/i1 vec so no need to truncate after.
7884 if (Opc == ISD::SETCC) {
7885 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7886 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7887 }
7888
7889 // For other ops, we extend the operation's return type as well so we need to
7890 // truncate back to the original type.
7891 SDValue NewVal;
7892 if (Opc == ISD::SELECT)
7893 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7894 else
7895 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7896
7897 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7898}
7899
7900SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7901 SDValue Mag = Op.getOperand(0);
7902 EVT MagVT = Mag.getValueType();
7903
7904 if (MagVT.getVectorNumElements() > 2)
7905 return splitBinaryVectorOp(Op, DAG);
7906
7907 SDValue Sign = Op.getOperand(1);
7908 EVT SignVT = Sign.getValueType();
7909
7910 if (MagVT == SignVT)
7911 return Op;
7912
7913 // fcopysign v2f16:mag, v2f32:sign ->
7914 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7915
7916 SDLoc SL(Op);
7917 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7918 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7919
7920 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7921
7922 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7923}
7924
7925// Custom lowering for vector multiplications and s_mul_u64.
7926SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7927 EVT VT = Op.getValueType();
7928
7929 // Split vector operands.
7930 if (VT.isVector())
7931 return splitBinaryVectorOp(Op, DAG);
7932
7933 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7934
7935 // There are four ways to lower s_mul_u64:
7936 //
7937 // 1. If all the operands are uniform, then we lower it as it is.
7938 //
7939 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7940 // multiplications because there is not a vector equivalent of s_mul_u64.
7941 //
7942 // 3. If the cost model decides that it is more efficient to use vector
7943 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7944 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7945 //
7946 // 4. If the cost model decides to use vector registers and both of the
7947 // operands are zero-extended/sign-extended from 32-bits, then we split the
7948 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7949 // possible to check if the operands are zero-extended or sign-extended in
7950 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7951 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7952 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7953 // If the cost model decides that we have to use vector registers, then
7954 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7955 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7956 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7957 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7958 // SIInstrInfo.cpp .
7959
7960 if (Op->isDivergent())
7961 return SDValue();
7962
7963 SDValue Op0 = Op.getOperand(0);
7964 SDValue Op1 = Op.getOperand(1);
7965 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7966 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7967 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7968 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7969 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7970 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7971 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7972 SDLoc SL(Op);
7973 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7974 return SDValue(
7975 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7976 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7977 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7978 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7979 return SDValue(
7980 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7981 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7982 return Op;
7983}
7984
7985SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7986 EVT VT = Op.getValueType();
7987 SDLoc SL(Op);
7988 SDValue LHS = Op.getOperand(0);
7989 SDValue RHS = Op.getOperand(1);
7990 bool isSigned = Op.getOpcode() == ISD::SMULO;
7991
7992 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7993 const APInt &C = RHSC->getAPIntValue();
7994 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7995 if (C.isPowerOf2()) {
7996 // smulo(x, signed_min) is same as umulo(x, signed_min).
7997 bool UseArithShift = isSigned && !C.isMinSignedValue();
7998 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7999 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8000 SDValue Overflow =
8001 DAG.getSetCC(SL, MVT::i1,
8002 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8003 Result, ShiftAmt),
8004 LHS, ISD::SETNE);
8005 return DAG.getMergeValues({Result, Overflow}, SL);
8006 }
8007 }
8008
8009 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8010 SDValue Top =
8011 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8012
8013 SDValue Sign = isSigned
8014 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8015 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8016 SL, MVT::i32))
8017 : DAG.getConstant(0, SL, VT);
8018 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8019
8020 return DAG.getMergeValues({Result, Overflow}, SL);
8021}
8022
8023SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8024 if (Op->isDivergent()) {
8025 // Select to V_MAD_[IU]64_[IU]32.
8026 return Op;
8027 }
8028 if (Subtarget->hasSMulHi()) {
8029 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8030 return SDValue();
8031 }
8032 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8033 // calculate the high part, so we might as well do the whole thing with
8034 // V_MAD_[IU]64_[IU]32.
8035 return Op;
8036}
8037
8038SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8039 if (!Subtarget->isTrapHandlerEnabled() ||
8040 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8041 return lowerTrapEndpgm(Op, DAG);
8042
8043 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8044 : lowerTrapHsaQueuePtr(Op, DAG);
8045}
8046
8047SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8048 SDLoc SL(Op);
8049 SDValue Chain = Op.getOperand(0);
8050 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8051}
8052
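// Load a value of type VT from the implicit kernel argument area, at the
// offset of the given implicit parameter.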
8053SDValue
8054SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8055 const SDLoc &DL, Align Alignment,
8056 ImplicitParameter Param) const {
8057 MachineFunction &MF = DAG.getMachineFunction();
8058 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8059 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8060 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8061 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8062 MachineMemOperand::MODereferenceable |
8063 MachineMemOperand::MOInvariant);
8064}
8065
8066SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8067 SelectionDAG &DAG) const {
8068 SDLoc SL(Op);
8069 SDValue Chain = Op.getOperand(0);
8070
8071 SDValue QueuePtr;
8072 // For code object version 5, QueuePtr is passed through implicit kernarg.
8073 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8074 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8075 QueuePtr =
8076 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8077 } else {
8078 MachineFunction &MF = DAG.getMachineFunction();
8079 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8080 Register UserSGPR = Info->getQueuePtrUserSGPR();
8081
8082 if (UserSGPR == AMDGPU::NoRegister) {
8083 // We probably are in a function incorrectly marked with
8084 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8085 // trap, so just use a null pointer.
8086 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8087 } else {
8088 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8089 MVT::i64);
8090 }
8091 }
8092
8093 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8094 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8095
8096 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8097 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8098 ToReg.getValue(1)};
8099 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8100}
8101
8102SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8103 SDLoc SL(Op);
8104 SDValue Chain = Op.getOperand(0);
8105
8106 // We need to simulate the 's_trap 2' instruction on targets that run in
8107 // PRIV=1 (where it is treated as a nop).
8108 if (Subtarget->hasPrivEnabledTrap2NopBug())
8109 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8110
8111 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8112 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8113 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8114}
8115
8116SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8117 SDLoc SL(Op);
8118 SDValue Chain = Op.getOperand(0);
8119 MachineFunction &MF = DAG.getMachineFunction();
8120
8121 if (!Subtarget->isTrapHandlerEnabled() ||
8122 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8123 LLVMContext &Ctx = MF.getFunction().getContext();
8124 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8125 "debugtrap handler not supported",
8126 Op.getDebugLoc(), DS_Warning));
8127 return Chain;
8128 }
8129
8130 uint64_t TrapID =
8131 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8132 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8133 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8134}
8135
8136SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8137 SelectionDAG &DAG) const {
8138 if (Subtarget->hasApertureRegs()) {
8139 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8140 ? AMDGPU::SRC_SHARED_BASE
8141 : AMDGPU::SRC_PRIVATE_BASE;
8142 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8143 !Subtarget->hasGloballyAddressableScratch()) &&
8144 "Cannot use src_private_base with globally addressable scratch!");
8145 // Note: this feature (register) is broken. When used as a 32-bit operand,
8146 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8147 // bits.
8148 //
8149 // To work around the issue, emit a 64 bit copy from this register
8150 // then extract the high bits. Note that this shouldn't even result in a
8151 // shift being emitted and simply become a pair of registers (e.g.):
8152 // s_mov_b64 s[6:7], src_shared_base
8153 // v_mov_b32_e32 v1, s7
8154 SDValue Copy =
8155 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8156 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8157 }
8158
8159 // For code object version 5, private_base and shared_base are passed through
8160 // implicit kernargs.
8161 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8162 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8163 ImplicitParameter Param =
8164 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8165 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8166 }
8167
8168 MachineFunction &MF = DAG.getMachineFunction();
8169 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8170 Register UserSGPR = Info->getQueuePtrUserSGPR();
8171 if (UserSGPR == AMDGPU::NoRegister) {
8172 // We probably are in a function incorrectly marked with
8173 // amdgpu-no-queue-ptr. This is undefined.
8174 return DAG.getPOISON(MVT::i32);
8175 }
8176
8177 SDValue QueuePtr =
8178 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8179
8180 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8181 // private_segment_aperture_base_hi.
8182 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8183
8184 SDValue Ptr =
8185 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8186
8187 // TODO: Use custom target PseudoSourceValue.
8188 // TODO: We should use the value from the IR intrinsic call, but it might not
8189 // be available and how do we get it?
8190 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8191 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8192 commonAlignment(Align(64), StructOffset),
8193 MachineMemOperand::MODereferenceable |
8194 MachineMemOperand::MOInvariant);
8195}
8196
8197/// Return true if the value is a known valid address, such that a null check is
8198/// not necessary.
8199static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8200 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8201 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8202 return true;
8203
8204 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8205 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8206
8207 // TODO: Search through arithmetic, handle arguments and loads
8208 // marked nonnull.
8209 return false;
8210}
8211
8212SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8213 SelectionDAG &DAG) const {
8214 SDLoc SL(Op);
8215
8216 const AMDGPUTargetMachine &TM =
8217 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8218
8219 unsigned DestAS, SrcAS;
8220 SDValue Src;
8221 bool IsNonNull = false;
8222 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8223 SrcAS = ASC->getSrcAddressSpace();
8224 Src = ASC->getOperand(0);
8225 DestAS = ASC->getDestAddressSpace();
8226 } else {
8227 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8228 Op.getConstantOperandVal(0) ==
8229 Intrinsic::amdgcn_addrspacecast_nonnull);
8230 Src = Op->getOperand(1);
8231 SrcAS = Op->getConstantOperandVal(2);
8232 DestAS = Op->getConstantOperandVal(3);
8233 IsNonNull = true;
8234 }
8235
8236 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8237
8238 // flat -> local/private
8239 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8240 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8241 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8242 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8243
8244 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8245 Subtarget->hasGloballyAddressableScratch()) {
8246 // flat -> private with globally addressable scratch: subtract
8247 // src_flat_scratch_base_lo.
8248 SDValue FlatScratchBaseLo(
8249 DAG.getMachineNode(
8250 AMDGPU::S_MOV_B32, SL, MVT::i32,
8251 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8252 0);
8253 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8254 }
8255
8256 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8257 return Ptr;
8258
8259 unsigned NullVal = TM.getNullPointerValue(DestAS);
8260 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8261 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8262
8263 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8264 SegmentNullPtr);
8265 }
8266 }
8267
8268 // local/private -> flat
8269 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8270 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8271 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8272 SDValue CvtPtr;
8273 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8274 Subtarget->hasGloballyAddressableScratch()) {
8275 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8276 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8277 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8278 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8279 ThreadID = DAG.getNode(
8280 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8281 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8282 AllOnes, ThreadID);
8283 if (Subtarget->isWave64())
8284 ThreadID = DAG.getNode(
8285 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8286 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8287 AllOnes, ThreadID);
8288 SDValue ShAmt = DAG.getShiftAmountConstant(
8289 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
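// Worked example (added commentary): the shift is applied to the high dword
// only, so for wave32 (log2 = 5) ShAmt = 57 - 32 - 5 = 20 and the thread id
// lands at bits [56:52] of the 64-bit address, while for wave64 (log2 = 6)
// ShAmt = 19 and it lands at bits [56:51], matching the formulas above.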
8290 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8291 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8292 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8293 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8294 // 64-bit hi:lo value.
8295 SDValue FlatScratchBase = {
8296 DAG.getMachineNode(
8297 AMDGPU::S_MOV_B64, SL, MVT::i64,
8298 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8299 0};
8300 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8301 } else {
8302 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8303 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8304 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8305 }
8306
8307 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8308 return CvtPtr;
8309
8310 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8311 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8312
8313 SDValue NonNull =
8314 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8315
8316 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8317 FlatNullPtr);
8318 }
8319 }
8320
8321 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8322 Op.getValueType() == MVT::i64) {
8323 const SIMachineFunctionInfo *Info =
8324 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8325 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8326 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8327 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8328 }
8329
8330 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8331 Src.getValueType() == MVT::i64)
8332 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8333
8334 // global <-> flat are no-ops and never emitted.
8335
8336 // Invalid casts are poison.
8337 return DAG.getPOISON(Op->getValueType(0));
8338}
8339
8340// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8341// the small vector and inserting them into the big vector. That is better than
8342// the default expansion of doing it via a stack slot. Even though the use of
8343// the stack slot would be optimized away afterwards, the stack slot itself
8344// remains.
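// Illustrative example (added commentary): inserting a v2i16 subvector into a
// v8i16 vector at index 2 becomes, after the 32-bit re-bitcasting below, a
// single insert of one i32 element into a v4i32 vector at index 1, with no
// stack traffic at all.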
8345SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8346 SelectionDAG &DAG) const {
8347 SDValue Vec = Op.getOperand(0);
8348 SDValue Ins = Op.getOperand(1);
8349 SDValue Idx = Op.getOperand(2);
8350 EVT VecVT = Vec.getValueType();
8351 EVT InsVT = Ins.getValueType();
8352 EVT EltVT = VecVT.getVectorElementType();
8353 unsigned InsNumElts = InsVT.getVectorNumElements();
8354 unsigned IdxVal = Idx->getAsZExtVal();
8355 SDLoc SL(Op);
8356
8357 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8358 // Insert 32-bit registers at a time.
8359 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8360
8361 unsigned VecNumElts = VecVT.getVectorNumElements();
8362 EVT NewVecVT =
8363 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8364 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8365 : EVT::getVectorVT(*DAG.getContext(),
8366 MVT::i32, InsNumElts / 2);
8367
8368 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8369 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8370
8371 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8372 SDValue Elt;
8373 if (InsNumElts == 2) {
8374 Elt = Ins;
8375 } else {
8376 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8377 DAG.getConstant(I, SL, MVT::i32));
8378 }
8379 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8380 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8381 }
8382
8383 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8384 }
8385
8386 for (unsigned I = 0; I != InsNumElts; ++I) {
8387 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8388 DAG.getConstant(I, SL, MVT::i32));
8389 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8390 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8391 }
8392 return Vec;
8393}
8394
8395SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8396 SelectionDAG &DAG) const {
8397 SDValue Vec = Op.getOperand(0);
8398 SDValue InsVal = Op.getOperand(1);
8399 SDValue Idx = Op.getOperand(2);
8400 EVT VecVT = Vec.getValueType();
8401 EVT EltVT = VecVT.getVectorElementType();
8402 unsigned VecSize = VecVT.getSizeInBits();
8403 unsigned EltSize = EltVT.getSizeInBits();
8404 SDLoc SL(Op);
8405
8406 // Specially handle the case of v4i16 with static indexing.
8407 unsigned NumElts = VecVT.getVectorNumElements();
8408 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8409 if (NumElts == 4 && EltSize == 16 && KIdx) {
8410 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8411
8412 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8413 DAG.getConstant(0, SL, MVT::i32));
8414 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8415 DAG.getConstant(1, SL, MVT::i32));
8416
8417 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8418 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8419
8420 unsigned Idx = KIdx->getZExtValue();
8421 bool InsertLo = Idx < 2;
8422 SDValue InsHalf = DAG.getNode(
8423 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8424 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8425 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8426
8427 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8428
8429 SDValue Concat =
8430 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8431 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8432
8433 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8434 }
8435
8436 // Static indexing does not lower to stack access, and hence there is no need
8437 // for special custom lowering to avoid stack access.
8438 if (isa<ConstantSDNode>(Idx))
8439 return SDValue();
8440
8441 // Avoid stack access for dynamic indexing by custom lowering to
8442 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
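// Worked example (added commentary): for a v4i16 vector and a dynamic index
// idx, EltSize = 16, so ScaledIdx = idx << 4 and BFM = 0xFFFF << ScaledIdx
// selects the addressed half-word; the inserted value is splatted and ANDed
// with BFM, the original vector is ANDed with ~BFM, and the two results are
// ORed back together, which is exactly the v_bfi_b32 pattern above.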
8443
8444 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8445
8446 MVT IntVT = MVT::getIntegerVT(VecSize);
8447
8448 // Convert vector index to bit-index and get the required bit mask.
8449 assert(isPowerOf2_32(EltSize));
8450 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8451 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8452 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8453 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8454 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8455
8456 // 1. Create a congruent vector with the target value in each element.
8457 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8458 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8459
8460 // 2. Mask off all other indices except the required index within (1).
8461 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8462
8463 // 3. Mask off the required index within the target vector.
8464 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8465 SDValue RHS =
8466 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8467
8468 // 4. Get (2) and (3) ORed into the target vector.
8469 SDValue BFI =
8470 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8471
8472 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8473}
8474
8475SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8476 SelectionDAG &DAG) const {
8477 SDLoc SL(Op);
8478
8479 EVT ResultVT = Op.getValueType();
8480 SDValue Vec = Op.getOperand(0);
8481 SDValue Idx = Op.getOperand(1);
8482 EVT VecVT = Vec.getValueType();
8483 unsigned VecSize = VecVT.getSizeInBits();
8484 EVT EltVT = VecVT.getVectorElementType();
8485
8486 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8487
8488 // Make sure we do any optimizations that will make it easier to fold
8489 // source modifiers before obscuring it with bit operations.
8490
8491 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8492 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8493 return Combined;
8494
8495 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8496 SDValue Lo, Hi;
8497 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8498
8499 if (VecSize == 128) {
8500 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8501 Lo = DAG.getBitcast(LoVT,
8502 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8503 DAG.getConstant(0, SL, MVT::i32)));
8504 Hi = DAG.getBitcast(HiVT,
8505 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8506 DAG.getConstant(1, SL, MVT::i32)));
8507 } else if (VecSize == 256) {
8508 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8509 SDValue Parts[4];
8510 for (unsigned P = 0; P < 4; ++P) {
8511 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8512 DAG.getConstant(P, SL, MVT::i32));
8513 }
8514
8515 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8516 Parts[0], Parts[1]));
8517 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8518 Parts[2], Parts[3]));
8519 } else {
8520 assert(VecSize == 512);
8521
8522 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8523 SDValue Parts[8];
8524 for (unsigned P = 0; P < 8; ++P) {
8525 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8526 DAG.getConstant(P, SL, MVT::i32));
8527 }
8528
8529 Lo = DAG.getBitcast(LoVT,
8530 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8531 Parts[0], Parts[1], Parts[2], Parts[3]));
8532 Hi = DAG.getBitcast(HiVT,
8533 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8534 Parts[4], Parts[5], Parts[6], Parts[7]));
8535 }
8536
8537 EVT IdxVT = Idx.getValueType();
8538 unsigned NElem = VecVT.getVectorNumElements();
8539 assert(isPowerOf2_32(NElem));
8540 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8541 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8542 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8544 }
8545
8546 assert(VecSize <= 64);
8547
8548 MVT IntVT = MVT::getIntegerVT(VecSize);
8549
8550 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8551 SDValue VecBC = peekThroughBitcasts(Vec);
8552 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8553 SDValue Src = VecBC.getOperand(0);
8554 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8555 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8556 }
8557
8558 unsigned EltSize = EltVT.getSizeInBits();
8559 assert(isPowerOf2_32(EltSize));
8560
8561 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8562
8563 // Convert vector index to bit-index (* EltSize)
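// Worked example (added commentary): extracting element 3 of a v4i16 gives
// ScaledIdx = 3 << 4 = 48, so the bitcast i64 value is shifted right by 48
// and the low 16 bits of the result are the requested element.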
8564 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8565
8566 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8567 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8568
8569 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8570 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8571 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8572 }
8573
8574 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8575}
8576
8577static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8578 assert(Elt % 2 == 0);
8579 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8580}
8581
8582static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8583 assert(Elt % 2 == 0);
8584 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8585 !(Mask[Elt + 1] & 1);
8586}
8587
8588SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8589 SelectionDAG &DAG) const {
8590 SDLoc SL(Op);
8591 EVT ResultVT = Op.getValueType();
8592 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8593 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8594 const int NewSrcNumElts = 2;
8595 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8596 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8597
8598 // Break up the shuffle into registers sized pieces.
8599 //
8600 // We're trying to form sub-shuffles that the register allocation pipeline
8601 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8602 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8603 // pair of copies into a consecutive register copy, so use the ordinary
8604 // extract_vector_elt lowering unless we can use the shuffle.
8605 //
8606 // TODO: This is a bit of hack, and we should probably always use
8607 // extract_subvector for the largest possible subvector we can (or at least
8608 // use it for PackVT aligned pieces). However we have worse support for
8609 // combines on them and don't directly treat extract_subvector / insert_subvector
8610 // as legal. The DAG scheduler also ends up doing a worse job with the
8611 // extract_subvectors.
8612 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8613
8614 // vector_shuffle <0,1,6,7> lhs, rhs
8615 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8616 //
8617 // vector_shuffle <6,7,2,3> lhs, rhs
8618 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8619 //
8620 // vector_shuffle <6,7,0,1> lhs, rhs
8621 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8622
8623 // Avoid scalarizing when both halves are reading from consecutive elements.
8624
8625 // If we're treating 2 element shuffles as legal, also create odd-to-even
8626 // shuffles of neighboring pairs.
8627 //
8628 // vector_shuffle <3,2,7,6> lhs, rhs
8629 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8630 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8631
8632 SmallVector<SDValue, 16> Pieces;
8633 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8634 if (ShouldUseConsecutiveExtract &&
8635 elementPairIsContiguous(SVN->getMask(), I)) {
8636 const int Idx = SVN->getMaskElt(I);
8637 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8638 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8639 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8640 SVN->getOperand(VecIdx),
8641 DAG.getConstant(EltIdx, SL, MVT::i32));
8642 Pieces.push_back(SubVec);
8643 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8644 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8645 int Idx0 = SVN->getMaskElt(I);
8646 int Idx1 = SVN->getMaskElt(I + 1);
8647
8648 SDValue SrcOp0 = SVN->getOperand(0);
8649 SDValue SrcOp1 = SrcOp0;
8650 if (Idx0 >= SrcNumElts) {
8651 SrcOp0 = SVN->getOperand(1);
8652 Idx0 -= SrcNumElts;
8653 }
8654
8655 if (Idx1 >= SrcNumElts) {
8656 SrcOp1 = SVN->getOperand(1);
8657 Idx1 -= SrcNumElts;
8658 }
8659
8660 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8661 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8662
8663 // Extract nearest even aligned piece.
8664 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8665 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8666 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8667 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8668
8669 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8670 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8671
8672 SDValue Result0 = SubVec0;
8673 SDValue Result1 = SubVec0;
8674
8675 if (SubVec0 != SubVec1) {
8676 NewMaskIdx1 += NewSrcNumElts;
8677 Result1 = SubVec1;
8678 } else {
8679 Result1 = DAG.getPOISON(PackVT);
8680 }
8681
8682 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8683 {NewMaskIdx0, NewMaskIdx1});
8684 Pieces.push_back(Shuf);
8685 } else {
8686 const int Idx0 = SVN->getMaskElt(I);
8687 const int Idx1 = SVN->getMaskElt(I + 1);
8688 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8689 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8690 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8691 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8692
8693 SDValue Vec0 = SVN->getOperand(VecIdx0);
8694 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8695 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8696
8697 SDValue Vec1 = SVN->getOperand(VecIdx1);
8698 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8699 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8700 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8701 }
8702 }
8703
8704 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8705}
8706
8707SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8708 SelectionDAG &DAG) const {
8709 SDValue SVal = Op.getOperand(0);
8710 EVT ResultVT = Op.getValueType();
8711 EVT SValVT = SVal.getValueType();
8712 SDValue UndefVal = DAG.getPOISON(SValVT);
8713 SDLoc SL(Op);
8714
8715 SmallVector<SDValue, 16> VElts;
8716 VElts.push_back(SVal);
8717 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8718 VElts.push_back(UndefVal);
8719
8720 return DAG.getBuildVector(ResultVT, SL, VElts);
8721}
8722
8723SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8724 SelectionDAG &DAG) const {
8725 SDLoc SL(Op);
8726 EVT VT = Op.getValueType();
8727
8728 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8729 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8730
8731 SDValue Lo = Op.getOperand(0);
8732 SDValue Hi = Op.getOperand(1);
8733
8734 // Avoid adding defined bits with the zero_extend.
8735 if (Hi.isUndef()) {
8736 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8737 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8738 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8739 }
8740
8741 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8742 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8743
8744 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8745 DAG.getConstant(16, SL, MVT::i32));
8746 if (Lo.isUndef())
8747 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8748
8749 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8750 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8751
8752 SDValue Or =
8753 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8754 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8755 }
8756
8757 // Split into 2-element chunks.
8758 const unsigned NumParts = VT.getVectorNumElements() / 2;
8759 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8760 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8761
8762 SmallVector<SDValue> Casts;
8763 for (unsigned P = 0; P < NumParts; ++P) {
8764 SDValue Vec = DAG.getBuildVector(
8765 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8766 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8767 }
8768
8769 SDValue Blend =
8770 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8771 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8772}
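// Illustrative example (added commentary, not part of the original source): a
// v4i16 build_vector (a, b, c, d) is therefore lowered as two v2i16
// build_vectors, each bitcast to i32, combined into a v2i32 build_vector and
// bitcast back to v4i16.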
8773
8774bool SITargetLowering::isOffsetFoldingLegal(
8775 const GlobalAddressSDNode *GA) const {
8776 // OSes that use ELF REL relocations (instead of RELA) can only store a
8777 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8778 // which can create arbitrary 64-bit addends. (This is only a problem for
8779 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8780 // the high 32 bits of the addend.)
8781 //
8782 // This should be kept in sync with how HasRelocationAddend is initialized in
8783 // the constructor of ELFAMDGPUAsmBackend.
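// Illustrative example (added commentary): under REL the addend has to fit in
// the instruction's 32-bit literal field, so a folded offset that needs more
// than 32 addend bits (e.g. gv + 0x100000000) cannot be encoded and the
// @rel32@hi half of the address would be wrong.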
8784 if (!Subtarget->isAmdHsaOS())
8785 return false;
8786
8787 // We can fold offsets for anything that doesn't require a GOT relocation.
8788 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8789 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8790 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8791 !shouldEmitGOTReloc(GA->getGlobal());
8792}
8793
8794static SDValue
8795buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8796 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8797 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8798 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8799 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8800 // lowered to the following code sequence:
8801 //
8802 // For constant address space:
8803 // s_getpc_b64 s[0:1]
8804 // s_add_u32 s0, s0, $symbol
8805 // s_addc_u32 s1, s1, 0
8806 //
8807 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8808 // a fixup or relocation is emitted to replace $symbol with a literal
8809 // constant, which is a pc-relative offset from the encoding of the $symbol
8810 // operand to the global variable.
8811 //
8812 // For global address space:
8813 // s_getpc_b64 s[0:1]
8814 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8815 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8816 //
8817 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8818 // fixups or relocations are emitted to replace $symbol@*@lo and
8819 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8820 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8821 // operand to the global variable.
8822 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8823 assert(GAFlags != SIInstrInfo::MO_NONE);
8824
8825 SDValue Ptr =
8826 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8827 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8828 }
8829
8830 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8831 SDValue PtrHi;
8832 if (GAFlags == SIInstrInfo::MO_NONE)
8833 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8834 else
8835 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8836 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8837}
8838
8839SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8840 SDValue Op,
8841 SelectionDAG &DAG) const {
8842 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8843 SDLoc DL(GSD);
8844 EVT PtrVT = Op.getValueType();
8845
8846 const GlobalValue *GV = GSD->getGlobal();
8847 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8848 shouldUseLDSConstAddress(GV)) ||
8849 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8850 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8851 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8852 GV->hasExternalLinkage()) {
8853 Type *Ty = GV->getValueType();
8854 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8855 // zero-sized type in other languages to declare the dynamic shared
8856 // memory whose size is not known at compile time. They will be
8857 // allocated by the runtime and placed directly after the static
8858 // allocated ones. They all share the same offset.
8859 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8860 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8861 // Adjust alignment for that dynamic shared memory array.
8862 Function &F = DAG.getMachineFunction().getFunction();
8863 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8864 MFI->setUsesDynamicLDS(true);
8865 return SDValue(
8866 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8867 }
8868 }
8869 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8870 }
8871
8872 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8873 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8874 SIInstrInfo::MO_ABS32_LO);
8875 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8876 }
8877
8878 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8879 if (Subtarget->has64BitLiterals()) {
8880 SDValue Addr = DAG.getTargetGlobalAddress(
8881 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8882 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8883 0);
8884 }
8885
8886 SDValue AddrLo = DAG.getTargetGlobalAddress(
8887 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8888 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8889
8890 SDValue AddrHi = DAG.getTargetGlobalAddress(
8891 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8892 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8893
8894 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8895 }
8896
8897 if (shouldEmitFixup(GV))
8898 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8899
8900 if (shouldEmitPCReloc(GV))
8901 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8902 SIInstrInfo::MO_REL32);
8903
8904 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8905 SIInstrInfo::MO_GOTPCREL32);
8906 PointerType *PtrTy =
8907 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8908 const DataLayout &DataLayout = DAG.getDataLayout();
8909 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8910 MachinePointerInfo PtrInfo =
8911 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8912
8913 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8914 MachineMemOperand::MODereferenceable |
8915 MachineMemOperand::MOInvariant);
8916}
8917
8918SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8919 const SDLoc &DL, SDValue V) const {
8920 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8921 // the destination register.
8922 //
8923 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8924 // so we will end up with redundant moves to m0.
8925 //
8926 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8927
8928 // A Null SDValue creates a glue result.
8929 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8930 V, Chain);
8931 return SDValue(M0, 0);
8932}
8933
8934SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8935 MVT VT,
8936 unsigned Offset) const {
8937 SDLoc SL(Op);
8938 SDValue Param = lowerKernargMemParameter(
8939 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8940 // The local size values will have the hi 16-bits as zero.
8941 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8942 DAG.getValueType(VT));
8943}
8944
8945static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8946 EVT VT) {
8947 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8948 DAG.getMachineFunction().getFunction(),
8949 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8950 return DAG.getPOISON(VT);
8951}
8952
8953static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8954 EVT VT) {
8955 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8956 DAG.getMachineFunction().getFunction(),
8957 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8958 return DAG.getPOISON(VT);
8959}
8960
8961static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8962 ArrayRef<SDValue> Elts) {
8963 assert(!Elts.empty());
8964 MVT Type;
8965 unsigned NumElts = Elts.size();
8966
8967 if (NumElts <= 12) {
8968 Type = MVT::getVectorVT(MVT::f32, NumElts);
8969 } else {
8970 assert(Elts.size() <= 16);
8971 Type = MVT::v16f32;
8972 NumElts = 16;
8973 }
8974
8975 SmallVector<SDValue, 16> VecElts(NumElts);
8976 for (unsigned i = 0; i < Elts.size(); ++i) {
8977 SDValue Elt = Elts[i];
8978 if (Elt.getValueType() != MVT::f32)
8979 Elt = DAG.getBitcast(MVT::f32, Elt);
8980 VecElts[i] = Elt;
8981 }
8982 for (unsigned i = Elts.size(); i < NumElts; ++i)
8983 VecElts[i] = DAG.getPOISON(MVT::f32);
8984
8985 if (NumElts == 1)
8986 return VecElts[0];
8987 return DAG.getBuildVector(Type, DL, VecElts);
8988}
8989
8990static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8991 SDValue Src, int ExtraElts) {
8992 EVT SrcVT = Src.getValueType();
8993
8994 SmallVector<SDValue, 8> Elts;
8995
8996 if (SrcVT.isVector())
8997 DAG.ExtractVectorElements(Src, Elts);
8998 else
8999 Elts.push_back(Src);
9000
9001 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9002 while (ExtraElts--)
9003 Elts.push_back(Undef);
9004
9005 return DAG.getBuildVector(CastVT, DL, Elts);
9006}
9007
9008// Re-construct the required return value for a image load intrinsic.
9009 // This is more complicated due to the optional use of TexFailCtrl, which means
9010 // the required return type is an aggregate.
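// Illustrative example (added commentary): a dmask of 0x3 with TFE enabled
// makes the intrinsic return {v2f32, i32, chain}, while the machine
// instruction itself produces a single v3i32; the first two dwords are
// repacked into the requested vector type and the extra dword becomes the
// texture-fail status value.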
9011static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9012 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9013 bool Unpacked, bool IsD16, int DMaskPop,
9014 int NumVDataDwords, bool IsAtomicPacked16Bit,
9015 const SDLoc &DL) {
9016 // Determine the required return type. This is the same regardless of
9017 // IsTexFail flag
9018 EVT ReqRetVT = ResultTypes[0];
9019 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9020 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9021 ? (ReqRetNumElts + 1) / 2
9022 : ReqRetNumElts;
9023
9024 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9025
9026 MVT DataDwordVT =
9027 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9028
9029 MVT MaskPopVT =
9030 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9031
9032 SDValue Data(Result, 0);
9033 SDValue TexFail;
9034
9035 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9036 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9037 if (MaskPopVT.isVector()) {
9038 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9039 SDValue(Result, 0), ZeroIdx);
9040 } else {
9041 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9042 SDValue(Result, 0), ZeroIdx);
9043 }
9044 }
9045
9046 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9047 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9048 NumDataDwords - MaskPopDwords);
9049
9050 if (IsD16)
9051 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9052
9053 EVT LegalReqRetVT = ReqRetVT;
9054 if (!ReqRetVT.isVector()) {
9055 if (!Data.getValueType().isInteger())
9056 Data = DAG.getNode(ISD::BITCAST, DL,
9057 Data.getValueType().changeTypeToInteger(), Data);
9058 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9059 } else {
9060 // We need to widen the return vector to a legal type
9061 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9062 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9063 LegalReqRetVT =
9064 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9065 ReqRetVT.getVectorNumElements() + 1);
9066 }
9067 }
9068 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9069
9070 if (IsTexFail) {
9071 TexFail =
9072 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9073 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9074
9075 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9076 }
9077
9078 if (Result->getNumValues() == 1)
9079 return Data;
9080
9081 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9082}
9083
9084static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9085 SDValue *LWE, bool &IsTexFail) {
9086 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9087
9088 uint64_t Value = TexFailCtrlConst->getZExtValue();
9089 if (Value) {
9090 IsTexFail = true;
9091 }
9092
9093 SDLoc DL(TexFailCtrlConst);
9094 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9095 Value &= ~(uint64_t)0x1;
9096 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9097 Value &= ~(uint64_t)0x2;
9098
9099 return Value == 0;
9100}
9101
9102static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9103 MVT PackVectorVT,
9104 SmallVectorImpl<SDValue> &PackedAddrs,
9105 unsigned DimIdx, unsigned EndIdx,
9106 unsigned NumGradients) {
9107 SDLoc DL(Op);
9108 for (unsigned I = DimIdx; I < EndIdx; I++) {
9109 SDValue Addr = Op.getOperand(I);
9110
9111 // Gradients are packed with undef for each coordinate.
9112 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9113 // 1D: undef,dx/dh; undef,dx/dv
9114 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9115 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9116 if (((I + 1) >= EndIdx) ||
9117 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9118 I == DimIdx + NumGradients - 1))) {
9119 if (Addr.getValueType() != MVT::i16)
9120 Addr = DAG.getBitcast(MVT::i16, Addr);
9121 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9122 } else {
9123 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9124 I++;
9125 }
9126 Addr = DAG.getBitcast(MVT::f32, Addr);
9127 PackedAddrs.push_back(Addr);
9128 }
9129}
9130
9131SDValue SITargetLowering::lowerImage(SDValue Op,
9132 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9133 SelectionDAG &DAG, bool WithChain) const {
9134 SDLoc DL(Op);
9135 MachineFunction &MF = DAG.getMachineFunction();
9136 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9137 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9138 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9139 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9140 unsigned IntrOpcode = Intr->BaseOpcode;
9141 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9142 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9143 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9144
9145 SmallVector<EVT, 3> ResultTypes(Op->values());
9146 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9147 bool IsD16 = false;
9148 bool IsG16 = false;
9149 bool IsA16 = false;
9150 SDValue VData;
9151 int NumVDataDwords = 0;
9152 bool AdjustRetType = false;
9153 bool IsAtomicPacked16Bit = false;
9154
9155 // Offset of intrinsic arguments
9156 const unsigned ArgOffset = WithChain ? 2 : 1;
9157
9158 unsigned DMask;
9159 unsigned DMaskLanes = 0;
9160
9161 if (BaseOpcode->Atomic) {
9162 VData = Op.getOperand(2);
9163
9164 IsAtomicPacked16Bit =
9165 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9166 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9167
9168 bool Is64Bit = VData.getValueSizeInBits() == 64;
9169 if (BaseOpcode->AtomicX2) {
9170 SDValue VData2 = Op.getOperand(3);
9171 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9172 {VData, VData2});
9173 if (Is64Bit)
9174 VData = DAG.getBitcast(MVT::v4i32, VData);
9175
9176 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9177 DMask = Is64Bit ? 0xf : 0x3;
9178 NumVDataDwords = Is64Bit ? 4 : 2;
9179 } else {
9180 DMask = Is64Bit ? 0x3 : 0x1;
9181 NumVDataDwords = Is64Bit ? 2 : 1;
9182 }
9183 } else {
9184 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9185 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9186
9187 if (BaseOpcode->Store) {
9188 VData = Op.getOperand(2);
9189
9190 MVT StoreVT = VData.getSimpleValueType();
9191 if (StoreVT.getScalarType() == MVT::f16) {
9192 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9193 return Op; // D16 is unsupported for this instruction
9194
9195 IsD16 = true;
9196 VData = handleD16VData(VData, DAG, true);
9197 }
9198
9199 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9200 } else if (!BaseOpcode->NoReturn) {
9201 // Work out the num dwords based on the dmask popcount and underlying type
9202 // and whether packing is supported.
9203 MVT LoadVT = ResultTypes[0].getSimpleVT();
9204 if (LoadVT.getScalarType() == MVT::f16) {
9205 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9206 return Op; // D16 is unsupported for this instruction
9207
9208 IsD16 = true;
9209 }
9210
9211 // Confirm that the return type is large enough for the dmask specified
9212 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9213 (!LoadVT.isVector() && DMaskLanes > 1))
9214 return Op;
9215
9216 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
9217 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9218 // instructions.
9219 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9220 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9221 NumVDataDwords = (DMaskLanes + 1) / 2;
9222 else
9223 NumVDataDwords = DMaskLanes;
9224
9225 AdjustRetType = true;
9226 }
9227 }
9228
9229 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9230 SmallVector<SDValue, 4> VAddrs;
9231
9232 // Check for 16 bit addresses or derivatives and pack if true.
9233 MVT VAddrVT =
9234 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9235 MVT VAddrScalarVT = VAddrVT.getScalarType();
9236 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9237 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9238
9239 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9240 VAddrScalarVT = VAddrVT.getScalarType();
9241 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9242 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9243
9244 // Push back extra arguments.
9245 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9246 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9247 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9248 // Special handling of bias when A16 is on. Bias is of type half but
9249 // occupies full 32-bit.
9250 SDValue Bias = DAG.getBuildVector(
9251 MVT::v2f16, DL,
9252 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9253 VAddrs.push_back(Bias);
9254 } else {
9255 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9256 "Bias needs to be converted to 16 bit in A16 mode");
9257 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9258 }
9259 }
9260
9261 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9262 // 16 bit gradients are supported, but are tied to the A16 control
9263 // so both gradients and addresses must be 16 bit
9264 LLVM_DEBUG(
9265 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9266 "require 16 bit args for both gradients and addresses");
9267 return Op;
9268 }
9269
9270 if (IsA16) {
9271 if (!ST->hasA16()) {
9272 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9273 "support 16 bit addresses\n");
9274 return Op;
9275 }
9276 }
9277
9278 // We've dealt with incorrect input so we know that if IsA16, IsG16
9279 // are set then we have to compress/pack operands (either address,
9280 // gradient or both)
9281 // In the case where a16 and gradients are tied (no G16 support) then we
9282 // have already verified that both IsA16 and IsG16 are true
9283 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9284 // Activate g16
9285 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9286 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9287 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9288 }
9289
9290 // Add gradients (packed or unpacked)
9291 if (IsG16) {
9292 // Pack the gradients
9293 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9294 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9295 ArgOffset + Intr->GradientStart,
9296 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9297 } else {
9298 for (unsigned I = ArgOffset + Intr->GradientStart;
9299 I < ArgOffset + Intr->CoordStart; I++)
9300 VAddrs.push_back(Op.getOperand(I));
9301 }
9302
9303 // Add addresses (packed or unpacked)
9304 if (IsA16) {
9305 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9306 ArgOffset + Intr->CoordStart, VAddrEnd,
9307 0 /* No gradients */);
9308 } else {
9309 // Add uncompressed address
9310 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9311 VAddrs.push_back(Op.getOperand(I));
9312 }
9313
9314 // If the register allocator cannot place the address registers contiguously
9315 // without introducing moves, then using the non-sequential address encoding
9316 // is always preferable, since it saves VALU instructions and is usually a
9317 // wash in terms of code size or even better.
9318 //
9319 // However, we currently have no way of hinting to the register allocator that
9320 // MIMG addresses should be placed contiguously when it is possible to do so,
9321 // so force non-NSA for the common 2-address case as a heuristic.
9322 //
9323 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9324 // allocation when possible.
9325 //
9326 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9327 // set of the remaining addresses.
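// Illustrative example (added commentary): with a hypothetical NSAMaxSize of
// 5 and 7 address dwords, the partial-NSA path below passes the first
// NSAMaxSize - 1 = 4 addresses as individual operands and packs the
// remaining 3 into a single contiguous vector register.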
9328 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9329 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9330 const bool UseNSA = ST->hasNSAEncoding() &&
9331 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9332 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9333 const bool UsePartialNSA =
9334 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9335
9336 SDValue VAddr;
9337 if (UsePartialNSA) {
9338 VAddr = getBuildDwordsVector(DAG, DL,
9339 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9340 } else if (!UseNSA) {
9341 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9342 }
9343
9344 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9345 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9346 SDValue Unorm;
9347 if (!BaseOpcode->Sampler) {
9348 Unorm = True;
9349 } else {
9350 uint64_t UnormConst =
9351 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9352
9353 Unorm = UnormConst ? True : False;
9354 }
9355
9356 SDValue TFE;
9357 SDValue LWE;
9358 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9359 bool IsTexFail = false;
9360 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9361 return Op;
9362
9363 if (IsTexFail) {
9364 if (!DMaskLanes) {
9365 // Expecting to get an error flag since TFC is on - and dmask is 0
9366 // Force dmask to be at least 1 otherwise the instruction will fail
9367 DMask = 0x1;
9368 DMaskLanes = 1;
9369 NumVDataDwords = 1;
9370 }
9371 NumVDataDwords += 1;
9372 AdjustRetType = true;
9373 }
9374
9375 // Has something earlier tagged that the return type needs adjusting
9376 // This happens if the instruction is a load or has set TexFailCtrl flags
9377 if (AdjustRetType) {
9378 // NumVDataDwords reflects the true number of dwords required in the return
9379 // type
9380 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9381 // This is a no-op load. This can be eliminated
9382 SDValue Undef = DAG.getPOISON(Op.getValueType());
9383 if (isa<MemSDNode>(Op))
9384 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9385 return Undef;
9386 }
9387
9388 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9389 MVT::i32, NumVDataDwords)
9390 : MVT::i32;
9391
9392 ResultTypes[0] = NewVT;
9393 if (ResultTypes.size() == 3) {
9394 // Original result was aggregate type used for TexFailCtrl results
9395 // The actual instruction returns as a vector type which has now been
9396 // created. Remove the aggregate result.
9397 ResultTypes.erase(&ResultTypes[1]);
9398 }
9399 }
9400
9401 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9402 if (BaseOpcode->Atomic)
9403 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9404 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9405 AMDGPU::CPol::VOLATILE))
9406 return Op;
9407
9409 if (BaseOpcode->Store || BaseOpcode->Atomic)
9410 Ops.push_back(VData); // vdata
9411 if (UsePartialNSA) {
9412 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9413 Ops.push_back(VAddr);
9414 } else if (UseNSA)
9415 append_range(Ops, VAddrs);
9416 else
9417 Ops.push_back(VAddr);
9418 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9419 EVT RsrcVT = Rsrc.getValueType();
9420 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9421 return Op;
9422 Ops.push_back(Rsrc);
9423 if (BaseOpcode->Sampler) {
9424 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9425 if (Samp.getValueType() != MVT::v4i32)
9426 return Op;
9427 Ops.push_back(Samp);
9428 }
9429 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9430 if (IsGFX10Plus)
9431 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9432 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9433 Ops.push_back(Unorm);
9434 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9435 Ops.push_back(IsA16 && // r128, a16 for gfx9
9436 ST->hasFeature(AMDGPU::FeatureR128A16)
9437 ? True
9438 : False);
9439 if (IsGFX10Plus)
9440 Ops.push_back(IsA16 ? True : False);
9441
9442 if (!Subtarget->hasGFX90AInsts())
9443 Ops.push_back(TFE); // tfe
9444 else if (TFE->getAsZExtVal()) {
9445 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9446 MF.getFunction(),
9447 "TFE is not supported on this GPU", DL.getDebugLoc()));
9448 }
9449
9450 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9451 Ops.push_back(LWE); // lwe
9452 if (!IsGFX10Plus)
9453 Ops.push_back(DimInfo->DA ? True : False);
9454 if (BaseOpcode->HasD16)
9455 Ops.push_back(IsD16 ? True : False);
9456 if (isa<MemSDNode>(Op))
9457 Ops.push_back(Op.getOperand(0)); // chain
9458
9459 int NumVAddrDwords =
9460 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9461 int Opcode = -1;
9462
9463 if (IsGFX12Plus) {
9464 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9465 NumVDataDwords, NumVAddrDwords);
9466 } else if (IsGFX11Plus) {
9467 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9468 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9469 : AMDGPU::MIMGEncGfx11Default,
9470 NumVDataDwords, NumVAddrDwords);
9471 } else if (IsGFX10Plus) {
9472 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9473 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9474 : AMDGPU::MIMGEncGfx10Default,
9475 NumVDataDwords, NumVAddrDwords);
9476 } else {
9477 if (Subtarget->hasGFX90AInsts()) {
9478 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9479 NumVDataDwords, NumVAddrDwords);
9480 if (Opcode == -1) {
9481 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9482 MF.getFunction(),
9483 "requested image instruction is not supported on this GPU",
9484 DL.getDebugLoc()));
9485
9486 unsigned Idx = 0;
9487 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9488 for (EVT VT : OrigResultTypes) {
9489 if (VT == MVT::Other)
9490 RetValues[Idx++] = Op.getOperand(0); // Chain
9491 else
9492 RetValues[Idx++] = DAG.getPOISON(VT);
9493 }
9494
9495 return DAG.getMergeValues(RetValues, DL);
9496 }
9497 }
9498 if (Opcode == -1 &&
9499 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9500 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9501 NumVDataDwords, NumVAddrDwords);
9502 if (Opcode == -1)
9503 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9504 NumVDataDwords, NumVAddrDwords);
9505 }
9506 if (Opcode == -1)
9507 return Op;
9508
9509 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9510 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9511 MachineMemOperand *MemRef = MemOp->getMemOperand();
9512 DAG.setNodeMemRefs(NewNode, {MemRef});
9513 }
9514
9515 if (BaseOpcode->AtomicX2) {
9516 SmallVector<SDValue, 1> Elt;
9517 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9518 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9519 }
9520 if (BaseOpcode->NoReturn)
9521 return SDValue(NewNode, 0);
9522 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9523 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9524 NumVDataDwords, IsAtomicPacked16Bit, DL);
9525}
9526
9527SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9528 SDValue Offset, SDValue CachePolicy,
9529 SelectionDAG &DAG) const {
9530 MachineFunction &MF = DAG.getMachineFunction();
9531
9532 const DataLayout &DataLayout = DAG.getDataLayout();
9533 Align Alignment =
9534 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9535
9536 MachineMemOperand *MMO = MF.getMachineMemOperand(
9537 MachinePointerInfo(),
9538 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9539 MachineMemOperand::MOInvariant,
9540 VT.getStoreSize(), Alignment);
9541
9542 if (!Offset->isDivergent()) {
9543 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9544
9545 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9546 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9547 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9548 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9549 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9550 SDValue BufferLoad =
9551 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9552 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9553 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9554 }
9555
9556 // Widen vec3 load to vec4.
9557 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9558 !Subtarget->hasScalarDwordx3Loads()) {
9559 EVT WidenedVT =
9560 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9561 auto WidenedOp = DAG.getMemIntrinsicNode(
9562 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9563 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9564 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9565 DAG.getVectorIdxConstant(0, DL));
9566 return Subvector;
9567 }
9568
9569 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9570 DAG.getVTList(VT), Ops, VT, MMO);
9571 }
9572
9573 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9574 // assume that the buffer is unswizzled.
9575 SDValue Ops[] = {
9576 DAG.getEntryNode(), // Chain
9577 Rsrc, // rsrc
9578 DAG.getConstant(0, DL, MVT::i32), // vindex
9579 {}, // voffset
9580 {}, // soffset
9581 {}, // offset
9582 CachePolicy, // cachepolicy
9583 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9584 };
9585 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9586 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9587 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9588 }
9589
9590 SmallVector<SDValue, 4> Loads;
9591 unsigned NumLoads = 1;
9592 MVT LoadVT = VT.getSimpleVT();
9593 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9594 assert((LoadVT.getScalarType() == MVT::i32 ||
9595 LoadVT.getScalarType() == MVT::f32));
9596
9597 if (NumElts == 8 || NumElts == 16) {
9598 NumLoads = NumElts / 4;
9599 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9600 }
9601
9602 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9603
9604 // Use the alignment to ensure that the required offsets will fit into the
9605 // immediate offsets.
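// Illustrative example (added commentary): a 512-bit (v16i32) result is split
// into NumLoads = 4 v4i32 loads at byte offsets +0, +16, +32 and +48 from the
// base, and aligning the base offset to 16 * NumLoads = 64 keeps each of
// those immediate offsets encodable.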
9606 setBufferOffsets(Offset, DAG, &Ops[3],
9607 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9608
9609 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9610 for (unsigned i = 0; i < NumLoads; ++i) {
9611 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9612 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9613 LoadVT, MMO, DAG));
9614 }
9615
9616 if (NumElts == 8 || NumElts == 16)
9617 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9618
9619 return Loads[0];
9620}
9621
9622SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9623 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9624 if (!Subtarget->hasArchitectedSGPRs())
9625 return {};
9626 SDLoc SL(Op);
9627 MVT VT = MVT::i32;
9628 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9629 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9630 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9631}
9632
9633SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9634 AMDGPU::Hwreg::Id HwReg,
9635 unsigned LowBit,
9636 unsigned Width) const {
9637 SDLoc SL(Op);
9638 using namespace AMDGPU::Hwreg;
9639 return {DAG.getMachineNode(
9640 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9641 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9642 SL, MVT::i32)),
9643 0};
9644}
9645
9646SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9647 unsigned Dim,
9648 const ArgDescriptor &Arg) const {
9649 SDLoc SL(Op);
9650 MachineFunction &MF = DAG.getMachineFunction();
9651 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9652 if (MaxID == 0)
9653 return DAG.getConstant(0, SL, MVT::i32);
9654
9655 // It's undefined behavior if a function marked with the amdgpu-no-*
9656 // attributes uses the corresponding intrinsic.
9657 if (!Arg)
9658 return DAG.getPOISON(Op->getValueType(0));
9659
9660 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9661 SDLoc(DAG.getEntryNode()), Arg);
9662
9663 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9664 // masking operations anyway.
9665 //
9666 // TODO: We could assert the top bit is 0 for the source copy.
9667 if (Arg.isMasked())
9668 return Val;
9669
9670 // Preserve the known bits after expansion to a copy.
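// Illustrative example (added commentary): with MaxID = 1023 the value needs
// bit_width(1023) = 10 bits, so the AssertZext below uses i10 and later
// combines know that bits [31:10] of the copied register are zero.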
9671 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9672 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9673 DAG.getValueType(SmallVT));
9674}
9675
9676SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9677 SelectionDAG &DAG) const {
9678 MachineFunction &MF = DAG.getMachineFunction();
9679 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9680
9681 EVT VT = Op.getValueType();
9682 SDLoc DL(Op);
9683 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9684
9685 // TODO: Should this propagate fast-math-flags?
9686
9687 switch (IntrinsicID) {
9688 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9689 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9690 return emitNonHSAIntrinsicError(DAG, DL, VT);
9691 return getPreloadedValue(DAG, *MFI, VT,
9692 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9693 }
9694 case Intrinsic::amdgcn_dispatch_ptr:
9695 case Intrinsic::amdgcn_queue_ptr: {
9696 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9697 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9698 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9699 DL.getDebugLoc()));
9700 return DAG.getPOISON(VT);
9701 }
9702
9703 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9704 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9705 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9706 return getPreloadedValue(DAG, *MFI, VT, RegID);
9707 }
9708 case Intrinsic::amdgcn_implicitarg_ptr: {
9709 if (MFI->isEntryFunction())
9710 return getImplicitArgPtr(DAG, DL);
9711 return getPreloadedValue(DAG, *MFI, VT,
9712 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9713 }
9714 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9715 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9716 // This only makes sense to call in a kernel, so just lower to null.
9717 return DAG.getConstant(0, DL, VT);
9718 }
9719
9720 return getPreloadedValue(DAG, *MFI, VT,
9721 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9722 }
9723 case Intrinsic::amdgcn_dispatch_id: {
9724 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9725 }
9726 case Intrinsic::amdgcn_rcp:
9727 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9728 case Intrinsic::amdgcn_rsq:
9729 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9730 case Intrinsic::amdgcn_rsq_legacy:
9731 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9732 return emitRemovedIntrinsicError(DAG, DL, VT);
9733 return SDValue();
9734 case Intrinsic::amdgcn_rcp_legacy:
9735 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9736 return emitRemovedIntrinsicError(DAG, DL, VT);
9737 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9738 case Intrinsic::amdgcn_rsq_clamp: {
9739 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9740 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9741
9742 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9743 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9744 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9745
9746 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9747 SDValue Tmp =
9748 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9749 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9750 DAG.getConstantFP(Min, DL, VT));
9751 }
9752 case Intrinsic::r600_read_ngroups_x:
9753 if (Subtarget->isAmdHsaOS())
9754 return emitNonHSAIntrinsicError(DAG, DL, VT);
9755
9756 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9757 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9758 false);
9759 case Intrinsic::r600_read_ngroups_y:
9760 if (Subtarget->isAmdHsaOS())
9761 return emitNonHSAIntrinsicError(DAG, DL, VT);
9762
9763 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9764 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9765 false);
9766 case Intrinsic::r600_read_ngroups_z:
9767 if (Subtarget->isAmdHsaOS())
9768 return emitNonHSAIntrinsicError(DAG, DL, VT);
9769
9770 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9771 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9772 false);
9773 case Intrinsic::r600_read_local_size_x:
9774 if (Subtarget->isAmdHsaOS())
9775 return emitNonHSAIntrinsicError(DAG, DL, VT);
9776
9777 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9778 SI::KernelInputOffsets::LOCAL_SIZE_X);
9779 case Intrinsic::r600_read_local_size_y:
9780 if (Subtarget->isAmdHsaOS())
9781 return emitNonHSAIntrinsicError(DAG, DL, VT);
9782
9783 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9784 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9785 case Intrinsic::r600_read_local_size_z:
9786 if (Subtarget->isAmdHsaOS())
9787 return emitNonHSAIntrinsicError(DAG, DL, VT);
9788
9789 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9790 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9791 case Intrinsic::amdgcn_workgroup_id_x:
9792 return lowerWorkGroupId(DAG, *MFI, VT,
9796 case Intrinsic::amdgcn_workgroup_id_y:
9797 return lowerWorkGroupId(DAG, *MFI, VT,
9801 case Intrinsic::amdgcn_workgroup_id_z:
9802 return lowerWorkGroupId(DAG, *MFI, VT,
9806 case Intrinsic::amdgcn_cluster_id_x:
9807 return Subtarget->hasClusters()
9808 ? getPreloadedValue(DAG, *MFI, VT,
9810 : DAG.getPOISON(VT);
9811 case Intrinsic::amdgcn_cluster_id_y:
9812 return Subtarget->hasClusters()
9813 ? getPreloadedValue(DAG, *MFI, VT,
9815 : DAG.getPOISON(VT);
9816 case Intrinsic::amdgcn_cluster_id_z:
9817 return Subtarget->hasClusters()
9818 ? getPreloadedValue(DAG, *MFI, VT,
9820 : DAG.getPOISON(VT);
9821 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9822 return Subtarget->hasClusters()
9823 ? getPreloadedValue(
9824 DAG, *MFI, VT,
9826 : DAG.getPOISON(VT);
9827 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9828 return Subtarget->hasClusters()
9829 ? getPreloadedValue(
9830 DAG, *MFI, VT,
9832 : DAG.getPOISON(VT);
9833 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9834 return Subtarget->hasClusters()
9835 ? getPreloadedValue(
9836 DAG, *MFI, VT,
9838 : DAG.getPOISON(VT);
9839 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9840 return Subtarget->hasClusters()
9841 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9842 : SDValue();
9843 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9844 return Subtarget->hasClusters()
9845 ? getPreloadedValue(
9846 DAG, *MFI, VT,
9848 : DAG.getPOISON(VT);
9849 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9850 return Subtarget->hasClusters()
9851 ? getPreloadedValue(
9852 DAG, *MFI, VT,
9854 : DAG.getPOISON(VT);
9855 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9856 return Subtarget->hasClusters()
9857 ? getPreloadedValue(
9858 DAG, *MFI, VT,
9860 : DAG.getPOISON(VT);
9861 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9862 return Subtarget->hasClusters()
9863 ? getPreloadedValue(
9864 DAG, *MFI, VT,
9866 : DAG.getPOISON(VT);
9867 case Intrinsic::amdgcn_wave_id:
9868 return lowerWaveID(DAG, Op);
9869 case Intrinsic::amdgcn_lds_kernel_id: {
9870 if (MFI->isEntryFunction())
9871 return getLDSKernelId(DAG, DL);
9872 return getPreloadedValue(DAG, *MFI, VT,
9873 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9874 }
9875 case Intrinsic::amdgcn_workitem_id_x:
9876 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9877 case Intrinsic::amdgcn_workitem_id_y:
9878 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9879 case Intrinsic::amdgcn_workitem_id_z:
9880 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9881 case Intrinsic::amdgcn_wavefrontsize:
9882 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9883 SDLoc(Op), MVT::i32);
9884 case Intrinsic::amdgcn_s_buffer_load: {
9885 unsigned CPol = Op.getConstantOperandVal(3);
9886 // s_buffer_load, because of how it's optimized, can't be volatile
9887 // so reject ones with the volatile bit set.
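// The check below masks the cache-policy operand against the bits supported
// by the current generation and returns the node unchanged if any other bit
// is set.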
9888 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9889 ? AMDGPU::CPol::ALL
9890 : AMDGPU::CPol::ALL_pregfx12))
9891 return Op;
9892 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9893 Op.getOperand(3), DAG);
9894 }
9895 case Intrinsic::amdgcn_fdiv_fast:
9896 return lowerFDIV_FAST(Op, DAG);
9897 case Intrinsic::amdgcn_sin:
9898 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9899
9900 case Intrinsic::amdgcn_cos:
9901 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9902
9903 case Intrinsic::amdgcn_mul_u24:
9904 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9905 Op.getOperand(2));
9906 case Intrinsic::amdgcn_mul_i24:
9907 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9908 Op.getOperand(2));
9909
9910 case Intrinsic::amdgcn_log_clamp: {
9911 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9912 return SDValue();
9913
9914 return emitRemovedIntrinsicError(DAG, DL, VT);
9915 }
9916 case Intrinsic::amdgcn_fract:
9917 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9918
9919 case Intrinsic::amdgcn_class:
9920 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9921 Op.getOperand(2));
9922 case Intrinsic::amdgcn_div_fmas:
9923 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9924 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9925
9926 case Intrinsic::amdgcn_div_fixup:
9927 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9928 Op.getOperand(2), Op.getOperand(3));
9929
9930 case Intrinsic::amdgcn_div_scale: {
9931 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9932
9933 // Translate to the operands expected by the machine instruction. The
9934 // first parameter must be the same as the first instruction.
9935 SDValue Numerator = Op.getOperand(1);
9936 SDValue Denominator = Op.getOperand(2);
9937
9938 // Note this order is opposite of the machine instruction's operations,
9939 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9940 // intrinsic has the numerator as the first operand to match a normal
9941 // division operation.
9942
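// For example, llvm.amdgcn.div.scale(num, den, true) becomes
// DIV_SCALE(num, den, num), while a false selector yields
// DIV_SCALE(den, den, num).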
9943 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9944
9945 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9946 Denominator, Numerator);
9947 }
9948 case Intrinsic::amdgcn_icmp: {
9949 // There is a Pat that handles this variant, so return it as-is.
9950 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9951 Op.getConstantOperandVal(2) == 0 &&
9952 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9953 return Op;
9954 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9955 }
9956 case Intrinsic::amdgcn_fcmp: {
9957 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9958 }
9959 case Intrinsic::amdgcn_ballot:
9960 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9961 case Intrinsic::amdgcn_fmed3:
9962 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9963 Op.getOperand(2), Op.getOperand(3));
9964 case Intrinsic::amdgcn_fdot2:
9965 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9966 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9967 case Intrinsic::amdgcn_fmul_legacy:
9968 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9969 Op.getOperand(2));
9970 case Intrinsic::amdgcn_sffbh:
9971 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9972 case Intrinsic::amdgcn_sbfe:
9973 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9974 Op.getOperand(2), Op.getOperand(3));
9975 case Intrinsic::amdgcn_ubfe:
9976 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9977 Op.getOperand(2), Op.getOperand(3));
9978 case Intrinsic::amdgcn_cvt_pkrtz:
9979 case Intrinsic::amdgcn_cvt_pknorm_i16:
9980 case Intrinsic::amdgcn_cvt_pknorm_u16:
9981 case Intrinsic::amdgcn_cvt_pk_i16:
9982 case Intrinsic::amdgcn_cvt_pk_u16: {
9983 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9984 EVT VT = Op.getValueType();
9985 unsigned Opcode;
9986
9987 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9988 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9989 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9990 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9991 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9992 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9993 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9994 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9995 else
9996 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9997
9998 if (isTypeLegal(VT))
9999 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10000
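// The requested vector type is not legal here, so produce the packed value
// as an i32 and bitcast it to the expected result type.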
10001 SDValue Node =
10002 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10003 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10004 }
10005 case Intrinsic::amdgcn_fmad_ftz:
10006 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10007 Op.getOperand(2), Op.getOperand(3));
10008
10009 case Intrinsic::amdgcn_if_break:
10010 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10011 Op->getOperand(1), Op->getOperand(2)),
10012 0);
10013
10014 case Intrinsic::amdgcn_groupstaticsize: {
10015 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10016 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10017 return Op;
10018
10019 const Module *M = MF.getFunction().getParent();
10020 const GlobalValue *GV =
10021 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10022 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10023 SIInstrInfo::MO_ABS32_LO);
10024 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10025 }
10026 case Intrinsic::amdgcn_is_shared:
10027 case Intrinsic::amdgcn_is_private: {
10028 SDLoc SL(Op);
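// Both queries are answered from the high 32 bits of the 64-bit flat
// pointer: extract them and compare against the base of the LDS or scratch
// aperture.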
10029 SDValue SrcVec =
10030 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10031 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10032 DAG.getConstant(1, SL, MVT::i32));
10033
10034 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10035 ? AMDGPUAS::LOCAL_ADDRESS
10036 : AMDGPUAS::PRIVATE_ADDRESS;
10037 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10038 Subtarget->hasGloballyAddressableScratch()) {
10039 SDValue FlatScratchBaseHi(
10040 DAG.getMachineNode(
10041 AMDGPU::S_MOV_B32, DL, MVT::i32,
10042 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10043 0);
10044 // Test bits 63..58 against the aperture address.
10045 return DAG.getSetCC(
10046 SL, MVT::i1,
10047 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10048 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10049 }
10050
10051 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10052 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10053 }
10054 case Intrinsic::amdgcn_perm:
10055 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10056 Op.getOperand(2), Op.getOperand(3));
10057 case Intrinsic::amdgcn_reloc_constant: {
10058 Module *M = MF.getFunction().getParent();
10059 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10060 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10061 auto *RelocSymbol = cast<GlobalVariable>(
10062 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10063 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10064 SIInstrInfo::MO_ABS32_LO);
10065 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10066 }
10067 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10068 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10069 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10070 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10071 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10072 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10073 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10074 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10075 if (Op.getOperand(4).getValueType() == MVT::i32)
10076 return SDValue();
10077
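// Normalize the index key operand to i32 (any-extend or truncate) and
// re-emit the intrinsic so that later selection only ever sees an i32 index
// key.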
10078 SDLoc SL(Op);
10079 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10080 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10081 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10082 Op.getOperand(3), IndexKeyi32);
10083 }
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10085 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10086 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10088 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10089 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10090 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10091 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10092 if (Op.getOperand(4).getValueType() == MVT::i64)
10093 return SDValue();
10094
10095 SDLoc SL(Op);
10096 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10097 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10098 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10099 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10100 Op.getOperand(6)});
10101 }
10102 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10103 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10104 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10105 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10106 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10107 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10108 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10109 ? MVT::i64
10110 : MVT::i32;
10111 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10112 return SDValue();
10113
10114 SDLoc SL(Op);
10115 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10116 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10117 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10118 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10119 IndexKey, Op.getOperand(7),
10120 Op.getOperand(8)}); // No clamp operand
10121 }
10122 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10123 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10124 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10125 if (Op.getOperand(6).getValueType() == MVT::i32)
10126 return SDValue();
10127
10128 SDLoc SL(Op);
10129 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10130 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10131 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10132 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10133 IndexKeyi32, Op.getOperand(7)});
10134 }
10135 case Intrinsic::amdgcn_addrspacecast_nonnull:
10136 return lowerADDRSPACECAST(Op, DAG);
10137 case Intrinsic::amdgcn_readlane:
10138 case Intrinsic::amdgcn_readfirstlane:
10139 case Intrinsic::amdgcn_writelane:
10140 case Intrinsic::amdgcn_permlane16:
10141 case Intrinsic::amdgcn_permlanex16:
10142 case Intrinsic::amdgcn_permlane64:
10143 case Intrinsic::amdgcn_set_inactive:
10144 case Intrinsic::amdgcn_set_inactive_chain_arg:
10145 case Intrinsic::amdgcn_mov_dpp8:
10146 case Intrinsic::amdgcn_update_dpp:
10147 return lowerLaneOp(*this, Op.getNode(), DAG);
10148 case Intrinsic::amdgcn_dead: {
10150 for (const EVT ValTy : Op.getNode()->values())
10151 Poisons.push_back(DAG.getPOISON(ValTy));
10152 return DAG.getMergeValues(Poisons, SDLoc(Op));
10153 }
10154 default:
10155 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10156 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10157 return lowerImage(Op, ImageDimIntr, DAG, false);
10158
10159 return Op;
10160 }
10161}
10162
10163// On targets not supporting constant in soffset field, turn zero to
10164// SGPR_NULL to avoid generating an extra s_mov with zero.
10165 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10166 const GCNSubtarget *Subtarget) {
10167 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10168 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10169 return SOffset;
10170}
10171
10172SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10173 SelectionDAG &DAG,
10174 unsigned NewOpcode) const {
10175 SDLoc DL(Op);
10176
10177 SDValue VData = Op.getOperand(2);
10178 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10179 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10180 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10181 SDValue Ops[] = {
10182 Op.getOperand(0), // Chain
10183 VData, // vdata
10184 Rsrc, // rsrc
10185 DAG.getConstant(0, DL, MVT::i32), // vindex
10186 VOffset, // voffset
10187 SOffset, // soffset
10188 Offset, // offset
10189 Op.getOperand(6), // cachepolicy
10190 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10191 };
10192
10193 auto *M = cast<MemSDNode>(Op);
10194
10195 EVT MemVT = VData.getValueType();
10196 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10197 M->getMemOperand());
10198}
10199
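// Same as lowerRawBufferAtomicIntrin above, except that the struct variants
// carry an explicit vindex operand (shifting the remaining operands by one)
// and set the idxen bit.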
10200SDValue
10201SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10202 unsigned NewOpcode) const {
10203 SDLoc DL(Op);
10204
10205 SDValue VData = Op.getOperand(2);
10206 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10207 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10208 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10209 SDValue Ops[] = {
10210 Op.getOperand(0), // Chain
10211 VData, // vdata
10212 Rsrc, // rsrc
10213 Op.getOperand(4), // vindex
10214 VOffset, // voffset
10215 SOffset, // soffset
10216 Offset, // offset
10217 Op.getOperand(7), // cachepolicy
10218 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10219 };
10220
10221 auto *M = cast<MemSDNode>(Op);
10222
10223 EVT MemVT = VData.getValueType();
10224 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10225 M->getMemOperand());
10226}
10227
10228SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10229 SelectionDAG &DAG) const {
10230 unsigned IntrID = Op.getConstantOperandVal(1);
10231 SDLoc DL(Op);
10232
10233 switch (IntrID) {
10234 case Intrinsic::amdgcn_ds_ordered_add:
10235 case Intrinsic::amdgcn_ds_ordered_swap: {
10236 MemSDNode *M = cast<MemSDNode>(Op);
10237 SDValue Chain = M->getOperand(0);
10238 SDValue M0 = M->getOperand(2);
10239 SDValue Value = M->getOperand(3);
10240 unsigned IndexOperand = M->getConstantOperandVal(7);
10241 unsigned WaveRelease = M->getConstantOperandVal(8);
10242 unsigned WaveDone = M->getConstantOperandVal(9);
10243
10244 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10245 IndexOperand &= ~0x3f;
10246 unsigned CountDw = 0;
10247
10248 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10249 CountDw = (IndexOperand >> 24) & 0xf;
10250 IndexOperand &= ~(0xf << 24);
10251
10252 if (CountDw < 1 || CountDw > 4) {
10253 const Function &Fn = DAG.getMachineFunction().getFunction();
10254 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10255 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10256 DL.getDebugLoc()));
10257 CountDw = 1;
10258 }
10259 }
10260
10261 if (IndexOperand) {
10262 const Function &Fn = DAG.getMachineFunction().getFunction();
10263 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10264 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10265 }
10266
10267 if (WaveDone && !WaveRelease) {
10268 // TODO: Move this to IR verifier
10269 const Function &Fn = DAG.getMachineFunction().getFunction();
10270 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10271 Fn, "ds_ordered_count: wave_done requires wave_release",
10272 DL.getDebugLoc()));
10273 }
10274
10275 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10276 unsigned ShaderType =
10277 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10278 unsigned Offset0 = OrderedCountIndex << 2;
10279 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10280
10281 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10282 Offset1 |= (CountDw - 1) << 6;
10283
10284 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10285 Offset1 |= ShaderType << 2;
10286
10287 unsigned Offset = Offset0 | (Offset1 << 8);
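// Resulting offset field layout: [7:2] ordered-count index, [8] wave_release,
// [9] wave_done, [11:10] shader type (pre-GFX11 only), [12] instruction
// (0 = add, 1 = swap), [15:14] dword count - 1 (GFX10 and later).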
10288
10289 SDValue Ops[] = {
10290 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10291 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10292 };
10293 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10294 M->getVTList(), Ops, M->getMemoryVT(),
10295 M->getMemOperand());
10296 }
10297 case Intrinsic::amdgcn_raw_buffer_load:
10298 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10299 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10300 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10301 case Intrinsic::amdgcn_raw_buffer_load_format:
10302 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10303 const bool IsFormat =
10304 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10305 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10306
10307 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10308 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10309 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10310 SDValue Ops[] = {
10311 Op.getOperand(0), // Chain
10312 Rsrc, // rsrc
10313 DAG.getConstant(0, DL, MVT::i32), // vindex
10314 VOffset, // voffset
10315 SOffset, // soffset
10316 Offset, // offset
10317 Op.getOperand(5), // cachepolicy, swizzled buffer
10318 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10319 };
10320
10321 auto *M = cast<MemSDNode>(Op);
10322 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10323 }
10324 case Intrinsic::amdgcn_struct_buffer_load:
10325 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10326 case Intrinsic::amdgcn_struct_buffer_load_format:
10327 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10328 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10329 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10330 const bool IsFormat =
10331 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10332 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10333
10334 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10335 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10336 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10337 SDValue Ops[] = {
10338 Op.getOperand(0), // Chain
10339 Rsrc, // rsrc
10340 Op.getOperand(3), // vindex
10341 VOffset, // voffset
10342 SOffset, // soffset
10343 Offset, // offset
10344 Op.getOperand(6), // cachepolicy, swizzled buffer
10345 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10346 };
10347
10348 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10349 }
10350 case Intrinsic::amdgcn_raw_tbuffer_load:
10351 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10352 MemSDNode *M = cast<MemSDNode>(Op);
10353 EVT LoadVT = Op.getValueType();
10354 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10355 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10356 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10357
10358 SDValue Ops[] = {
10359 Op.getOperand(0), // Chain
10360 Rsrc, // rsrc
10361 DAG.getConstant(0, DL, MVT::i32), // vindex
10362 VOffset, // voffset
10363 SOffset, // soffset
10364 Offset, // offset
10365 Op.getOperand(5), // format
10366 Op.getOperand(6), // cachepolicy, swizzled buffer
10367 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10368 };
10369
10370 if (LoadVT.getScalarType() == MVT::f16)
10371 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10372 Ops);
10373 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10374 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10375 DAG);
10376 }
10377 case Intrinsic::amdgcn_struct_tbuffer_load:
10378 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10379 MemSDNode *M = cast<MemSDNode>(Op);
10380 EVT LoadVT = Op.getValueType();
10381 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10382 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10383 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10384
10385 SDValue Ops[] = {
10386 Op.getOperand(0), // Chain
10387 Rsrc, // rsrc
10388 Op.getOperand(3), // vindex
10389 VOffset, // voffset
10390 SOffset, // soffset
10391 Offset, // offset
10392 Op.getOperand(6), // format
10393 Op.getOperand(7), // cachepolicy, swizzled buffer
10394 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10395 };
10396
10397 if (LoadVT.getScalarType() == MVT::f16)
10398 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10399 Ops);
10400 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10401 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10402 DAG);
10403 }
10404 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10406 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10407 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10408 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10409 return lowerStructBufferAtomicIntrin(Op, DAG,
10410 AMDGPUISD::BUFFER_ATOMIC_FADD);
10411 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10412 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10413 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10414 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10415 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10416 return lowerStructBufferAtomicIntrin(Op, DAG,
10417 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10418 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10419 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10420 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10421 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10422 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10423 return lowerStructBufferAtomicIntrin(Op, DAG,
10424 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10425 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10426 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10427 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10428 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10430 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10431 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10433 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10434 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10436 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10437 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10438 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10439 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10440 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10442 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10443 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10445 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10446 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10448 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10449 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10452 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10455 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10458 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10461 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10462 return lowerRawBufferAtomicIntrin(Op, DAG,
10463 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10464 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10465 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10466 return lowerStructBufferAtomicIntrin(Op, DAG,
10467 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10468 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10470 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10471 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10473 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10474 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10475 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10476 return lowerStructBufferAtomicIntrin(Op, DAG,
10477 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10478 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10480 return lowerStructBufferAtomicIntrin(Op, DAG,
10481 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10482 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10483 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10484 return lowerStructBufferAtomicIntrin(Op, DAG,
10485 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10486 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10487 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10488 return lowerStructBufferAtomicIntrin(Op, DAG,
10489 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10490 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10492 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10493 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10495 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10496 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10497 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10498 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10499 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10501 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10502 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10504 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10505 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10506 return lowerStructBufferAtomicIntrin(Op, DAG,
10507 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10508
10509 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10510 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10511 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10512 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10513 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10514 SDValue Ops[] = {
10515 Op.getOperand(0), // Chain
10516 Op.getOperand(2), // src
10517 Op.getOperand(3), // cmp
10518 Rsrc, // rsrc
10519 DAG.getConstant(0, DL, MVT::i32), // vindex
10520 VOffset, // voffset
10521 SOffset, // soffset
10522 Offset, // offset
10523 Op.getOperand(7), // cachepolicy
10524 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10525 };
10526 EVT VT = Op.getValueType();
10527 auto *M = cast<MemSDNode>(Op);
10528
10529 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10530 Op->getVTList(), Ops, VT,
10531 M->getMemOperand());
10532 }
10533 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10534 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10535 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10536 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10537 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10538 SDValue Ops[] = {
10539 Op.getOperand(0), // Chain
10540 Op.getOperand(2), // src
10541 Op.getOperand(3), // cmp
10542 Rsrc, // rsrc
10543 Op.getOperand(5), // vindex
10544 VOffset, // voffset
10545 SOffset, // soffset
10546 Offset, // offset
10547 Op.getOperand(8), // cachepolicy
10548 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10549 };
10550 EVT VT = Op.getValueType();
10551 auto *M = cast<MemSDNode>(Op);
10552
10553 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10554 Op->getVTList(), Ops, VT,
10555 M->getMemOperand());
10556 }
10557 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10558 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10559 MemSDNode *M = cast<MemSDNode>(Op);
10560 SDValue NodePtr = M->getOperand(2);
10561 SDValue RayExtent = M->getOperand(3);
10562 SDValue InstanceMask = M->getOperand(4);
10563 SDValue RayOrigin = M->getOperand(5);
10564 SDValue RayDir = M->getOperand(6);
10565 SDValue Offsets = M->getOperand(7);
10566 SDValue TDescr = M->getOperand(8);
10567
10568 assert(NodePtr.getValueType() == MVT::i64);
10569 assert(RayDir.getValueType() == MVT::v3f32);
10570
10571 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10572 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10573 return SDValue();
10574 }
10575
10576 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10577 const unsigned NumVDataDwords = 10;
10578 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10579 int Opcode = AMDGPU::getMIMGOpcode(
10580 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10581 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10582 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10583 assert(Opcode != -1);
10584
10586 Ops.push_back(NodePtr);
10587 Ops.push_back(DAG.getBuildVector(
10588 MVT::v2i32, DL,
10589 {DAG.getBitcast(MVT::i32, RayExtent),
10590 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10591 Ops.push_back(RayOrigin);
10592 Ops.push_back(RayDir);
10593 Ops.push_back(Offsets);
10594 Ops.push_back(TDescr);
10595 Ops.push_back(M->getChain());
10596
10597 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10598 MachineMemOperand *MemRef = M->getMemOperand();
10599 DAG.setNodeMemRefs(NewNode, {MemRef});
10600 return SDValue(NewNode, 0);
10601 }
10602 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10603 MemSDNode *M = cast<MemSDNode>(Op);
10604 SDValue NodePtr = M->getOperand(2);
10605 SDValue RayExtent = M->getOperand(3);
10606 SDValue RayOrigin = M->getOperand(4);
10607 SDValue RayDir = M->getOperand(5);
10608 SDValue RayInvDir = M->getOperand(6);
10609 SDValue TDescr = M->getOperand(7);
10610
10611 assert(NodePtr.getValueType() == MVT::i32 ||
10612 NodePtr.getValueType() == MVT::i64);
10613 assert(RayDir.getValueType() == MVT::v3f16 ||
10614 RayDir.getValueType() == MVT::v3f32);
10615
10616 if (!Subtarget->hasGFX10_AEncoding()) {
10617 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10618 return SDValue();
10619 }
10620
10621 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10622 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10623 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10624 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10625 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10626 const unsigned NumVDataDwords = 4;
10627 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10628 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10629 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10630 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10631 IsGFX12Plus;
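// Prefer the NSA (non-sequential address) encoding when the subtarget
// supports it and the address operands fit within its limit; GFX12+ always
// uses it.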
10632 const unsigned BaseOpcodes[2][2] = {
10633 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10634 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10635 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10636 int Opcode;
10637 if (UseNSA) {
10638 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10639 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10640 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10641 : AMDGPU::MIMGEncGfx10NSA,
10642 NumVDataDwords, NumVAddrDwords);
10643 } else {
10644 assert(!IsGFX12Plus);
10645 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10646 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10647 : AMDGPU::MIMGEncGfx10Default,
10648 NumVDataDwords, NumVAddrDwords);
10649 }
10650 assert(Opcode != -1);
10651
10653
10654 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10656 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10657 if (Lanes[0].getValueSizeInBits() == 32) {
10658 for (unsigned I = 0; I < 3; ++I)
10659 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10660 } else {
10661 if (IsAligned) {
10662 Ops.push_back(DAG.getBitcast(
10663 MVT::i32,
10664 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10665 Ops.push_back(Lanes[2]);
10666 } else {
10667 SDValue Elt0 = Ops.pop_back_val();
10668 Ops.push_back(DAG.getBitcast(
10669 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10670 Ops.push_back(DAG.getBitcast(
10671 MVT::i32,
10672 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10673 }
10674 }
10675 };
10676
10677 if (UseNSA && IsGFX11Plus) {
10678 Ops.push_back(NodePtr);
10679 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10680 Ops.push_back(RayOrigin);
10681 if (IsA16) {
10682 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10683 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10684 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10685 for (unsigned I = 0; I < 3; ++I) {
10686 MergedLanes.push_back(DAG.getBitcast(
10687 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10688 {DirLanes[I], InvDirLanes[I]})));
10689 }
10690 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10691 } else {
10692 Ops.push_back(RayDir);
10693 Ops.push_back(RayInvDir);
10694 }
10695 } else {
10696 if (Is64)
10697 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10698 2);
10699 else
10700 Ops.push_back(NodePtr);
10701
10702 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10703 packLanes(RayOrigin, true);
10704 packLanes(RayDir, true);
10705 packLanes(RayInvDir, false);
10706 }
10707
10708 if (!UseNSA) {
10709 // Build a single vector containing all the operands so far prepared.
10710 if (NumVAddrDwords > 12) {
10711 SDValue Undef = DAG.getPOISON(MVT::i32);
10712 Ops.append(16 - Ops.size(), Undef);
10713 }
10714 assert(Ops.size() >= 8 && Ops.size() <= 12);
10715 SDValue MergedOps =
10716 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10717 Ops.clear();
10718 Ops.push_back(MergedOps);
10719 }
10720
10721 Ops.push_back(TDescr);
10722 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10723 Ops.push_back(M->getChain());
10724
10725 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10726 MachineMemOperand *MemRef = M->getMemOperand();
10727 DAG.setNodeMemRefs(NewNode, {MemRef});
10728 return SDValue(NewNode, 0);
10729 }
10730 case Intrinsic::amdgcn_global_atomic_fmin_num:
10731 case Intrinsic::amdgcn_global_atomic_fmax_num:
10732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10733 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10734 MemSDNode *M = cast<MemSDNode>(Op);
10735 SDValue Ops[] = {
10736 M->getOperand(0), // Chain
10737 M->getOperand(2), // Ptr
10738 M->getOperand(3) // Value
10739 };
10740 unsigned Opcode = 0;
10741 switch (IntrID) {
10742 case Intrinsic::amdgcn_global_atomic_fmin_num:
10743 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10744 Opcode = ISD::ATOMIC_LOAD_FMIN;
10745 break;
10746 }
10747 case Intrinsic::amdgcn_global_atomic_fmax_num:
10748 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10749 Opcode = ISD::ATOMIC_LOAD_FMAX;
10750 break;
10751 }
10752 default:
10753 llvm_unreachable("unhandled atomic opcode");
10754 }
10755 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10756 Ops, M->getMemOperand());
10757 }
10758 case Intrinsic::amdgcn_s_get_barrier_state:
10759 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10760 SDValue Chain = Op->getOperand(0);
10762 unsigned Opc;
10763
10764 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10765 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10766 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10767 BarID = (BarID >> 4) & 0x3F;
10768 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10769 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10770 Ops.push_back(K);
10771 Ops.push_back(Chain);
10772 } else {
10773 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10774 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10775 SDValue M0Val;
10776 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10777 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10778 M0Val = SDValue(
10779 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10780 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10781 0);
10782 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10783 } else
10784 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10785 }
10786
10787 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10788 return SDValue(NewMI, 0);
10789 }
10790 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10791 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10792 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10793 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10794 SDValue Chain = Op->getOperand(0);
10795 SDValue Ptr = Op->getOperand(2);
10796 EVT VT = Op->getValueType(0);
10797 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10798 Chain, Ptr, MII->getMemOperand());
10799 }
10800 default:
10801
10802 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10803 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10804 return lowerImage(Op, ImageDimIntr, DAG, true);
10805
10806 return SDValue();
10807 }
10808}
10809
10810// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10811// dwordx4 if on SI and handle TFE loads.
10812SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10813 SDVTList VTList,
10814 ArrayRef<SDValue> Ops, EVT MemVT,
10815 MachineMemOperand *MMO,
10816 SelectionDAG &DAG) const {
10817 LLVMContext &C = *DAG.getContext();
10818 MachineFunction &MF = DAG.getMachineFunction();
10819 EVT VT = VTList.VTs[0];
10820
10821 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10822 bool IsTFE = VTList.NumVTs == 3;
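// A TFE load returns an extra status dword, so widen the result to the value
// dwords plus one, emit the widened load, then split the value and the status
// back apart.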
10823 if (IsTFE) {
10824 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10825 unsigned NumOpDWords = NumValueDWords + 1;
10826 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10827 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10828 MachineMemOperand *OpDWordsMMO =
10829 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10830 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10831 OpDWordsVT, OpDWordsMMO, DAG);
10832 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10833 DAG.getVectorIdxConstant(NumValueDWords, DL));
10834 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10835 SDValue ValueDWords =
10836 NumValueDWords == 1
10837 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10838 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10839 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10840 ZeroIdx);
10841 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10842 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10843 }
10844
10845 if (!Subtarget->hasDwordx3LoadStores() &&
10846 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10847 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10848 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10849 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10850 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10851 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10852 WidenedMemVT, WidenedMMO);
10853 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10854 DAG.getVectorIdxConstant(0, DL));
10855 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10856 }
10857
10858 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10859}
10860
10861SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10862 bool ImageStore) const {
10863 EVT StoreVT = VData.getValueType();
10864
10865 // No change for f16 and legal vector D16 types.
10866 if (!StoreVT.isVector())
10867 return VData;
10868
10869 SDLoc DL(VData);
10870 unsigned NumElements = StoreVT.getVectorNumElements();
10871
10872 if (Subtarget->hasUnpackedD16VMem()) {
10873 // We need to unpack the packed data to store.
10874 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10875 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10876
10877 EVT EquivStoreVT =
10878 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10879 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10880 return DAG.UnrollVectorOp(ZExt.getNode());
10881 }
10882
10883 // The sq block of gfx8.1 does not estimate register use correctly for d16
10884 // image store instructions. The data operand is computed as if it were not a
10885 // d16 image instruction.
10886 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10887 // Bitcast to i16
10888 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10889 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10890
10891 // Decompose into scalars
10893 DAG.ExtractVectorElements(IntVData, Elts);
10894
10895 // Group pairs of i16 into v2i16 and bitcast to i32
10896 SmallVector<SDValue, 4> PackedElts;
10897 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10898 SDValue Pair =
10899 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10900 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10901 PackedElts.push_back(IntPair);
10902 }
10903 if ((NumElements % 2) == 1) {
10904 // Handle v3i16
10905 unsigned I = Elts.size() / 2;
10906 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10907 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10908 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10909 PackedElts.push_back(IntPair);
10910 }
10911
10912 // Pad using UNDEF
10913 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10914
10915 // Build final vector
10916 EVT VecVT =
10917 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10918 return DAG.getBuildVector(VecVT, DL, PackedElts);
10919 }
10920
10921 if (NumElements == 3) {
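// Widen v3 data to v4: bitcast to the equivalent integer type, zero-extend
// to the widened integer width, then bitcast to the 4-element vector type.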
10922 EVT IntStoreVT =
10923 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10924 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10925
10926 EVT WidenedStoreVT = EVT::getVectorVT(
10927 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10928 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10929 WidenedStoreVT.getStoreSizeInBits());
10930 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10931 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10932 }
10933
10934 assert(isTypeLegal(StoreVT));
10935 return VData;
10936}
10937
10938SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10939 SelectionDAG &DAG) const {
10940 SDLoc DL(Op);
10941 SDValue Chain = Op.getOperand(0);
10942 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10943 MachineFunction &MF = DAG.getMachineFunction();
10944
10945 switch (IntrinsicID) {
10946 case Intrinsic::amdgcn_exp_compr: {
10947 if (!Subtarget->hasCompressedExport()) {
10948 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10949 DAG.getMachineFunction().getFunction(),
10950 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10951 }
10952 SDValue Src0 = Op.getOperand(4);
10953 SDValue Src1 = Op.getOperand(5);
10954 // Hack around illegal type on SI by directly selecting it.
10955 if (isTypeLegal(Src0.getValueType()))
10956 return SDValue();
10957
10958 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10959 SDValue Undef = DAG.getPOISON(MVT::f32);
10960 const SDValue Ops[] = {
10961 Op.getOperand(2), // tgt
10962 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10963 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10964 Undef, // src2
10965 Undef, // src3
10966 Op.getOperand(7), // vm
10967 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10968 Op.getOperand(3), // en
10969 Op.getOperand(0) // Chain
10970 };
10971
10972 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10973 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10974 }
10975
10976 case Intrinsic::amdgcn_struct_tbuffer_store:
10977 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10978 SDValue VData = Op.getOperand(2);
10979 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10980 if (IsD16)
10981 VData = handleD16VData(VData, DAG);
10982 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10983 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10984 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10985 SDValue Ops[] = {
10986 Chain,
10987 VData, // vdata
10988 Rsrc, // rsrc
10989 Op.getOperand(4), // vindex
10990 VOffset, // voffset
10991 SOffset, // soffset
10992 Offset, // offset
10993 Op.getOperand(7), // format
10994 Op.getOperand(8), // cachepolicy, swizzled buffer
10995 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10996 };
10997 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10998 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10999 MemSDNode *M = cast<MemSDNode>(Op);
11000 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11001 M->getMemoryVT(), M->getMemOperand());
11002 }
11003
11004 case Intrinsic::amdgcn_raw_tbuffer_store:
11005 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11006 SDValue VData = Op.getOperand(2);
11007 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11008 if (IsD16)
11009 VData = handleD16VData(VData, DAG);
11010 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11011 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11012 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11013 SDValue Ops[] = {
11014 Chain,
11015 VData, // vdata
11016 Rsrc, // rsrc
11017 DAG.getConstant(0, DL, MVT::i32), // vindex
11018 VOffset, // voffset
11019 SOffset, // soffset
11020 Offset, // offset
11021 Op.getOperand(6), // format
11022 Op.getOperand(7), // cachepolicy, swizzled buffer
11023 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11024 };
11025 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11026 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11027 MemSDNode *M = cast<MemSDNode>(Op);
11028 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11029 M->getMemoryVT(), M->getMemOperand());
11030 }
11031
11032 case Intrinsic::amdgcn_raw_buffer_store:
11033 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11034 case Intrinsic::amdgcn_raw_buffer_store_format:
11035 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11036 const bool IsFormat =
11037 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11038 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11039
11040 SDValue VData = Op.getOperand(2);
11041 EVT VDataVT = VData.getValueType();
11042 EVT EltType = VDataVT.getScalarType();
11043 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11044 if (IsD16) {
11045 VData = handleD16VData(VData, DAG);
11046 VDataVT = VData.getValueType();
11047 }
11048
11049 if (!isTypeLegal(VDataVT)) {
11050 VData =
11051 DAG.getNode(ISD::BITCAST, DL,
11052 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11053 }
11054
11055 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11056 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11057 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11058 SDValue Ops[] = {
11059 Chain,
11060 VData,
11061 Rsrc,
11062 DAG.getConstant(0, DL, MVT::i32), // vindex
11063 VOffset, // voffset
11064 SOffset, // soffset
11065 Offset, // offset
11066 Op.getOperand(6), // cachepolicy, swizzled buffer
11067 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11068 };
11069 unsigned Opc =
11070 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11071 : AMDGPUISD::BUFFER_STORE;
11072 MemSDNode *M = cast<MemSDNode>(Op);
11073
11074 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11075 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11076 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11077
11078 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11079 M->getMemoryVT(), M->getMemOperand());
11080 }
11081
11082 case Intrinsic::amdgcn_struct_buffer_store:
11083 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11084 case Intrinsic::amdgcn_struct_buffer_store_format:
11085 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11086 const bool IsFormat =
11087 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11088 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11089
11090 SDValue VData = Op.getOperand(2);
11091 EVT VDataVT = VData.getValueType();
11092 EVT EltType = VDataVT.getScalarType();
11093 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11094
11095 if (IsD16) {
11096 VData = handleD16VData(VData, DAG);
11097 VDataVT = VData.getValueType();
11098 }
11099
11100 if (!isTypeLegal(VDataVT)) {
11101 VData =
11102 DAG.getNode(ISD::BITCAST, DL,
11103 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11104 }
11105
11106 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11107 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11108 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11109 SDValue Ops[] = {
11110 Chain,
11111 VData,
11112 Rsrc,
11113 Op.getOperand(4), // vindex
11114 VOffset, // voffset
11115 SOffset, // soffset
11116 Offset, // offset
11117 Op.getOperand(7), // cachepolicy, swizzled buffer
11118 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11119 };
11120 unsigned Opc =
11121 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11122 : AMDGPUISD::BUFFER_STORE;
11123 MemSDNode *M = cast<MemSDNode>(Op);
11124
11125 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11126 EVT VDataType = VData.getValueType().getScalarType();
11127 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11128 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11129
11130 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11131 M->getMemoryVT(), M->getMemOperand());
11132 }
11133 case Intrinsic::amdgcn_raw_buffer_load_lds:
11134 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11135 case Intrinsic::amdgcn_struct_buffer_load_lds:
11136 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11137 if (!Subtarget->hasVMemToLDSLoad())
11138 return SDValue();
11139 unsigned Opc;
11140 bool HasVIndex =
11141 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11142 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11143 unsigned OpOffset = HasVIndex ? 1 : 0;
11144 SDValue VOffset = Op.getOperand(5 + OpOffset);
11145 bool HasVOffset = !isNullConstant(VOffset);
11146 unsigned Size = Op->getConstantOperandVal(4);
11147
11148 switch (Size) {
11149 default:
11150 return SDValue();
11151 case 1:
11152 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11153 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11154 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11155 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11156 break;
11157 case 2:
11158 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11159 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11160 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11161 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11162 break;
11163 case 4:
11164 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11165 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11166 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11167 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11168 break;
11169 case 12:
11170 if (!Subtarget->hasLDSLoadB96_B128())
11171 return SDValue();
11172 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11173 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11174 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11175 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11176 break;
11177 case 16:
11178 if (!Subtarget->hasLDSLoadB96_B128())
11179 return SDValue();
11180 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11181 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11182 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11183 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11184 break;
11185 }
11186
11187 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11188
11190
11191 if (HasVIndex && HasVOffset)
11192 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11193 {Op.getOperand(5), // VIndex
11194 VOffset}));
11195 else if (HasVIndex)
11196 Ops.push_back(Op.getOperand(5));
11197 else if (HasVOffset)
11198 Ops.push_back(VOffset);
11199
11200 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11201 Ops.push_back(Rsrc);
11202 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11203 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11204 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11205 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11206 Ops.push_back(DAG.getTargetConstant(
11207 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11208 DL, MVT::i8)); // cpol
11209 Ops.push_back(DAG.getTargetConstant(
11210 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11211 ? 1
11212 : 0,
11213 DL, MVT::i8)); // swz
11214 Ops.push_back(M0Val.getValue(0)); // Chain
11215 Ops.push_back(M0Val.getValue(1)); // Glue
11216
11217 auto *M = cast<MemSDNode>(Op);
11218 MachineMemOperand *LoadMMO = M->getMemOperand();
11219 // Don't set the offset value here because the pointer points to the base of
11220 // the buffer.
11221 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11222
11223 MachinePointerInfo StorePtrI = LoadPtrI;
11224 LoadPtrI.V = PoisonValue::get(
11228
11229 auto F = LoadMMO->getFlags() &
11231 LoadMMO =
11233 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11234
11235 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11236 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11237 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11238
11239 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11240 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11241
11242 return SDValue(Load, 0);
11243 }
11244 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11245 // for "trust me" that the remaining cases are global pointers until
11246 // such time as we can put two mem operands on an intrinsic.
11247 case Intrinsic::amdgcn_load_to_lds:
11248 case Intrinsic::amdgcn_global_load_lds: {
11249 if (!Subtarget->hasVMemToLDSLoad())
11250 return SDValue();
11251
11252 unsigned Opc;
11253 unsigned Size = Op->getConstantOperandVal(4);
11254 switch (Size) {
11255 default:
11256 return SDValue();
11257 case 1:
11258 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11259 break;
11260 case 2:
11261 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11262 break;
11263 case 4:
11264 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11265 break;
11266 case 12:
11267 if (!Subtarget->hasLDSLoadB96_B128())
11268 return SDValue();
11269 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11270 break;
11271 case 16:
11272 if (!Subtarget->hasLDSLoadB96_B128())
11273 return SDValue();
11274 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11275 break;
11276 }
11277
11278 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11279
11281
11282 SDValue Addr = Op.getOperand(2); // Global ptr
11283 SDValue VOffset;
11284 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11285 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11286 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11287 SDValue LHS = Addr.getOperand(0);
11288 SDValue RHS = Addr.getOperand(1);
11289
11290 if (LHS->isDivergent())
11291 std::swap(LHS, RHS);
11292
11293 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11294 RHS.getOperand(0).getValueType() == MVT::i32) {
11295 // add (i64 sgpr), (zero_extend (i32 vgpr))
11296 Addr = LHS;
11297 VOffset = RHS.getOperand(0);
11298 }
11299 }
11300
11301 Ops.push_back(Addr);
11302 if (!Addr->isDivergent()) {
11303 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11304 if (!VOffset)
11305 VOffset =
11306 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11307 DAG.getTargetConstant(0, DL, MVT::i32)),
11308 0);
11309 Ops.push_back(VOffset);
11310 }
11311
11312 Ops.push_back(Op.getOperand(5)); // Offset
11313 Ops.push_back(Op.getOperand(6)); // CPol
11314 Ops.push_back(M0Val.getValue(0)); // Chain
11315 Ops.push_back(M0Val.getValue(1)); // Glue
11316
11317 auto *M = cast<MemSDNode>(Op);
11318 MachineMemOperand *LoadMMO = M->getMemOperand();
11319 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11320 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11321 MachinePointerInfo StorePtrI = LoadPtrI;
11322 LoadPtrI.V = PoisonValue::get(
11323 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11324 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11325 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11326 auto F = LoadMMO->getFlags() &
11327 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11328 LoadMMO =
11329 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11330 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11331 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11332 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11333 LoadMMO->getAAInfo());
11334
11335 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11336 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11337
11338 return SDValue(Load, 0);
11339 }
11340 case Intrinsic::amdgcn_end_cf:
11341 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11342 Op->getOperand(2), Chain),
11343 0);
11344 case Intrinsic::amdgcn_s_barrier_init:
11345 case Intrinsic::amdgcn_s_barrier_signal_var: {
11346 // these two intrinsics have two operands: barrier pointer and member count
11347 SDValue Chain = Op->getOperand(0);
11348 SmallVector<SDValue, 2> Ops;
11349 SDValue BarOp = Op->getOperand(2);
11350 SDValue CntOp = Op->getOperand(3);
11351 SDValue M0Val;
11352 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11353 ? AMDGPU::S_BARRIER_INIT_M0
11354 : AMDGPU::S_BARRIER_SIGNAL_M0;
11355 // extract the BarrierID from bits 4-9 of BarOp
11356 SDValue BarID;
11357 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11358 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11359 BarID =
11360 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11361 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11362 0);
11363 // Member count should be put into M0[ShAmt:+6]
11364 // Barrier ID should be put into M0[5:0]
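// A sketch of the M0 value assembled below (assuming ShAmt = 16): bits [5:0]
// hold the barrier ID extracted from BarOp[9:4], and the bits starting at
// ShAmt hold the member count.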
11365 M0Val =
11366 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11367 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11368 0);
11369 constexpr unsigned ShAmt = 16;
11370 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11371 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11372
11373 M0Val = SDValue(
11374 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11375
11376 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11377
11378 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11379 return SDValue(NewMI, 0);
11380 }
11381 case Intrinsic::amdgcn_s_barrier_join: {
11382 // This intrinsic has one operand: the barrier pointer.
11383 SDValue Chain = Op->getOperand(0);
11384 SmallVector<SDValue, 2> Ops;
11385 SDValue BarOp = Op->getOperand(2);
11386 unsigned Opc;
11387
11388 if (isa<ConstantSDNode>(BarOp)) {
11389 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11390 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11391
11392 // extract the BarrierID from bits 4-9 of the immediate
11393 unsigned BarID = (BarVal >> 4) & 0x3F;
11394 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11395 Ops.push_back(K);
11396 Ops.push_back(Chain);
11397 } else {
11398 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11399
11400 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11401 SDValue M0Val;
11402 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11403 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11404 M0Val =
11405 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11406 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11407 0);
11408 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11409 }
11410
11411 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11412 return SDValue(NewMI, 0);
11413 }
11414 case Intrinsic::amdgcn_s_prefetch_data: {
11415 // For non-global address spaces, preserve the chain and remove the call.
11416 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11417 return Op.getOperand(0);
11418 return Op;
11419 }
11420 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11421 SDValue Ops[] = {
11422 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11423 Op.getOperand(3), // offset
11424 Op.getOperand(4), // length
11425 };
11426
11427 MemSDNode *M = cast<MemSDNode>(Op);
11428 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11429 Op->getVTList(), Ops, M->getMemoryVT(),
11430 M->getMemOperand());
11431 }
11432 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11433 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11434 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11435 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11436 SDValue Chain = Op->getOperand(0);
11437 SDValue Ptr = Op->getOperand(2);
11438 SDValue Val = Op->getOperand(3);
11439 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11440 Ptr, MII->getMemOperand());
11441 }
11442 default: {
11443 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11444 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11445 return lowerImage(Op, ImageDimIntr, DAG, true);
11446
11447 return Op;
11448 }
11449 }
11450}
11451
11452// Return whether the operation has NoUnsignedWrap property.
11453static bool isNoUnsignedWrap(SDValue Addr) {
11454 return (Addr.getOpcode() == ISD::ADD &&
11455 Addr->getFlags().hasNoUnsignedWrap()) ||
11456 Addr->getOpcode() == ISD::OR;
11457}
11458
11460 EVT PtrVT) const {
11461 return PtrVT == MVT::i64;
11462}
11463
11465 EVT PtrVT) const {
11466 return true;
11467}
11468
11469// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11470// offset (the offset that is included in bounds checking and swizzling, to be
11471// split between the instruction's voffset and immoffset fields) and soffset
11472// (the offset that is excluded from bounds checking and swizzling, to go in
11473// the instruction's soffset field). This function takes the first kind of
11474// offset and figures out how to split it between voffset and immoffset.
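// Illustrative example (assuming a maximum immediate offset of 4095): a
// combined offset of 4100 would be returned as a voffset addend of 4096 and
// an immoffset of 4.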
11475std::pair<SDValue, SDValue>
11476SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11477 SDLoc DL(Offset);
11478 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11479 SDValue N0 = Offset;
11480 ConstantSDNode *C1 = nullptr;
11481
11482 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11483 N0 = SDValue();
11484 else if (DAG.isBaseWithConstantOffset(N0)) {
11485 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11486 // being added, so we can only safely match a 32-bit addition with no
11487 // unsigned overflow.
11488 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11489 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11490 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11491 N0 = N0.getOperand(0);
11492 }
11493 }
11494
11495 if (C1) {
11496 unsigned ImmOffset = C1->getZExtValue();
11497 // If the immediate value is too big for the immoffset field, put only bits
11498 // that would normally fit in the immoffset field. The remaining value that
11499 // is copied/added for the voffset field is a large power of 2, and it
11500 // stands more chance of being CSEd with the copy/add for another similar
11501 // load/store.
11502 // However, do not do that rounding down if that is a negative
11503 // number, as it appears to be illegal to have a negative offset in the
11504 // vgpr, even if adding the immediate offset makes it positive.
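// Worked example of the negative case (again assuming MaxImm == 4095): for
// ImmOffset = 0xFFFFF004, Overflow is initially 0xFFFFF000, which is negative
// as an int32_t, so the whole value is moved to the voffset add and the
// immoffset field is left as 0.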
11505 unsigned Overflow = ImmOffset & ~MaxImm;
11506 ImmOffset -= Overflow;
11507 if ((int32_t)Overflow < 0) {
11508 Overflow += ImmOffset;
11509 ImmOffset = 0;
11510 }
11511 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11512 if (Overflow) {
11513 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11514 if (!N0)
11515 N0 = OverflowVal;
11516 else {
11517 SDValue Ops[] = {N0, OverflowVal};
11518 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11519 }
11520 }
11521 }
11522 if (!N0)
11523 N0 = DAG.getConstant(0, DL, MVT::i32);
11524 if (!C1)
11525 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11526 return {N0, SDValue(C1, 0)};
11527}
11528
11529// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11530// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11531// pointed to by Offsets.
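// For illustration: a small constant combined offset such as 40 typically
// splits as {voffset = 0, soffset = 0, instoffset = 40}; offsets that cannot
// be split safely fall through to the catch-all at the end of the function,
// which places the whole value in voffset.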
11532void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11533 SelectionDAG &DAG, SDValue *Offsets,
11534 Align Alignment) const {
11535 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11536 SDLoc DL(CombinedOffset);
11537 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11538 uint32_t Imm = C->getZExtValue();
11539 uint32_t SOffset, ImmOffset;
11540 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11541 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11542 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11543 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11544 return;
11545 }
11546 }
11547 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11548 SDValue N0 = CombinedOffset.getOperand(0);
11549 SDValue N1 = CombinedOffset.getOperand(1);
11550 uint32_t SOffset, ImmOffset;
11551 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11552 if (Offset >= 0 &&
11553 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11554 Offsets[0] = N0;
11555 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11556 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11557 return;
11558 }
11559 }
11560
11561 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11562 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11563 : DAG.getConstant(0, DL, MVT::i32);
11564
11565 Offsets[0] = CombinedOffset;
11566 Offsets[1] = SOffsetZero;
11567 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11568}
11569
11570SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11571 SelectionDAG &DAG) const {
11572 if (!MaybePointer.getValueType().isScalarInteger())
11573 return MaybePointer;
11574
11575 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11576 return Rsrc;
11577}
11578
11579// Wrap a global or flat pointer into a buffer intrinsic using the flags
11580// specified in the intrinsic.
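// Rough sketch of the descriptor built by the legacy (non-45-bit num_records)
// path below: word0 = low 32 bits of the pointer, word1 = (low 16 bits of the
// pointer's high half) | (stride << 16), word2 = num_records, word3 = flags.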
11581SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11582 SelectionDAG &DAG) const {
11583 SDLoc Loc(Op);
11584
11585 SDValue Pointer = Op->getOperand(1);
11586 SDValue Stride = Op->getOperand(2);
11587 SDValue NumRecords = Op->getOperand(3);
11588 SDValue Flags = Op->getOperand(4);
11589
11590 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11591 SDValue Rsrc;
11592
11593 if (Subtarget->has45BitNumRecordsBufferResource()) {
11594 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11595 // Build the low 64-bit value, which holds the 57-bit base address and the
11596 // low 7 bits of num_records.
11597 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11598 SDValue NumRecordsLHS =
11599 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11600 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11601 SDValue LowHalf =
11602 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11603
11604 // Build the high 64-bit value, which holds the upper 38 bits of num_records,
11605 // 6 zero bits (omitted), the 16-bit stride and scale, and the 4-bit flags.
11606 SDValue NumRecordsRHS =
11607 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11608 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11609 SDValue ShiftedStride =
11610 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11611 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11612 SDValue ExtShiftedStrideVec =
11613 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11614 SDValue ExtShiftedStride =
11615 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11616 SDValue ShiftedFlags =
11617 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11618 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11619 SDValue ExtShiftedFlagsVec =
11620 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11621 SDValue ExtShiftedFlags =
11622 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11623 SDValue CombinedFields =
11624 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11625 SDValue HighHalf =
11626 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11627
11628 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11629 } else {
11630 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11631 auto [LowHalf, HighHalf] =
11632 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11633 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11634 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11635 SDValue ShiftedStride =
11636 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11637 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11638 SDValue NewHighHalf =
11639 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11640
11641 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11642 NumRecords, Flags);
11643 }
11644
11645 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11646 return RsrcPtr;
11647}
11648
11649// Handle 8 bit and 16 bit buffer loads
11650SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11651 EVT LoadVT, SDLoc DL,
11652 ArrayRef<SDValue> Ops,
11653 MachineMemOperand *MMO,
11654 bool IsTFE) const {
11655 EVT IntVT = LoadVT.changeTypeToInteger();
11656
11657 if (IsTFE) {
11658 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11659 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11660 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11661 MachineFunction &MF = DAG.getMachineFunction();
11662 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11663 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11664 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11665 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11666 DAG.getConstant(1, DL, MVT::i32));
11667 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11668 DAG.getConstant(0, DL, MVT::i32));
11669 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11670 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11671 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11672 }
11673
11674 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11675 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11676 : AMDGPUISD::BUFFER_LOAD_USHORT;
11677
11678 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11679 SDValue BufferLoad =
11680 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11681 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11682 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11683
11684 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11685}
11686
11687// Handle 8 bit and 16 bit buffer stores
11688SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11689 EVT VDataType, SDLoc DL,
11690 SDValue Ops[],
11691 MemSDNode *M) const {
11692 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11693 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11694
11695 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11696 Ops[1] = BufferStoreExt;
11697 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11698 : AMDGPUISD::BUFFER_STORE_SHORT;
11699 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11700 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11701 M->getMemOperand());
11702}
11703
11704 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11705 SDValue Op, const SDLoc &SL, EVT VT) {
11706 if (VT.bitsLT(Op.getValueType()))
11707 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11708
11709 switch (ExtType) {
11710 case ISD::SEXTLOAD:
11711 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11712 case ISD::ZEXTLOAD:
11713 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11714 case ISD::EXTLOAD:
11715 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11716 case ISD::NON_EXTLOAD:
11717 return Op;
11718 }
11719
11720 llvm_unreachable("invalid ext type");
11721}
11722
11723// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11724// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
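// For example (a sketch, not an extra code path): a uniform, 4-byte-aligned
// i16 sextload from the constant address space is rebuilt below as an i32
// load followed by SIGN_EXTEND_INREG of the low 16 bits, which keeps it
// selectable as an SMEM load.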
11725SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11726 DAGCombinerInfo &DCI) const {
11727 SelectionDAG &DAG = DCI.DAG;
11728 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11729 return SDValue();
11730
11731 // FIXME: Constant loads should all be marked invariant.
11732 unsigned AS = Ld->getAddressSpace();
11733 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11734 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11735 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11736 return SDValue();
11737
11738 // Don't do this early, since it may interfere with adjacent load merging for
11739 // illegal types. We can avoid losing alignment information for exotic types
11740 // pre-legalize.
11741 EVT MemVT = Ld->getMemoryVT();
11742 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11743 MemVT.getSizeInBits() >= 32)
11744 return SDValue();
11745
11746 SDLoc SL(Ld);
11747
11748 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11749 "unexpected vector extload");
11750
11751 // TODO: Drop only high part of range.
11752 SDValue Ptr = Ld->getBasePtr();
11753 SDValue NewLoad = DAG.getLoad(
11754 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11755 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11756 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11757 nullptr); // Drop ranges
11758
11759 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11760 if (MemVT.isFloatingPoint()) {
11762 "unexpected fp extload");
11763 TruncVT = MemVT.changeTypeToInteger();
11764 }
11765
11766 SDValue Cvt = NewLoad;
11767 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11768 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11769 DAG.getValueType(TruncVT));
11770 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11771 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11772 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11773 } else {
11774 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11775 }
11776
11777 EVT VT = Ld->getValueType(0);
11778 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11779
11780 DCI.AddToWorklist(Cvt.getNode());
11781
11782 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11783 // the appropriate extension from the 32-bit load.
11784 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11785 DCI.AddToWorklist(Cvt.getNode());
11786
11787 // Handle conversion back to floating point if necessary.
11788 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11789
11790 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11791}
11792
11793 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11794 const SIMachineFunctionInfo &Info) {
11795 // TODO: Should check if the address can definitely not access stack.
11796 if (Info.isEntryFunction())
11797 return Info.getUserSGPRInfo().hasFlatScratchInit();
11798 return true;
11799}
11800
11801SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11802 SDLoc DL(Op);
11803 LoadSDNode *Load = cast<LoadSDNode>(Op);
11804 ISD::LoadExtType ExtType = Load->getExtensionType();
11805 EVT MemVT = Load->getMemoryVT();
11806 MachineMemOperand *MMO = Load->getMemOperand();
11807
11808 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11809 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11810 return SDValue();
11811
11812 // FIXME: Copied from PPC
11813 // First, load into 32 bits, then truncate to 1 bit.
11814
11815 SDValue Chain = Load->getChain();
11816 SDValue BasePtr = Load->getBasePtr();
11817
11818 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11819
11820 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11821 RealMemVT, MMO);
11822
11823 if (!MemVT.isVector()) {
11824 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11825 NewLD.getValue(1)};
11826
11827 return DAG.getMergeValues(Ops, DL);
11828 }
11829
11830 SmallVector<SDValue, 3> Elts;
11831 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11832 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11833 DAG.getConstant(I, DL, MVT::i32));
11834
11835 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11836 }
11837
11838 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11839
11840 return DAG.getMergeValues(Ops, DL);
11841 }
11842
11843 if (!MemVT.isVector())
11844 return SDValue();
11845
11846 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11847 "Custom lowering for non-i32 vectors hasn't been implemented.");
11848
11849 Align Alignment = Load->getAlign();
11850 unsigned AS = Load->getAddressSpace();
11851 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11852 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11853 return SplitVectorLoad(Op, DAG);
11854 }
11855
11856 MachineFunction &MF = DAG.getMachineFunction();
11857 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11858 // If there is a possibility that a flat instruction accesses scratch memory,
11859 // then we need to use the same legalization rules we use for private.
11860 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11861 !Subtarget->hasMultiDwordFlatScratchAddressing())
11862 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11863 ? AMDGPUAS::PRIVATE_ADDRESS
11864 : AMDGPUAS::GLOBAL_ADDRESS;
11865
11866 unsigned NumElements = MemVT.getVectorNumElements();
11867
11868 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11869 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11870 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11871 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11872 isMemOpHasNoClobberedMemOperand(Load))) {
11873 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11874 Alignment >= Align(4) && NumElements < 32) {
11875 if (MemVT.isPow2VectorType() ||
11876 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11877 return SDValue();
11878 return WidenOrSplitVectorLoad(Op, DAG);
11879 }
11880 // Non-uniform loads will be selected to MUBUF instructions, so they
11881 // have the same legalization requirements as global and private
11882 // loads.
11883 //
11884 }
11885 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11886 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11887 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11888 if (NumElements > 4)
11889 return SplitVectorLoad(Op, DAG);
11890 // v3 loads not supported on SI.
11891 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11892 return WidenOrSplitVectorLoad(Op, DAG);
11893
11894 // v3 and v4 loads are supported for private and global memory.
11895 return SDValue();
11896 }
11897 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11898 // Depending on the setting of the private_element_size field in the
11899 // resource descriptor, we can only make private accesses up to a certain
11900 // size.
11901 switch (Subtarget->getMaxPrivateElementSize()) {
11902 case 4: {
11903 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11904 return DAG.getMergeValues({Op0, Op1}, DL);
11905 }
11906 case 8:
11907 if (NumElements > 2)
11908 return SplitVectorLoad(Op, DAG);
11909 return SDValue();
11910 case 16:
11911 // Same as global/flat
11912 if (NumElements > 4)
11913 return SplitVectorLoad(Op, DAG);
11914 // v3 loads not supported on SI.
11915 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11916 return WidenOrSplitVectorLoad(Op, DAG);
11917
11918 return SDValue();
11919 default:
11920 llvm_unreachable("unsupported private_element_size");
11921 }
11922 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11923 unsigned Fast = 0;
11924 auto Flags = Load->getMemOperand()->getFlags();
11925 if (allowsMisalignedMemoryAccesses(MemVT, AS,
11926 Load->getAlign(), Flags, &Fast) &&
11927 Fast > 1)
11928 return SDValue();
11929
11930 if (MemVT.isVector())
11931 return SplitVectorLoad(Op, DAG);
11932 }
11933
11934 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11935 MemVT, *Load->getMemOperand())) {
11936 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11937 return DAG.getMergeValues({Op0, Op1}, DL);
11938 }
11939
11940 return SDValue();
11941}
11942
11943SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11944 EVT VT = Op.getValueType();
11945 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11946 VT.getSizeInBits() == 512)
11947 return splitTernaryVectorOp(Op, DAG);
11948
11949 assert(VT.getSizeInBits() == 64);
11950
11951 SDLoc DL(Op);
11952 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11953
11954 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11955 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11956
11957 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11958 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11959
11960 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11961 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11962
11963 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11964
11965 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11966 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11967
11968 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11969
11970 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11971 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11972}
11973
11974// Catch division cases where we can use shortcuts with rcp and rsq
11975// instructions.
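// Sketch of the shortcuts taken below when the required fast-math flags are
// present: 1.0 / x -> rcp(x), -1.0 / x -> rcp(fneg x), and otherwise
// x / y -> x * rcp(y).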
11976SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11977 SelectionDAG &DAG) const {
11978 SDLoc SL(Op);
11979 SDValue LHS = Op.getOperand(0);
11980 SDValue RHS = Op.getOperand(1);
11981 EVT VT = Op.getValueType();
11982 const SDNodeFlags Flags = Op->getFlags();
11983
11984 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11985
11986 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11987 // Without !fpmath accuracy information, we can't do more because we don't
11988 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11989 // f16 is always accurate enough
11990 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11991 return SDValue();
11992
11993 if (CLHS->isExactlyValue(1.0)) {
11994 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11995 // the CI documentation they have a worst case error of 1 ulp.
11996 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11997 // use it as long as we aren't trying to use denormals.
11998 //
11999 // v_rcp_f16 and v_rsq_f16 DO support denormals, with 0.51 ulp accuracy.
12000
12001 // 1.0 / sqrt(x) -> rsq(x)
12002
12003 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12004 // error seems really high at 2^29 ULP.
12005 // 1.0 / x -> rcp(x)
12006 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12007 }
12008
12009 // Same as for 1.0, but expand the sign out of the constant.
12010 if (CLHS->isExactlyValue(-1.0)) {
12011 // -1.0 / x -> rcp (fneg x)
12012 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12013 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12014 }
12015 }
12016
12017 // For f16 and bf16 require afn or arcp.
12018 // For f32 require afn.
12019 if (!AllowInaccurateRcp &&
12020 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12021 return SDValue();
12022
12023 // Turn into multiply by the reciprocal.
12024 // x / y -> x * (1.0 / y)
12025 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12026 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12027}
12028
12029SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12030 SelectionDAG &DAG) const {
12031 SDLoc SL(Op);
12032 SDValue X = Op.getOperand(0);
12033 SDValue Y = Op.getOperand(1);
12034 EVT VT = Op.getValueType();
12035 const SDNodeFlags Flags = Op->getFlags();
12036
12037 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12038 if (!AllowInaccurateDiv)
12039 return SDValue();
12040
12041 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12042 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12043
12044 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12045 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12046
12047 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12048 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12049 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12050 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12051 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12052 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12053}
12054
12055static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12056 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12057 SDNodeFlags Flags) {
12058 if (GlueChain->getNumValues() <= 1) {
12059 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12060 }
12061
12062 assert(GlueChain->getNumValues() == 3);
12063
12064 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12065 switch (Opcode) {
12066 default:
12067 llvm_unreachable("no chain equivalent for opcode");
12068 case ISD::FMUL:
12069 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12070 break;
12071 }
12072
12073 return DAG.getNode(Opcode, SL, VTList,
12074 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12075 Flags);
12076}
12077
12078static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12079 EVT VT, SDValue A, SDValue B, SDValue C,
12080 SDValue GlueChain, SDNodeFlags Flags) {
12081 if (GlueChain->getNumValues() <= 1) {
12082 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12083 }
12084
12085 assert(GlueChain->getNumValues() == 3);
12086
12087 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12088 switch (Opcode) {
12089 default:
12090 llvm_unreachable("no chain equivalent for opcode");
12091 case ISD::FMA:
12092 Opcode = AMDGPUISD::FMA_W_CHAIN;
12093 break;
12094 }
12095
12096 return DAG.getNode(Opcode, SL, VTList,
12097 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12098 Flags);
12099}
12100
12101SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12102 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12103 return FastLowered;
12104
12105 SDLoc SL(Op);
12106 EVT VT = Op.getValueType();
12107 SDValue LHS = Op.getOperand(0);
12108 SDValue RHS = Op.getOperand(1);
12109
12110 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12111 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12112
12113 if (VT == MVT::bf16) {
12114 SDValue ExtDiv =
12115 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12116 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12117 DAG.getTargetConstant(0, SL, MVT::i32));
12118 }
12119
12120 assert(VT == MVT::f16);
12121
12122 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12123 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12124 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12125 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12126 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12127 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12128 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12129 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12130 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12131 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12132 // q16.u = opx(V_CVT_F16_F32, q32.u);
12133 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12134
12135 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12136 unsigned FMADOpCode =
12137 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12138 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12139 SDValue Rcp =
12140 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12141 SDValue Quot =
12142 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12143 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12144 Op->getFlags());
12145 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12146 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12147 Op->getFlags());
12148 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12149 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12150 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12151 DAG.getConstant(0xff800000, SL, MVT::i32));
12152 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12153 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12154 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12155 DAG.getTargetConstant(0, SL, MVT::i32));
12156 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12157 Op->getFlags());
12158}
12159
12160// Faster 2.5 ULP division that does not support denormals.
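// Sketch of the scaling performed below: when |y| > 2^96 the denominator is
// pre-scaled by 2^-32 so rcp stays in range, and the quotient is multiplied
// by the same 2^-32 afterwards, i.e. x / y is computed roughly as
// 2^-32 * (x * rcp(y * 2^-32)).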
12161SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12162 SDNodeFlags Flags = Op->getFlags();
12163 SDLoc SL(Op);
12164 SDValue LHS = Op.getOperand(1);
12165 SDValue RHS = Op.getOperand(2);
12166
12167 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12168
12169 const APFloat K0Val(0x1p+96f);
12170 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12171
12172 const APFloat K1Val(0x1p-32f);
12173 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12174
12175 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12176
12177 EVT SetCCVT =
12178 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12179
12180 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12181
12182 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12183
12184 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12185
12186 // rcp does not support denormals.
12187 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12188
12189 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12190
12191 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12192}
12193
12194// Returns immediate value for setting the F32 denorm mode when using the
12195// S_DENORM_MODE instruction.
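// A sketch of the encoding used below: the requested FP32 denorm mode goes in
// bits [1:0] and the function's existing FP64/FP16 mode in bits [3:2], which
// matches the field layout consumed by S_DENORM_MODE.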
12196 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12197 const SIMachineFunctionInfo *Info,
12198 const GCNSubtarget *ST) {
12199 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12200 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12201 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12202 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12203}
12204
12205SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12206 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12207 return FastLowered;
12208
12209 // The selection matcher assumes anything with a chain selects to a
12210 // mayRaiseFPException machine instruction. Since we're introducing a chain
12211 // here, we need to explicitly report nofpexcept for the regular fdiv
12212 // lowering.
12213 SDNodeFlags Flags = Op->getFlags();
12214 Flags.setNoFPExcept(true);
12215
12216 SDLoc SL(Op);
12217 SDValue LHS = Op.getOperand(0);
12218 SDValue RHS = Op.getOperand(1);
12219
12220 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12221
12222 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12223
12224 SDValue DenominatorScaled =
12225 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12226 SDValue NumeratorScaled =
12227 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12228
12229 // Denominator is scaled to not be denormal, so using rcp is ok.
12230 SDValue ApproxRcp =
12231 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12232 SDValue NegDivScale0 =
12233 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12234
12235 using namespace AMDGPU::Hwreg;
12236 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12237 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12238
12239 const MachineFunction &MF = DAG.getMachineFunction();
12240 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12241 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12242
12243 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12244 const bool HasDynamicDenormals =
12245 (DenormMode.Input == DenormalMode::Dynamic) ||
12246 (DenormMode.Output == DenormalMode::Dynamic);
12247
12248 SDValue SavedDenormMode;
12249
12250 if (!PreservesDenormals) {
12251 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12252 // lowering. The chain dependence is insufficient, and we need glue. We do
12253 // not need the glue variants in a strictfp function.
12254
12255 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12256
12257 SDValue Glue = DAG.getEntryNode();
12258 if (HasDynamicDenormals) {
12259 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12260 DAG.getVTList(MVT::i32, MVT::Glue),
12261 {BitField, Glue});
12262 SavedDenormMode = SDValue(GetReg, 0);
12263
12264 Glue = DAG.getMergeValues(
12265 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12266 }
12267
12268 SDNode *EnableDenorm;
12269 if (Subtarget->hasDenormModeInst()) {
12270 const SDValue EnableDenormValue =
12271 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12272
12273 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12274 EnableDenormValue)
12275 .getNode();
12276 } else {
12277 const SDValue EnableDenormValue =
12278 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12279 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12280 {EnableDenormValue, BitField, Glue});
12281 }
12282
12283 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12284 SDValue(EnableDenorm, 1)};
12285
12286 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12287 }
12288
12289 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12290 ApproxRcp, One, NegDivScale0, Flags);
12291
12292 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12293 ApproxRcp, Fma0, Flags);
12294
12295 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12296 Fma1, Flags);
12297
12298 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12299 NumeratorScaled, Mul, Flags);
12300
12301 SDValue Fma3 =
12302 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12303
12304 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12305 NumeratorScaled, Fma3, Flags);
12306
12307 if (!PreservesDenormals) {
12308 SDNode *DisableDenorm;
12309 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12310 const SDValue DisableDenormValue = getSPDenormModeValue(
12311 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12312
12313 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12314 DisableDenorm =
12315 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12316 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12317 .getNode();
12318 } else {
12319 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12320 const SDValue DisableDenormValue =
12321 HasDynamicDenormals
12322 ? SavedDenormMode
12323 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12324
12325 DisableDenorm = DAG.getMachineNode(
12326 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12327 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12328 }
12329
12330 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12331 SDValue(DisableDenorm, 0), DAG.getRoot());
12332 DAG.setRoot(OutputChain);
12333 }
12334
12335 SDValue Scale = NumeratorScaled.getValue(1);
12336 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12337 {Fma4, Fma1, Fma3, Scale}, Flags);
12338
12339 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12340}
12341
12342SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12343 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12344 return FastLowered;
12345
12346 SDLoc SL(Op);
12347 SDValue X = Op.getOperand(0);
12348 SDValue Y = Op.getOperand(1);
12349
12350 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12351
12352 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12353
12354 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12355
12356 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12357
12358 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12359
12360 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12361
12362 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12363
12364 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12365
12366 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12367
12368 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12369 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12370
12371 SDValue Fma4 =
12372 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12373
12374 SDValue Scale;
12375
12376 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12377 // Work around a hardware bug on SI where the condition output from div_scale
12378 // is not usable.
12379
12380 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12381
12382 // Figure out which scale to use for div_fmas.
12383 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12384 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12385 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12386 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12387
12388 SDValue NumHi =
12389 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12390 SDValue DenHi =
12391 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12392
12393 SDValue Scale0Hi =
12394 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12395 SDValue Scale1Hi =
12396 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12397
12398 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12399 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12400 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12401 } else {
12402 Scale = DivScale1.getValue(1);
12403 }
12404
12405 SDValue Fmas =
12406 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12407
12408 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12409}
12410
12411SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12412 EVT VT = Op.getValueType();
12413
12414 if (VT == MVT::f32)
12415 return LowerFDIV32(Op, DAG);
12416
12417 if (VT == MVT::f64)
12418 return LowerFDIV64(Op, DAG);
12419
12420 if (VT == MVT::f16 || VT == MVT::bf16)
12421 return LowerFDIV16(Op, DAG);
12422
12423 llvm_unreachable("Unexpected type for fdiv");
12424}
12425
12426SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12427 SDLoc dl(Op);
12428 SDValue Val = Op.getOperand(0);
12429 EVT VT = Val.getValueType();
12430 EVT ResultExpVT = Op->getValueType(1);
12431 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12432
12433 SDValue Mant = DAG.getNode(
12434 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12435 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12436
12437 SDValue Exp = DAG.getNode(
12438 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12439 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12440
12441 if (Subtarget->hasFractBug()) {
12442 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12443 SDValue Inf =
12444 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12445
12446 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12447 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12448 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12449 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12450 }
12451
12452 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12453 return DAG.getMergeValues({Mant, CastExp}, dl);
12454}
12455
12456SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12457 SDLoc DL(Op);
12458 StoreSDNode *Store = cast<StoreSDNode>(Op);
12459 EVT VT = Store->getMemoryVT();
12460
12461 if (VT == MVT::i1) {
12462 return DAG.getTruncStore(
12463 Store->getChain(), DL,
12464 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12465 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12466 }
12467
12468 assert(VT.isVector() &&
12469 Store->getValue().getValueType().getScalarType() == MVT::i32);
12470
12471 unsigned AS = Store->getAddressSpace();
12472 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12473 Store->getAlign().value() < VT.getStoreSize() &&
12474 VT.getSizeInBits() > 32) {
12475 return SplitVectorStore(Op, DAG);
12476 }
12477
12478 MachineFunction &MF = DAG.getMachineFunction();
12479 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12480 // If there is a possibility that a flat instruction accesses scratch memory,
12481 // then we need to use the same legalization rules we use for private.
12482 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12483 !Subtarget->hasMultiDwordFlatScratchAddressing())
12484 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12485 ? AMDGPUAS::PRIVATE_ADDRESS
12486 : AMDGPUAS::GLOBAL_ADDRESS;
12487
12488 unsigned NumElements = VT.getVectorNumElements();
12489 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12490 if (NumElements > 4)
12491 return SplitVectorStore(Op, DAG);
12492 // v3 stores not supported on SI.
12493 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12494 return SplitVectorStore(Op, DAG);
12495
12496 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12497 VT, *Store->getMemOperand()))
12498 return expandUnalignedStore(Store, DAG);
12499
12500 return SDValue();
12501 }
12502 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12503 switch (Subtarget->getMaxPrivateElementSize()) {
12504 case 4:
12505 return scalarizeVectorStore(Store, DAG);
12506 case 8:
12507 if (NumElements > 2)
12508 return SplitVectorStore(Op, DAG);
12509 return SDValue();
12510 case 16:
12511 if (NumElements > 4 ||
12512 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12513 return SplitVectorStore(Op, DAG);
12514 return SDValue();
12515 default:
12516 llvm_unreachable("unsupported private_element_size");
12517 }
12518 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12519 unsigned Fast = 0;
12520 auto Flags = Store->getMemOperand()->getFlags();
12521 if (allowsMisalignedMemoryAccesses(VT, AS,
12522 Store->getAlign(), Flags, &Fast) &&
12523 Fast > 1)
12524 return SDValue();
12525
12526 if (VT.isVector())
12527 return SplitVectorStore(Op, DAG);
12528
12529 return expandUnalignedStore(Store, DAG);
12530 }
12531
12532 // Probably an invalid store. If so we'll end up emitting a selection error.
12533 return SDValue();
12534}
12535
12536// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12537SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12538 SDLoc SL(Op);
12539 assert(!Subtarget->has16BitInsts());
12540 SDNodeFlags Flags = Op->getFlags();
12541 SDValue Ext =
12542 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12543
12544 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12545 SDValue Sqrt =
12546 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12547
12548 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12549 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12550}
12551
12552SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12553 SDLoc DL(Op);
12554 SDNodeFlags Flags = Op->getFlags();
12555 MVT VT = Op.getValueType().getSimpleVT();
12556 const SDValue X = Op.getOperand(0);
12557
12558 if (allowApproxFunc(DAG, Flags)) {
12559 // Instruction is 1ulp but ignores denormals.
12560 return DAG.getNode(
12561 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12562 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12563 }
12564
12565 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12566 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12567
12568 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12569
12570 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12571
12572 SDValue SqrtX =
12573 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12574
12575 SDValue SqrtS;
12576 if (needsDenormHandlingF32(DAG, X, Flags)) {
12577 SDValue SqrtID =
12578 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12579 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12580
12581 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12582 SDValue SqrtSNextDownInt =
12583 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12584 DAG.getAllOnesConstant(DL, MVT::i32));
12585 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12586
12587 SDValue NegSqrtSNextDown =
12588 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12589
12590 SDValue SqrtVP =
12591 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12592
12593 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12594 DAG.getConstant(1, DL, MVT::i32));
12595 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12596
12597 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12598 SDValue SqrtVS =
12599 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12600
12601 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12602 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12603
12604 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12605 Flags);
12606
12607 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12608 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12609 Flags);
12610 } else {
12611 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12612
12613 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12614
12615 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12616 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12617 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12618
12619 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12620 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12621 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12622
12623 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12624 SDValue SqrtD =
12625 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12626 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12627 }
12628
12629 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12630
12631 SDValue ScaledDown =
12632 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12633
12634 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12635 SDValue IsZeroOrInf =
12636 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12637 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12638
12639 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12640}
12641
12642SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12643 // For the double type, the SQRT and RSQ instructions don't have the required
12644 // precision, so we apply Goldschmidt's algorithm to improve the result:
12645 //
12646 // y0 = rsq(x)
12647 // g0 = x * y0
12648 // h0 = 0.5 * y0
12649 //
12650 // r0 = 0.5 - h0 * g0
12651 // g1 = g0 * r0 + g0
12652 // h1 = h0 * r0 + h0
12653 //
12654 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12655 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12656 // h2 = h1 * r1 + h1
12657 //
12658 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12659 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12660 //
12661 // sqrt(x) = g3
12662
12663 SDNodeFlags Flags = Op->getFlags();
12664
12665 SDLoc DL(Op);
12666
12667 SDValue X = Op.getOperand(0);
12668 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12669
12670 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12671
12672 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12673
12674 // Scale up input if it is too small.
12675 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12676 SDValue ScaleUp =
12677 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12678 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12679
12680 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12681
12682 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12683
12684 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12685 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12686
12687 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12688 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12689
12690 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12691
12692 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12693
12694 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12695 SDValue SqrtD0 =
12696 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12697
12698 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12699
12700 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12701 SDValue SqrtD1 =
12702 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12703
12704 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12705
12706 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12707 SDValue ScaleDown =
12708 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12709 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12710
12711 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12712 // with finite only or nsz because rsq(+/-0) = +/-inf
12713
12714 // TODO: Check for DAZ and expand to subnormals
12715 SDValue IsZeroOrInf =
12716 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12717 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12718
12719 // If x is +INF, +0, or -0, use its original value
12720 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12721 Flags);
12722}
12723
12724SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12725 SDLoc DL(Op);
12726 EVT VT = Op.getValueType();
12727 SDValue Arg = Op.getOperand(0);
12728 SDValue TrigVal;
12729
12730 // Propagate fast-math flags so that the multiply we introduce can be folded
12731 // if Arg is already the result of a multiply by constant.
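// Sketch of the lowering below: the hardware SIN/COS inputs are in units of
// full revolutions, so Arg is scaled by 1/(2*pi) first; on subtargets with a
// reduced trig input range, a FRACT brings the scaled value into [0, 1).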
12732 auto Flags = Op->getFlags();
12733
12734 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12735
12736 if (Subtarget->hasTrigReducedRange()) {
12737 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12738 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12739 } else {
12740 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12741 }
12742
12743 switch (Op.getOpcode()) {
12744 case ISD::FCOS:
12745 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12746 case ISD::FSIN:
12747 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12748 default:
12749 llvm_unreachable("Wrong trig opcode");
12750 }
12751}
12752
12753SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12754 SelectionDAG &DAG) const {
12755 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12756 assert(AtomicNode->isCompareAndSwap());
12757 unsigned AS = AtomicNode->getAddressSpace();
12758
12759 // No custom lowering required for local address space
12760 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12761 return Op;
12762
12763 // Non-local address space requires custom lowering for atomic compare
12764 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12765 SDLoc DL(Op);
12766 SDValue ChainIn = Op.getOperand(0);
12767 SDValue Addr = Op.getOperand(1);
12768 SDValue Old = Op.getOperand(2);
12769 SDValue New = Op.getOperand(3);
12770 EVT VT = Op.getValueType();
12771 MVT SimpleVT = VT.getSimpleVT();
12772 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12773
12774 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12775 SDValue Ops[] = {ChainIn, Addr, NewOld};
12776
12776
12777 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12778 Op->getVTList(), Ops, VT,
12779 AtomicNode->getMemOperand());
12780}
12781
12782//===----------------------------------------------------------------------===//
12783// Custom DAG optimizations
12784//===----------------------------------------------------------------------===//
12785
12786SDValue
12787SITargetLowering::performUCharToFloatCombine(SDNode *N,
12788 DAGCombinerInfo &DCI) const {
12789 EVT VT = N->getValueType(0);
12790 EVT ScalarVT = VT.getScalarType();
12791 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12792 return SDValue();
12793
12794 SelectionDAG &DAG = DCI.DAG;
12795 SDLoc DL(N);
12796
12797 SDValue Src = N->getOperand(0);
12798 EVT SrcVT = Src.getValueType();
12799
12800 // TODO: We could try to match extracting the higher bytes, which would be
12801 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12802 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12803 // about in practice.
12804 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12805 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12806 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12807 DCI.AddToWorklist(Cvt.getNode());
12808
12809 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12810 if (ScalarVT != MVT::f32) {
12811 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12812 DAG.getTargetConstant(0, DL, MVT::i32));
12813 }
12814 return Cvt;
12815 }
12816 }
12817
12818 return SDValue();
12819}
12820
12821SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12822 DAGCombinerInfo &DCI) const {
12823 SDValue MagnitudeOp = N->getOperand(0);
12824 SDValue SignOp = N->getOperand(1);
12825
12826 // The generic combine for fcopysign + fp cast is too conservative with
12827 // vectors, and also gets confused by the splitting we will perform here, so
12828 // peek through FP casts.
12829 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12830 SignOp.getOpcode() == ISD::FP_ROUND)
12831 SignOp = SignOp.getOperand(0);
12832
12833 SelectionDAG &DAG = DCI.DAG;
12834 SDLoc DL(N);
12835 EVT SignVT = SignOp.getValueType();
12836
12837 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12838 // lower half with a copy.
12839 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12840 EVT MagVT = MagnitudeOp.getValueType();
12841
12842 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12843
12844 if (MagVT.getScalarType() == MVT::f64) {
12845 EVT F32VT = MagVT.isVector()
12846 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12847 : MVT::v2f32;
12848
12849 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12850
12851 SmallVector<SDValue, 8> NewElts;
12852 for (unsigned I = 0; I != NumElts; ++I) {
12853 SDValue MagLo =
12854 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12855 DAG.getConstant(2 * I, DL, MVT::i32));
12856 SDValue MagHi =
12857 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12858 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12859
12860 SDValue SignOpElt =
12861 MagVT.isVector()
12862 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12863 SignOp, DAG.getConstant(I, DL, MVT::i32))
12864 : SignOp;
12865
12866 SDValue HiOp =
12867 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12868
12869 SDValue Vector =
12870 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12871
12872 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12873 NewElts.push_back(NewElt);
12874 }
12875
12876 if (NewElts.size() == 1)
12877 return NewElts[0];
12878
12879 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12880 }
12881
12882 if (SignVT.getScalarType() != MVT::f64)
12883 return SDValue();
12884
12885 // Reduce width of sign operand, we only need the highest bit.
12886 //
12887 // fcopysign f64:x, f64:y ->
12888 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12889 // TODO: In some cases it might make sense to go all the way to f16.
12890
12891 EVT F32VT = MagVT.isVector()
12892 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12893 : MVT::v2f32;
12894
12895 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12896
12897 SmallVector<SDValue, 8> F32Signs;
12898 for (unsigned I = 0; I != NumElts; ++I) {
12899 // Take sign from odd elements of cast vector
12900 SDValue SignAsF32 =
12901 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12902 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12903 F32Signs.push_back(SignAsF32);
12904 }
12905
12906 SDValue NewSign =
12907 NumElts == 1
12908 ? F32Signs.back()
12909 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12910 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12911 F32Signs);
12912
12913 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12914 NewSign);
12915}
12916
12917// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12918// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12919// bits
12920
12921// This is a variant of
12922// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12923//
12924// The normal DAG combiner will do this, but only if the add has one use since
12925// that would increase the number of instructions.
12926//
12927// This prevents us from seeing a constant offset that can be folded into a
12928// memory instruction's addressing mode. If we know the resulting add offset of
12929// a pointer can be folded into an addressing offset, we can replace the pointer
12930// operand with the add of new constant offset. This eliminates one of the uses,
12931// and may allow the remaining use to also be simplified.
12932//
12933SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12934 EVT MemVT,
12935 DAGCombinerInfo &DCI) const {
12936 SDValue N0 = N->getOperand(0);
12937 SDValue N1 = N->getOperand(1);
12938
12939 // We only do this to handle cases where it's profitable when there are
12940 // multiple uses of the add, so defer to the standard combine.
12941 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
12942 return SDValue();
12943
12944 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12945 if (!CN1)
12946 return SDValue();
12947
12948 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12949 if (!CAdd)
12950 return SDValue();
12951
12952 SelectionDAG &DAG = DCI.DAG;
12953
12954 if (N0->getOpcode() == ISD::OR &&
12955 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12956 return SDValue();
12957
12958 // If the resulting offset is too large, we can't fold it into the
12959 // addressing mode offset.
12960 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12961 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12962
12963 AddrMode AM;
12964 AM.HasBaseReg = true;
12965 AM.BaseOffs = Offset.getSExtValue();
12966 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12967 return SDValue();
12968
12969 SDLoc SL(N);
12970 EVT VT = N->getValueType(0);
12971
12972 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12973 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12974
12975 SDNodeFlags Flags;
12976 Flags.setNoUnsignedWrap(
12977 N->getFlags().hasNoUnsignedWrap() &&
12978 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12979
12980 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
12981 // be sure that the new left operand is a proper base pointer.
12982 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12983}
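// For example, with a multi-use (add x, 3) feeding (shl (add x, 3), 2) that is
// used as an address, this produces (add (shl x, 2), 12); the constant 12 can
// then be folded into the memory instruction's addressing-mode offset.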
12984
12985 /// MemSDNode::getBasePtr() does not work for intrinsics, where the index must be offset
12986/// by the chain and intrinsic ID. Theoretically we would also need to check the
12987/// specific intrinsic, but they all place the pointer operand first.
12988static unsigned getBasePtrIndex(const MemSDNode *N) {
12989 switch (N->getOpcode()) {
12990 case ISD::STORE:
12991 case ISD::INTRINSIC_W_CHAIN:
12992 case ISD::INTRINSIC_VOID:
12993 return 2;
12994 default:
12995 return 1;
12996 }
12997}
12998
12999SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13000 DAGCombinerInfo &DCI) const {
13001 SelectionDAG &DAG = DCI.DAG;
13002
13003 unsigned PtrIdx = getBasePtrIndex(N);
13004 SDValue Ptr = N->getOperand(PtrIdx);
13005
13006 // TODO: We could also do this for multiplies.
13007 if (Ptr.getOpcode() == ISD::SHL) {
13008 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13009 N->getMemoryVT(), DCI);
13010 if (NewPtr) {
13011 SmallVector<SDValue, 8> NewOps(N->ops());
13012
13013 NewOps[PtrIdx] = NewPtr;
13014 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13015 }
13016 }
13017
13018 return SDValue();
13019}
13020
13021static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13022 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13023 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13024 (Opc == ISD::XOR && Val == 0);
13025}
13026
13027// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13028// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13029// integer combine opportunities since most 64-bit operations are decomposed
13030// this way. TODO: We won't want this for SALU especially if it is an inline
13031// immediate.
13032SDValue SITargetLowering::splitBinaryBitConstantOp(
13033 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13034 const ConstantSDNode *CRHS) const {
13035 uint64_t Val = CRHS->getZExtValue();
13036 uint32_t ValLo = Lo_32(Val);
13037 uint32_t ValHi = Hi_32(Val);
13038 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13039
13040 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13041 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13042 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13043 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13044 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13045 !CRHS->user_begin()->isDivergent())
13046 return SDValue();
13047
13048 // If we need to materialize a 64-bit immediate, it will be split up later
13049 // anyway. Avoid creating the harder to understand 64-bit immediate
13050 // materialization.
13051 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13052 }
13053
13054 return SDValue();
13055}
13056
13057 bool llvm::isBoolSGPR(SDValue V) {
13058 if (V.getValueType() != MVT::i1)
13059 return false;
13060 switch (V.getOpcode()) {
13061 default:
13062 break;
13063 case ISD::SETCC:
13064 case ISD::IS_FPCLASS:
13065 case AMDGPUISD::FP_CLASS:
13066 return true;
13067 case ISD::AND:
13068 case ISD::OR:
13069 case ISD::XOR:
13070 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13071 case ISD::SADDO:
13072 case ISD::UADDO:
13073 case ISD::SSUBO:
13074 case ISD::USUBO:
13075 case ISD::SMULO:
13076 case ISD::UMULO:
13077 return V.getResNo() == 1;
13078 case ISD::INTRINSIC_WO_CHAIN: {
13079 unsigned IntrinsicID = V.getConstantOperandVal(0);
13080 switch (IntrinsicID) {
13081 case Intrinsic::amdgcn_is_shared:
13082 case Intrinsic::amdgcn_is_private:
13083 return true;
13084 default:
13085 return false;
13086 }
13087
13088 return false;
13089 }
13090 }
13091 return false;
13092}
13093
13094// If a constant has all zeroes or all ones within each byte return it.
13095// Otherwise return 0.
13096 static uint32_t getConstantPermuteMask(uint32_t C) {
13097 // 0xff for any zero byte in the mask
13098 uint32_t ZeroByteMask = 0;
13099 if (!(C & 0x000000ff))
13100 ZeroByteMask |= 0x000000ff;
13101 if (!(C & 0x0000ff00))
13102 ZeroByteMask |= 0x0000ff00;
13103 if (!(C & 0x00ff0000))
13104 ZeroByteMask |= 0x00ff0000;
13105 if (!(C & 0xff000000))
13106 ZeroByteMask |= 0xff000000;
13107 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13108 if ((NonZeroByteMask & C) != NonZeroByteMask)
13109 return 0; // Partial bytes selected.
13110 return C;
13111}
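// For example, 0x00FF00FF is returned unchanged because every byte is either
// all ones or all zeros, while 0x00FF0080 returns 0 because byte 0 is only
// partially selected.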
13112
13113// Check if a node selects whole bytes from its operand 0 starting at a byte
13114 // boundary while masking the rest. Returns the select mask as used by
13115 // v_perm_b32, or ~0 if no match was found.
13116// Note byte select encoding:
13117// value 0-3 selects corresponding source byte;
13118// value 0xc selects zero;
13119// value 0xff selects 0xff.
13120 static uint32_t getPermuteMask(SDValue V) {
13121 assert(V.getValueSizeInBits() == 32);
13122
13123 if (V.getNumOperands() != 2)
13124 return ~0;
13125
13126 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13127 if (!N1)
13128 return ~0;
13129
13130 uint32_t C = N1->getZExtValue();
13131
13132 switch (V.getOpcode()) {
13133 default:
13134 break;
13135 case ISD::AND:
13136 if (uint32_t ConstMask = getConstantPermuteMask(C))
13137 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13138 break;
13139
13140 case ISD::OR:
13141 if (uint32_t ConstMask = getConstantPermuteMask(C))
13142 return (0x03020100 & ~ConstMask) | ConstMask;
13143 break;
13144
13145 case ISD::SHL:
13146 if (C % 8)
13147 return ~0;
13148
13149 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13150
13151 case ISD::SRL:
13152 if (C % 8)
13153 return ~0;
13154
13155 return uint32_t(0x0c0c0c0c03020100ull >> C);
13156 }
13157
13158 return ~0;
13159}
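// For example, (and x, 0x0000FFFF) maps to the mask 0x0C0C0100 (keep bytes 0-1,
// zero bytes 2-3), and (srl x, 16) maps to 0x0C0C0302 (bytes 2-3 of the source
// move down to bytes 0-1, upper result bytes become zero).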
13160
13161SDValue SITargetLowering::performAndCombine(SDNode *N,
13162 DAGCombinerInfo &DCI) const {
13163 if (DCI.isBeforeLegalize())
13164 return SDValue();
13165
13166 SelectionDAG &DAG = DCI.DAG;
13167 EVT VT = N->getValueType(0);
13168 SDValue LHS = N->getOperand(0);
13169 SDValue RHS = N->getOperand(1);
13170
13171 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13172 if (VT == MVT::i64 && CRHS) {
13173 if (SDValue Split =
13174 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13175 return Split;
13176 }
13177
13178 if (CRHS && VT == MVT::i32) {
13179 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13180 // nb = number of trailing zeroes in mask
13181 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13182 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
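// For example, (and (srl x, 8), 0xff00) has Bits = 8, Shift = 8 and NB = 8, so
// Offset = 16 and the result is (shl (AssertZext i8 (bfe_u32 x, 16, 8)), 8).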
13183 uint64_t Mask = CRHS->getZExtValue();
13184 unsigned Bits = llvm::popcount(Mask);
13185 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13186 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13187 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13188 unsigned Shift = CShift->getZExtValue();
13189 unsigned NB = CRHS->getAPIntValue().countr_zero();
13190 unsigned Offset = NB + Shift;
13191 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13192 SDLoc SL(N);
13193 SDValue BFE =
13194 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13195 DAG.getConstant(Offset, SL, MVT::i32),
13196 DAG.getConstant(Bits, SL, MVT::i32));
13197 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13198 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13199 DAG.getValueType(NarrowVT));
13200 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13201 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13202 return Shl;
13203 }
13204 }
13205 }
13206
13207 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13208 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13209 isa<ConstantSDNode>(LHS.getOperand(2))) {
13210 uint32_t Sel = getConstantPermuteMask(Mask);
13211 if (!Sel)
13212 return SDValue();
13213
13214 // Select 0xc for all zero bytes
13215 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13216 SDLoc DL(N);
13217 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13218 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13219 }
13220 }
13221
13222 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13223 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13224 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13225 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13226 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13227
13228 SDValue X = LHS.getOperand(0);
13229 SDValue Y = RHS.getOperand(0);
13230 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13231 !isTypeLegal(X.getValueType()))
13232 return SDValue();
13233
13234 if (LCC == ISD::SETO) {
13235 if (X != LHS.getOperand(1))
13236 return SDValue();
13237
13238 if (RCC == ISD::SETUNE) {
13239 const ConstantFPSDNode *C1 =
13240 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13241 if (!C1 || !C1->isInfinity() || C1->isNegative())
13242 return SDValue();
13243
13244 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13245 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13246 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13247 SIInstrFlags::P_NORMAL;
13248
13249 static_assert(
13250 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13251 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13252 0x3ff) == Mask,
13253 "mask not equal");
13254
13255 SDLoc DL(N);
13256 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13257 DAG.getConstant(Mask, DL, MVT::i32));
13258 }
13259 }
13260 }
13261
13262 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13263 std::swap(LHS, RHS);
13264
13265 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13266 RHS.hasOneUse()) {
13267 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13268 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13269 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13270 // | n_nan)
13271 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13272 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13273 (RHS.getOperand(0) == LHS.getOperand(0) &&
13274 LHS.getOperand(0) == LHS.getOperand(1))) {
13275 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13276 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13277 : Mask->getZExtValue() & OrdMask;
13278
13279 SDLoc DL(N);
13280 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13281 DAG.getConstant(NewMask, DL, MVT::i32));
13282 }
13283 }
13284
13285 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13286 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13287 // and x, (sext cc from i1) => select cc, x, 0
13288 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13289 std::swap(LHS, RHS);
13290 if (isBoolSGPR(RHS.getOperand(0)))
13291 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13292 DAG.getConstant(0, SDLoc(N), MVT::i32));
13293 }
13294
13295 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13296 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13297 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13298 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13299 uint32_t LHSMask = getPermuteMask(LHS);
13300 uint32_t RHSMask = getPermuteMask(RHS);
13301 if (LHSMask != ~0u && RHSMask != ~0u) {
13302 // Canonicalize the expression in an attempt to have fewer unique masks
13303 // and therefore fewer registers used to hold the masks.
13304 if (LHSMask > RHSMask) {
13305 std::swap(LHSMask, RHSMask);
13306 std::swap(LHS, RHS);
13307 }
13308
13309 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13310 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13311 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13312 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13313
13314 // Check if we need to combine values from two sources within a byte.
13315 if (!(LHSUsedLanes & RHSUsedLanes) &&
13316 // If we select high and lower word keep it for SDWA.
13317 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13318 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13319 // Each byte in each mask is either selector mask 0-3, or has higher
13320 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13321 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13322 // mask which is not 0xff wins. By anding both masks we have a correct
13323 // result except that 0x0c shall be corrected to give 0x0c only.
13324 uint32_t Mask = LHSMask & RHSMask;
13325 for (unsigned I = 0; I < 32; I += 8) {
13326 uint32_t ByteSel = 0xff << I;
13327 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13328 Mask &= (0x0c << I) & 0xffffffff;
13329 }
13330
13331 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13332 // or 0x0c.
13333 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13334 SDLoc DL(N);
13335
13336 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13337 RHS.getOperand(0),
13338 DAG.getConstant(Sel, DL, MVT::i32));
13339 }
13340 }
13341 }
13342
13343 return SDValue();
13344}
13345
13346// A key component of v_perm is a mapping between byte position of the src
13347// operands, and the byte position of the dest. To provide such, we need: 1. the
13348// node that provides x byte of the dest of the OR, and 2. the byte of the node
13349// used to provide that x byte. calculateByteProvider finds which node provides
13350// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13351 // and finds an ultimate src and byte position. For example: the supported
13352// LoadCombine pattern for vector loads is as follows
13353// t1
13354// or
13355// / \
13356// t2 t3
13357// zext shl
13358// | | \
13359// t4 t5 16
13360// or anyext
13361// / \ |
13362// t6 t7 t8
13363// srl shl or
13364// / | / \ / \
13365// t9 t10 t11 t12 t13 t14
13366// trunc* 8 trunc* 8 and and
13367// | | / | | \
13368// t15 t16 t17 t18 t19 t20
13369// trunc* 255 srl -256
13370// | / \
13371// t15 t15 16
13372//
13373// *In this example, the truncs are from i32->i16
13374//
13375// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13376// respectively. calculateSrcByte would find (given node) -> ultimate src &
13377// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13378// After finding the mapping, we can combine the tree into vperm t15, t16,
13379// 0x05000407
13380
13381// Find the source and byte position from a node.
13382// \p DestByte is the byte position of the dest of the or that the src
13383// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13384// dest of the or byte. \p Depth tracks how many recursive iterations we have
13385// performed.
13386static const std::optional<ByteProvider<SDValue>>
13387calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13388 unsigned Depth = 0) {
13389 // We may need to recursively traverse a series of SRLs
13390 if (Depth >= 6)
13391 return std::nullopt;
13392
13393 if (Op.getValueSizeInBits() < 8)
13394 return std::nullopt;
13395
13396 if (Op.getValueType().isVector())
13397 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13398
13399 switch (Op->getOpcode()) {
13400 case ISD::TRUNCATE: {
13401 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13402 }
13403
13404 case ISD::SIGN_EXTEND:
13405 case ISD::ZERO_EXTEND:
13406 case ISD::SIGN_EXTEND_INREG: {
13407 SDValue NarrowOp = Op->getOperand(0);
13408 auto NarrowVT = NarrowOp.getValueType();
13409 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13410 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13411 NarrowVT = VTSign->getVT();
13412 }
13413 if (!NarrowVT.isByteSized())
13414 return std::nullopt;
13415 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13416
13417 if (SrcIndex >= NarrowByteWidth)
13418 return std::nullopt;
13419 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13420 }
13421
13422 case ISD::SRA:
13423 case ISD::SRL: {
13424 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13425 if (!ShiftOp)
13426 return std::nullopt;
13427
13428 uint64_t BitShift = ShiftOp->getZExtValue();
13429
13430 if (BitShift % 8 != 0)
13431 return std::nullopt;
13432
13433 SrcIndex += BitShift / 8;
13434
13435 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13436 }
13437
13438 default: {
13439 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13440 }
13441 }
13442 llvm_unreachable("fully handled switch");
13443}
13444
13445// For a byte position in the result of an Or, traverse the tree and find the
13446// node (and the byte of the node) which ultimately provides this {Or,
13447// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13448// the byte position of the Op that corresponds with the originally requested
13449// byte of the Or \p Depth tracks how many recursive iterations we have
13450// performed. \p StartingIndex is the originally requested byte of the Or
13451static const std::optional<ByteProvider<SDValue>>
13452calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13453 unsigned StartingIndex = 0) {
13454 // Finding Src tree of RHS of or typically requires at least 1 additional
13455 // depth
13456 if (Depth > 6)
13457 return std::nullopt;
13458
13459 unsigned BitWidth = Op.getScalarValueSizeInBits();
13460 if (BitWidth % 8 != 0)
13461 return std::nullopt;
13462 if (Index > BitWidth / 8 - 1)
13463 return std::nullopt;
13464
13465 bool IsVec = Op.getValueType().isVector();
13466 switch (Op.getOpcode()) {
13467 case ISD::OR: {
13468 if (IsVec)
13469 return std::nullopt;
13470
13471 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13472 StartingIndex);
13473 if (!RHS)
13474 return std::nullopt;
13475 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13476 StartingIndex);
13477 if (!LHS)
13478 return std::nullopt;
13479 // A well formed Or will have two ByteProviders for each byte, one of which
13480 // is constant zero
13481 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13482 return std::nullopt;
13483 if (!LHS || LHS->isConstantZero())
13484 return RHS;
13485 if (!RHS || RHS->isConstantZero())
13486 return LHS;
13487 return std::nullopt;
13488 }
13489
13490 case ISD::AND: {
13491 if (IsVec)
13492 return std::nullopt;
13493
13494 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13495 if (!BitMaskOp)
13496 return std::nullopt;
13497
13498 uint32_t BitMask = BitMaskOp->getZExtValue();
13499 // Bits we expect for our StartingIndex
13500 uint32_t IndexMask = 0xFF << (Index * 8);
13501
13502 if ((IndexMask & BitMask) != IndexMask) {
13503 // If the result of the and partially provides the byte, then it
13504 // is not well formatted
13505 if (IndexMask & BitMask)
13506 return std::nullopt;
13507 return ByteProvider<SDValue>::getConstantZero();
13508 }
13509
13510 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13511 }
13512
13513 case ISD::FSHR: {
13514 if (IsVec)
13515 return std::nullopt;
13516
13517 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13518 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13519 if (!ShiftOp || Op.getValueType().isVector())
13520 return std::nullopt;
13521
13522 uint64_t BitsProvided = Op.getValueSizeInBits();
13523 if (BitsProvided % 8 != 0)
13524 return std::nullopt;
13525
13526 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13527 if (BitShift % 8)
13528 return std::nullopt;
13529
13530 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13531 uint64_t ByteShift = BitShift / 8;
13532
13533 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13534 uint64_t BytesProvided = BitsProvided / 8;
13535 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13536 NewIndex %= BytesProvided;
13537 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13538 }
13539
13540 case ISD::SRA:
13541 case ISD::SRL: {
13542 if (IsVec)
13543 return std::nullopt;
13544
13545 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13546 if (!ShiftOp)
13547 return std::nullopt;
13548
13549 uint64_t BitShift = ShiftOp->getZExtValue();
13550 if (BitShift % 8)
13551 return std::nullopt;
13552
13553 auto BitsProvided = Op.getScalarValueSizeInBits();
13554 if (BitsProvided % 8 != 0)
13555 return std::nullopt;
13556
13557 uint64_t BytesProvided = BitsProvided / 8;
13558 uint64_t ByteShift = BitShift / 8;
13559 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13560 // If the byte we are trying to provide (as tracked by index) falls in this
13561 // range, then the SRL provides the byte. The byte of interest of the src of
13562 // the SRL is Index + ByteShift
13563 return BytesProvided - ByteShift > Index
13564 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13565 Index + ByteShift)
13566 : ByteProvider<SDValue>::getConstantZero();
13567 }
13568
13569 case ISD::SHL: {
13570 if (IsVec)
13571 return std::nullopt;
13572
13573 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13574 if (!ShiftOp)
13575 return std::nullopt;
13576
13577 uint64_t BitShift = ShiftOp->getZExtValue();
13578 if (BitShift % 8 != 0)
13579 return std::nullopt;
13580 uint64_t ByteShift = BitShift / 8;
13581
13582 // If we are shifting by an amount greater than (or equal to)
13583 // the index we are trying to provide, then it provides 0s. If not,
13584 // then these bytes are not definitively 0s, and the corresponding byte
13585 // of interest is Index - ByteShift of the src
13586 return Index < ByteShift
13587 ? ByteProvider<SDValue>::getConstantZero()
13588 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13589 Depth + 1, StartingIndex);
13590 }
13591 case ISD::ANY_EXTEND:
13592 case ISD::SIGN_EXTEND:
13593 case ISD::ZERO_EXTEND:
13594 case ISD::SIGN_EXTEND_INREG:
13595 case ISD::AssertZext:
13596 case ISD::AssertSext: {
13597 if (IsVec)
13598 return std::nullopt;
13599
13600 SDValue NarrowOp = Op->getOperand(0);
13601 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13602 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13603 Op->getOpcode() == ISD::AssertZext ||
13604 Op->getOpcode() == ISD::AssertSext) {
13605 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13606 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13607 }
13608 if (NarrowBitWidth % 8 != 0)
13609 return std::nullopt;
13610 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13611
13612 if (Index >= NarrowByteWidth)
13613 return Op.getOpcode() == ISD::ZERO_EXTEND
13614 ? std::optional<ByteProvider<SDValue>>(
13615 ByteProvider<SDValue>::getConstantZero())
13616 : std::nullopt;
13617 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13618 }
13619
13620 case ISD::TRUNCATE: {
13621 if (IsVec)
13622 return std::nullopt;
13623
13624 uint64_t NarrowByteWidth = BitWidth / 8;
13625
13626 if (NarrowByteWidth >= Index) {
13627 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13628 StartingIndex);
13629 }
13630
13631 return std::nullopt;
13632 }
13633
13634 case ISD::CopyFromReg: {
13635 if (BitWidth / 8 > Index)
13636 return calculateSrcByte(Op, StartingIndex, Index);
13637
13638 return std::nullopt;
13639 }
13640
13641 case ISD::LOAD: {
13642 auto *L = cast<LoadSDNode>(Op.getNode());
13643
13644 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13645 if (NarrowBitWidth % 8 != 0)
13646 return std::nullopt;
13647 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13648
13649 // If the width of the load does not reach the byte we are trying to provide for
13650 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13651 // question
13652 if (Index >= NarrowByteWidth) {
13653 return L->getExtensionType() == ISD::ZEXTLOAD
13654 ? std::optional<ByteProvider<SDValue>>(
13655 ByteProvider<SDValue>::getConstantZero())
13656 : std::nullopt;
13657 }
13658
13659 if (NarrowByteWidth > Index) {
13660 return calculateSrcByte(Op, StartingIndex, Index);
13661 }
13662
13663 return std::nullopt;
13664 }
13665
13666 case ISD::BSWAP: {
13667 if (IsVec)
13668 return std::nullopt;
13669
13670 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13671 Depth + 1, StartingIndex);
13672 }
13673
13674 case ISD::EXTRACT_VECTOR_ELT: {
13675 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13676 if (!IdxOp)
13677 return std::nullopt;
13678 auto VecIdx = IdxOp->getZExtValue();
13679 auto ScalarSize = Op.getScalarValueSizeInBits();
13680 if (ScalarSize < 32)
13681 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13682 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13683 StartingIndex, Index);
13684 }
13685
13686 case AMDGPUISD::PERM: {
13687 if (IsVec)
13688 return std::nullopt;
13689
13690 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13691 if (!PermMask)
13692 return std::nullopt;
13693
13694 auto IdxMask =
13695 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13696 if (IdxMask > 0x07 && IdxMask != 0x0c)
13697 return std::nullopt;
13698
13699 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13700 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13701
13702 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13703 : ByteProvider<SDValue>(
13704 ByteProvider<SDValue>::getConstantZero());
13705 }
13706
13707 default: {
13708 return std::nullopt;
13709 }
13710 }
13711
13712 llvm_unreachable("fully handled switch");
13713}
13714
13715// Returns true if the Operand is a scalar and is 16 bits
13716static bool isExtendedFrom16Bits(SDValue &Operand) {
13717
13718 switch (Operand.getOpcode()) {
13719 case ISD::ANY_EXTEND:
13720 case ISD::SIGN_EXTEND:
13721 case ISD::ZERO_EXTEND: {
13722 auto OpVT = Operand.getOperand(0).getValueType();
13723 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13724 }
13725 case ISD::LOAD: {
13726 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13727 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13728 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13729 ExtType == ISD::EXTLOAD) {
13730 auto MemVT = L->getMemoryVT();
13731 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13732 }
13733 return L->getMemoryVT().getSizeInBits() == 16;
13734 }
13735 default:
13736 return false;
13737 }
13738}
13739
13740// Returns true if the mask matches consecutive bytes, and the first byte
13741 // begins at an even (16-bit aligned) byte offset from the 0th byte
13742static bool addresses16Bits(int Mask) {
13743 int Low8 = Mask & 0xff;
13744 int Hi8 = (Mask & 0xff00) >> 8;
13745
13746 assert(Low8 < 8 && Hi8 < 8);
13747 // Are the bytes contiguous in the order of increasing addresses.
13748 bool IsConsecutive = (Hi8 - Low8 == 1);
13749 // Is the first byte at location that is aligned for 16 bit instructions.
13750 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13751 // In this case, we still need code to extract the 16 bit operand, so it
13752 // is better to use i8 v_perm
13753 bool Is16Aligned = !(Low8 % 2);
13754
13755 return IsConsecutive && Is16Aligned;
13756}
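// For example, mask 0x0504 selects bytes 4 and 5 (consecutive and starting at
// an even byte) and returns true, while 0x0403 is consecutive but starts at
// the odd byte 3 and returns false.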
13757
13758// Do not lower into v_perm if the operands are actually 16 bit
13759// and the selected bits (based on PermMask) correspond with two
13760// easily addressable 16 bit operands.
13761 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13762 SDValue &OtherOp) {
13763 int Low16 = PermMask & 0xffff;
13764 int Hi16 = (PermMask & 0xffff0000) >> 16;
13765
13766 auto TempOp = peekThroughBitcasts(Op);
13767 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13768
13769 auto OpIs16Bit =
13770 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13771 if (!OpIs16Bit)
13772 return true;
13773
13774 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13775 isExtendedFrom16Bits(TempOtherOp);
13776 if (!OtherOpIs16Bit)
13777 return true;
13778
13779 // Do we cleanly address both
13780 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13781}
13782
13783 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13784 unsigned DWordOffset) {
13785 SDValue Ret;
13786
13787 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13788 // ByteProvider must be at least 8 bits
13789 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13790
13791 if (TypeSize <= 32)
13792 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13793
13794 if (Src.getValueType().isVector()) {
13795 auto ScalarTySize = Src.getScalarValueSizeInBits();
13796 auto ScalarTy = Src.getValueType().getScalarType();
13797 if (ScalarTySize == 32) {
13798 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13799 DAG.getConstant(DWordOffset, SL, MVT::i32));
13800 }
13801 if (ScalarTySize > 32) {
13802 Ret = DAG.getNode(
13803 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13804 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13805 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13806 if (ShiftVal)
13807 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13808 DAG.getConstant(ShiftVal, SL, MVT::i32));
13809 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13810 }
13811
13812 assert(ScalarTySize < 32);
13813 auto NumElements = TypeSize / ScalarTySize;
13814 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13815 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13816 auto NumElementsIn32 = 32 / ScalarTySize;
13817 auto NumAvailElements = DWordOffset < Trunc32Elements
13818 ? NumElementsIn32
13819 : NumElements - NormalizedTrunc;
13820
13821 SmallVector<SDValue, 4> VecSrcs;
13822 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13823 NumAvailElements);
13824
13825 Ret = DAG.getBuildVector(
13826 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13827 VecSrcs);
13828 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13829 }
13830
13831 /// Scalar Type
13832 auto ShiftVal = 32 * DWordOffset;
13833 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13834 DAG.getConstant(ShiftVal, SL, MVT::i32));
13835 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13836}
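// For example, given a v8i16 source and DWordOffset = 1, this extracts
// elements 2 and 3, rebuilds them as a v2i16, and any-extends/bitcasts the
// result to i32.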
13837
13838 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13839 SelectionDAG &DAG = DCI.DAG;
13840 [[maybe_unused]] EVT VT = N->getValueType(0);
13841 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13842
13843 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13844 assert(VT == MVT::i32);
13845 for (int i = 0; i < 4; i++) {
13846 // Find the ByteProvider that provides the ith byte of the result of OR
13847 std::optional<ByteProvider<SDValue>> P =
13848 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13849 // TODO support constantZero
13850 if (!P || P->isConstantZero())
13851 return SDValue();
13852
13853 PermNodes.push_back(*P);
13854 }
13855 if (PermNodes.size() != 4)
13856 return SDValue();
13857
13858 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13859 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13860 uint64_t PermMask = 0x00000000;
13861 for (size_t i = 0; i < PermNodes.size(); i++) {
13862 auto PermOp = PermNodes[i];
13863 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13864 // by sizeof(Src2) = 4
13865 int SrcByteAdjust = 4;
13866
13867 // If the Src uses a byte from a different DWORD, then it corresponds
13868 // with a different source
13869 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13870 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13871 if (SecondSrc)
13872 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13873 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13874 return SDValue();
13875
13876 // Set the index of the second distinct Src node
13877 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13878 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13879 SrcByteAdjust = 0;
13880 }
13881 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13882 assert(!DAG.getDataLayout().isBigEndian());
13883 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13884 }
13885 SDLoc DL(N);
13886 SDValue Op = *PermNodes[FirstSrc.first].Src;
13887 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13888 assert(Op.getValueSizeInBits() == 32);
13889
13890 // Check that we are not just extracting the bytes in order from an op
13891 if (!SecondSrc) {
13892 int Low16 = PermMask & 0xffff;
13893 int Hi16 = (PermMask & 0xffff0000) >> 16;
13894
13895 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13896 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13897
13898 // The perm op would really just produce Op. So combine into Op
13899 if (WellFormedLow && WellFormedHi)
13900 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13901 }
13902
13903 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13904
13905 if (SecondSrc) {
13906 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13907 assert(OtherOp.getValueSizeInBits() == 32);
13908 }
13909
13910 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13911
13912 assert(Op.getValueType().isByteSized() &&
13913 OtherOp.getValueType().isByteSized());
13914
13915 // If the ultimate src is less than 32 bits, then we will only be
13916 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13917 // CalculateByteProvider would not have returned Op as source if we
13918 // used a byte that is outside its ValueType. Thus, we are free to
13919 // ANY_EXTEND as the extended bits are dont-cares.
13920 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13921 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13922
13923 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13924 DAG.getConstant(PermMask, DL, MVT::i32));
13925 }
13926 return SDValue();
13927}
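// For example, if all four bytes are provided in order by a single 32-bit
// source, the accumulated mask is 0x07060504 and the combine simply returns
// the bitcast source instead of emitting a v_perm.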
13928
13929SDValue SITargetLowering::performOrCombine(SDNode *N,
13930 DAGCombinerInfo &DCI) const {
13931 SelectionDAG &DAG = DCI.DAG;
13932 SDValue LHS = N->getOperand(0);
13933 SDValue RHS = N->getOperand(1);
13934
13935 EVT VT = N->getValueType(0);
13936 if (VT == MVT::i1) {
13937 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13938 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13939 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13940 SDValue Src = LHS.getOperand(0);
13941 if (Src != RHS.getOperand(0))
13942 return SDValue();
13943
13944 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13945 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13946 if (!CLHS || !CRHS)
13947 return SDValue();
13948
13949 // Only 10 bits are used.
13950 static const uint32_t MaxMask = 0x3ff;
13951
13952 uint32_t NewMask =
13953 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13954 SDLoc DL(N);
13955 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13956 DAG.getConstant(NewMask, DL, MVT::i32));
13957 }
13958
13959 return SDValue();
13960 }
13961
13962 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13963 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13964 LHS.getOpcode() == AMDGPUISD::PERM &&
13965 isa<ConstantSDNode>(LHS.getOperand(2))) {
13966 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13967 if (!Sel)
13968 return SDValue();
13969
13970 Sel |= LHS.getConstantOperandVal(2);
13971 SDLoc DL(N);
13972 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13973 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13974 }
13975
13976 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13977 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13978 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13979 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13980
13981 // If all the uses of an or need to extract the individual elements, do not
13982 // attempt to lower into v_perm
13983 auto usesCombinedOperand = [](SDNode *OrUse) {
13984 // If we have any non-vectorized use, then it is a candidate for v_perm
13985 if (OrUse->getOpcode() != ISD::BITCAST ||
13986 !OrUse->getValueType(0).isVector())
13987 return true;
13988
13989 // If we have any non-vectorized use, then it is a candidate for v_perm
13990 for (auto *VUser : OrUse->users()) {
13991 if (!VUser->getValueType(0).isVector())
13992 return true;
13993
13994 // If the use of a vector is a store, then combining via a v_perm
13995 // is beneficial.
13996 // TODO -- whitelist more uses
13997 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13998 if (VUser->getOpcode() == VectorwiseOp)
13999 return true;
14000 }
14001 return false;
14002 };
14003
14004 if (!any_of(N->users(), usesCombinedOperand))
14005 return SDValue();
14006
14007 uint32_t LHSMask = getPermuteMask(LHS);
14008 uint32_t RHSMask = getPermuteMask(RHS);
14009
14010 if (LHSMask != ~0u && RHSMask != ~0u) {
14011 // Canonicalize the expression in an attempt to have fewer unique masks
14012 // and therefore fewer registers used to hold the masks.
14013 if (LHSMask > RHSMask) {
14014 std::swap(LHSMask, RHSMask);
14015 std::swap(LHS, RHS);
14016 }
14017
14018 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14019 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14020 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14021 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14022
14023 // Check if we need to combine values from two sources within a byte.
14024 if (!(LHSUsedLanes & RHSUsedLanes) &&
14025 // If we select high and lower word keep it for SDWA.
14026 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14027 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14028 // Kill zero bytes selected by other mask. Zero value is 0xc.
14029 LHSMask &= ~RHSUsedLanes;
14030 RHSMask &= ~LHSUsedLanes;
14031 // Add 4 to each active LHS lane
14032 LHSMask |= LHSUsedLanes & 0x04040404;
14033 // Combine masks
14034 uint32_t Sel = LHSMask | RHSMask;
14035 SDLoc DL(N);
14036
14037 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14038 RHS.getOperand(0),
14039 DAG.getConstant(Sel, DL, MVT::i32));
14040 }
14041 }
14042 if (LHSMask == ~0u || RHSMask == ~0u) {
14043 if (SDValue Perm = matchPERM(N, DCI))
14044 return Perm;
14045 }
14046 }
14047
14048 // Detect identity v2i32 OR and replace with identity source node.
14049 // Specifically an Or that has operands constructed from the same source node
14050 // via extract_vector_elt and build_vector. I.E.
14051 // v2i32 or(
14052 // v2i32 build_vector(
14053 // i32 extract_elt(%IdentitySrc, 0),
14054 // i32 0
14055 // ),
14056 // v2i32 build_vector(
14057 // i32 0,
14058 // i32 extract_elt(%IdentitySrc, 1)
14059 // ) )
14060 // =>
14061 // v2i32 %IdentitySrc
14062
14063 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14064 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14065
14066 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14067 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14068
14069 // Test for and normalise build vectors.
14070 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14071
14072 // Get the extract_vector_element operands.
14073 SDValue LEVE = LHS->getOperand(0);
14074 SDValue REVE = RHS->getOperand(1);
14075
14076 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14077 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14078 // Check that different elements from the same vector are
14079 // extracted.
14080 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14081 LEVE->getOperand(1) != REVE->getOperand(1)) {
14082 SDValue IdentitySrc = LEVE.getOperand(0);
14083 return IdentitySrc;
14084 }
14085 }
14086 }
14087 }
14088
14089 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14090 return SDValue();
14091
14092 // TODO: This could be a generic combine with a predicate for extracting the
14093 // high half of an integer being free.
14094
14095 // (or i64:x, (zero_extend i32:y)) ->
14096 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14097 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14098 RHS.getOpcode() != ISD::ZERO_EXTEND)
14099 std::swap(LHS, RHS);
14100
14101 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14102 SDValue ExtSrc = RHS.getOperand(0);
14103 EVT SrcVT = ExtSrc.getValueType();
14104 if (SrcVT == MVT::i32) {
14105 SDLoc SL(N);
14106 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14107 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14108
14109 DCI.AddToWorklist(LowOr.getNode());
14110 DCI.AddToWorklist(HiBits.getNode());
14111
14112 SDValue Vec =
14113 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14114 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14115 }
14116 }
14117
14118 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14119 if (CRHS) {
14120 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14121 N->getOperand(0), CRHS))
14122 return Split;
14123 }
14124
14125 return SDValue();
14126}
14127
14128SDValue SITargetLowering::performXorCombine(SDNode *N,
14129 DAGCombinerInfo &DCI) const {
14130 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14131 return RV;
14132
14133 SDValue LHS = N->getOperand(0);
14134 SDValue RHS = N->getOperand(1);
14135
14136 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14137 SelectionDAG &DAG = DCI.DAG;
14138
14139 EVT VT = N->getValueType(0);
14140 if (CRHS && VT == MVT::i64) {
14141 if (SDValue Split =
14142 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14143 return Split;
14144 }
14145
14146 // v2i32 (xor (vselect cc, x, y), K) ->
14147 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14148 // replaced with source modifiers when the select is lowered to CNDMASK.
14149 unsigned Opc = LHS.getOpcode();
14150 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14151 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14152 CRHS && CRHS->getAPIntValue().isSignMask()) {
14153 SDValue CC = LHS->getOperand(0);
14154 SDValue TRUE = LHS->getOperand(1);
14155 SDValue FALSE = LHS->getOperand(2);
14156 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14157 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14158 SDValue XSelect =
14159 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14160 return XSelect;
14161 }
14162
14163 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14164 // fneg-like xors into 64-bit select.
14165 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14166 // This looks like an fneg, try to fold as a source modifier.
14167 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14168 shouldFoldFNegIntoSrc(N, LHS)) {
14169 // xor (select c, a, b), 0x80000000 ->
14170 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14171 SDLoc DL(N);
14172 SDValue CastLHS =
14173 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14174 SDValue CastRHS =
14175 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14176 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14177 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14178 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14179 LHS->getOperand(0), FNegLHS, FNegRHS);
14180 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14181 }
14182 }
14183
14184 return SDValue();
14185}
14186
14187SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14188 DAGCombinerInfo &DCI) const {
14189 if (!Subtarget->has16BitInsts() ||
14190 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14191 return SDValue();
14192
14193 EVT VT = N->getValueType(0);
14194 if (VT != MVT::i32)
14195 return SDValue();
14196
14197 SDValue Src = N->getOperand(0);
14198 if (Src.getValueType() != MVT::i16)
14199 return SDValue();
14200
14201 return SDValue();
14202}
14203
14204SDValue
14205SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14206 DAGCombinerInfo &DCI) const {
14207 SDValue Src = N->getOperand(0);
14208 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14209
14210 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14211 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14212 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14213 VTSign->getVT() == MVT::i8) ||
14214 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14215 VTSign->getVT() == MVT::i16))) {
14216 assert(Subtarget->hasScalarSubwordLoads() &&
14217 "s_buffer_load_{u8, i8} are supported "
14218 "in GFX12 (or newer) architectures.");
14219 EVT VT = Src.getValueType();
14220 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14221 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14222 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14223 SDLoc DL(N);
14224 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14225 SDValue Ops[] = {
14226 Src.getOperand(0), // source register
14227 Src.getOperand(1), // offset
14228 Src.getOperand(2) // cachePolicy
14229 };
14230 auto *M = cast<MemSDNode>(Src);
14231 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14232 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14233 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14234 return LoadVal;
14235 }
14236 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14237 VTSign->getVT() == MVT::i8) ||
14238 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14239 VTSign->getVT() == MVT::i16)) &&
14240 Src.hasOneUse()) {
14241 auto *M = cast<MemSDNode>(Src);
14242 SDValue Ops[] = {Src.getOperand(0), // Chain
14243 Src.getOperand(1), // rsrc
14244 Src.getOperand(2), // vindex
14245 Src.getOperand(3), // voffset
14246 Src.getOperand(4), // soffset
14247 Src.getOperand(5), // offset
14248 Src.getOperand(6), Src.getOperand(7)};
14249 // replace with BUFFER_LOAD_BYTE/SHORT
14250 SDVTList ResList =
14251 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14252 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14253 ? AMDGPUISD::BUFFER_LOAD_BYTE
14254 : AMDGPUISD::BUFFER_LOAD_SHORT;
14255 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14256 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14257 return DCI.DAG.getMergeValues(
14258 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14259 }
14260 return SDValue();
14261}
14262
14263SDValue SITargetLowering::performClassCombine(SDNode *N,
14264 DAGCombinerInfo &DCI) const {
14265 SelectionDAG &DAG = DCI.DAG;
14266 SDValue Mask = N->getOperand(1);
14267
14268 // fp_class x, 0 -> false
14269 if (isNullConstant(Mask))
14270 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14271
14272 if (N->getOperand(0).isUndef())
14273 return DAG.getUNDEF(MVT::i1);
14274
14275 return SDValue();
14276}
14277
14278SDValue SITargetLowering::performRcpCombine(SDNode *N,
14279 DAGCombinerInfo &DCI) const {
14280 EVT VT = N->getValueType(0);
14281 SDValue N0 = N->getOperand(0);
14282
14283 if (N0.isUndef()) {
14284 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14285 SDLoc(N), VT);
14286 }
14287
14288 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14289 N0.getOpcode() == ISD::SINT_TO_FP)) {
14290 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14291 N->getFlags());
14292 }
14293
14294 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14295 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14296 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14297 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14298 N->getFlags());
14299 }
14300
14301 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14302}
14303
14304 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14305 unsigned MaxDepth) const {
14306 unsigned Opcode = Op.getOpcode();
14307 if (Opcode == ISD::FCANONICALIZE)
14308 return true;
14309
14310 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14311 const auto &F = CFP->getValueAPF();
14312 if (F.isNaN() && F.isSignaling())
14313 return false;
14314 if (!F.isDenormal())
14315 return true;
14316
14317 DenormalMode Mode =
14318 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14319 return Mode == DenormalMode::getIEEE();
14320 }
14321
14322 // If source is a result of another standard FP operation it is already in
14323 // canonical form.
14324 if (MaxDepth == 0)
14325 return false;
14326
14327 switch (Opcode) {
14328 // These will flush denorms if required.
14329 case ISD::FADD:
14330 case ISD::FSUB:
14331 case ISD::FMUL:
14332 case ISD::FCEIL:
14333 case ISD::FFLOOR:
14334 case ISD::FMA:
14335 case ISD::FMAD:
14336 case ISD::FSQRT:
14337 case ISD::FDIV:
14338 case ISD::FREM:
14339 case ISD::FP_ROUND:
14340 case ISD::FP_EXTEND:
14341 case ISD::FP16_TO_FP:
14342 case ISD::FP_TO_FP16:
14343 case ISD::BF16_TO_FP:
14344 case ISD::FP_TO_BF16:
14345 case ISD::FLDEXP:
14346 case AMDGPUISD::FMUL_LEGACY:
14347 case AMDGPUISD::FMAD_FTZ:
14348 case AMDGPUISD::RCP:
14349 case AMDGPUISD::RSQ:
14350 case AMDGPUISD::RSQ_CLAMP:
14351 case AMDGPUISD::RCP_LEGACY:
14352 case AMDGPUISD::RCP_IFLAG:
14353 case AMDGPUISD::LOG:
14354 case AMDGPUISD::EXP:
14355 case AMDGPUISD::DIV_SCALE:
14356 case AMDGPUISD::DIV_FMAS:
14357 case AMDGPUISD::DIV_FIXUP:
14358 case AMDGPUISD::FRACT:
14359 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14360 case AMDGPUISD::CVT_F32_UBYTE0:
14361 case AMDGPUISD::CVT_F32_UBYTE1:
14362 case AMDGPUISD::CVT_F32_UBYTE2:
14363 case AMDGPUISD::CVT_F32_UBYTE3:
14364 case AMDGPUISD::FP_TO_FP16:
14365 case AMDGPUISD::SIN_HW:
14366 case AMDGPUISD::COS_HW:
14367 return true;
14368
14369 // It can/will be lowered or combined as a bit operation.
14370 // Need to check their input recursively to handle.
14371 case ISD::FNEG:
14372 case ISD::FABS:
14373 case ISD::FCOPYSIGN:
14374 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14375
14376 case ISD::AND:
14377 if (Op.getValueType() == MVT::i32) {
14378 // Be careful as we only know it is a bitcast floating point type. It
14379 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14380 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14381 // is valid to optimize for all types.
14382 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14383 if (RHS->getZExtValue() == 0xffff0000) {
14384 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14385 }
14386 }
14387 }
14388 break;
14389
14390 case ISD::FSIN:
14391 case ISD::FCOS:
14392 case ISD::FSINCOS:
14393 return Op.getValueType().getScalarType() != MVT::f16;
14394
14395 case ISD::FMINNUM:
14396 case ISD::FMAXNUM:
14397 case ISD::FMINNUM_IEEE:
14398 case ISD::FMAXNUM_IEEE:
14399 case ISD::FMINIMUM:
14400 case ISD::FMAXIMUM:
14401 case ISD::FMINIMUMNUM:
14402 case ISD::FMAXIMUMNUM:
14403 case AMDGPUISD::CLAMP:
14404 case AMDGPUISD::FMED3:
14405 case AMDGPUISD::FMAX3:
14406 case AMDGPUISD::FMIN3:
14407 case AMDGPUISD::FMAXIMUM3:
14408 case AMDGPUISD::FMINIMUM3: {
14409 // FIXME: Shouldn't treat the generic operations differently based on these.
14410 // However, we aren't really required to flush the result from
14411 // minnum/maxnum.
14412
14413 // snans will be quieted, so we only need to worry about denormals.
14414 if (Subtarget->supportsMinMaxDenormModes() ||
14415 // FIXME: denormalsEnabledForType is broken for dynamic
14416 denormalsEnabledForType(DAG, Op.getValueType()))
14417 return true;
14418
14419 // Flushing may be required.
14420 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14421 // targets need to check their input recursively.
14422
14423 // FIXME: Does this apply with clamp? It's implemented with max.
14424 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14425 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14426 return false;
14427 }
14428
14429 return true;
14430 }
14431 case ISD::SELECT: {
14432 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14433 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14434 }
14435 case ISD::BUILD_VECTOR: {
14436 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14437 SDValue SrcOp = Op.getOperand(i);
14438 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14439 return false;
14440 }
14441
14442 return true;
14443 }
14446 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14447 }
14449 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14450 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14451 }
14452 case ISD::UNDEF:
14453 // Could be anything.
14454 return false;
14455
14456 case ISD::BITCAST:
14457 // TODO: This is incorrect as it loses track of the operand's type. We may
14458 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14459 // same bits that are canonicalized in one type need not be in the other.
14460 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14461 case ISD::TRUNCATE: {
14462 // Hack around the mess we make when legalizing extract_vector_elt
14463 if (Op.getValueType() == MVT::i16) {
14464 SDValue TruncSrc = Op.getOperand(0);
14465 if (TruncSrc.getValueType() == MVT::i32 &&
14466 TruncSrc.getOpcode() == ISD::BITCAST &&
14467 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14468 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14469 }
14470 }
14471 return false;
14472 }
14473 case ISD::INTRINSIC_WO_CHAIN: {
14474 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14475 // TODO: Handle more intrinsics
14476 switch (IntrinsicID) {
14477 case Intrinsic::amdgcn_cvt_pkrtz:
14478 case Intrinsic::amdgcn_cubeid:
14479 case Intrinsic::amdgcn_frexp_mant:
14480 case Intrinsic::amdgcn_fdot2:
14481 case Intrinsic::amdgcn_rcp:
14482 case Intrinsic::amdgcn_rsq:
14483 case Intrinsic::amdgcn_rsq_clamp:
14484 case Intrinsic::amdgcn_rcp_legacy:
14485 case Intrinsic::amdgcn_rsq_legacy:
14486 case Intrinsic::amdgcn_trig_preop:
14487 case Intrinsic::amdgcn_tanh:
14488 case Intrinsic::amdgcn_log:
14489 case Intrinsic::amdgcn_exp2:
14490 case Intrinsic::amdgcn_sqrt:
14491 return true;
14492 default:
14493 break;
14494 }
14495
14496 break;
14497 }
14498 default:
14499 break;
14500 }
14501
14502 // FIXME: denormalsEnabledForType is broken for dynamic
14503 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14504 DAG.isKnownNeverSNaN(Op);
14505}
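// Example: a value produced by ISD::FADD is treated as canonical here, since
// the hardware add already flushes denormals as required and quiets signaling
// NaNs, so a following fcanonicalize of it can be folded away. The i32 AND
// with 0xffff0000 case corresponds to the f32 to bf16 truncation idiom: it
// only keeps the high half of the value, so it is canonical iff its input is.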
14506
14507 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14508 unsigned MaxDepth) const {
14509 const MachineRegisterInfo &MRI = MF.getRegInfo();
14510 MachineInstr *MI = MRI.getVRegDef(Reg);
14511 unsigned Opcode = MI->getOpcode();
14512
14513 if (Opcode == AMDGPU::G_FCANONICALIZE)
14514 return true;
14515
14516 std::optional<FPValueAndVReg> FCR;
14517 // Constant splat (can be padded with undef) or scalar constant.
14518 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14519 if (FCR->Value.isSignaling())
14520 return false;
14521 if (!FCR->Value.isDenormal())
14522 return true;
14523
14524 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14525 return Mode == DenormalMode::getIEEE();
14526 }
14527
14528 if (MaxDepth == 0)
14529 return false;
14530
14531 switch (Opcode) {
14532 case AMDGPU::G_FADD:
14533 case AMDGPU::G_FSUB:
14534 case AMDGPU::G_FMUL:
14535 case AMDGPU::G_FCEIL:
14536 case AMDGPU::G_FFLOOR:
14537 case AMDGPU::G_FRINT:
14538 case AMDGPU::G_FNEARBYINT:
14539 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14540 case AMDGPU::G_INTRINSIC_TRUNC:
14541 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14542 case AMDGPU::G_FMA:
14543 case AMDGPU::G_FMAD:
14544 case AMDGPU::G_FSQRT:
14545 case AMDGPU::G_FDIV:
14546 case AMDGPU::G_FREM:
14547 case AMDGPU::G_FPOW:
14548 case AMDGPU::G_FPEXT:
14549 case AMDGPU::G_FLOG:
14550 case AMDGPU::G_FLOG2:
14551 case AMDGPU::G_FLOG10:
14552 case AMDGPU::G_FPTRUNC:
14553 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14554 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14555 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14556 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14557 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14558 return true;
14559 case AMDGPU::G_FNEG:
14560 case AMDGPU::G_FABS:
14561 case AMDGPU::G_FCOPYSIGN:
14562 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14563 case AMDGPU::G_FMINNUM:
14564 case AMDGPU::G_FMAXNUM:
14565 case AMDGPU::G_FMINNUM_IEEE:
14566 case AMDGPU::G_FMAXNUM_IEEE:
14567 case AMDGPU::G_FMINIMUM:
14568 case AMDGPU::G_FMAXIMUM:
14569 case AMDGPU::G_FMINIMUMNUM:
14570 case AMDGPU::G_FMAXIMUMNUM: {
14571 if (Subtarget->supportsMinMaxDenormModes() ||
14572 // FIXME: denormalsEnabledForType is broken for dynamic
14573 denormalsEnabledForType(MRI.getType(Reg), MF))
14574 return true;
14575
14576 [[fallthrough]];
14577 }
14578 case AMDGPU::G_BUILD_VECTOR:
14579 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14580 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14581 return false;
14582 return true;
14583 case AMDGPU::G_INTRINSIC:
14584 case AMDGPU::G_INTRINSIC_CONVERGENT:
14585 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14586 case Intrinsic::amdgcn_fmul_legacy:
14587 case Intrinsic::amdgcn_fmad_ftz:
14588 case Intrinsic::amdgcn_sqrt:
14589 case Intrinsic::amdgcn_fmed3:
14590 case Intrinsic::amdgcn_sin:
14591 case Intrinsic::amdgcn_cos:
14592 case Intrinsic::amdgcn_log:
14593 case Intrinsic::amdgcn_exp2:
14594 case Intrinsic::amdgcn_log_clamp:
14595 case Intrinsic::amdgcn_rcp:
14596 case Intrinsic::amdgcn_rcp_legacy:
14597 case Intrinsic::amdgcn_rsq:
14598 case Intrinsic::amdgcn_rsq_clamp:
14599 case Intrinsic::amdgcn_rsq_legacy:
14600 case Intrinsic::amdgcn_div_scale:
14601 case Intrinsic::amdgcn_div_fmas:
14602 case Intrinsic::amdgcn_div_fixup:
14603 case Intrinsic::amdgcn_fract:
14604 case Intrinsic::amdgcn_cvt_pkrtz:
14605 case Intrinsic::amdgcn_cubeid:
14606 case Intrinsic::amdgcn_cubema:
14607 case Intrinsic::amdgcn_cubesc:
14608 case Intrinsic::amdgcn_cubetc:
14609 case Intrinsic::amdgcn_frexp_mant:
14610 case Intrinsic::amdgcn_fdot2:
14611 case Intrinsic::amdgcn_trig_preop:
14612 case Intrinsic::amdgcn_tanh:
14613 return true;
14614 default:
14615 break;
14616 }
14617
14618 [[fallthrough]];
14619 default:
14620 return false;
14621 }
14622
14623 llvm_unreachable("invalid operation");
14624}
14625
14626// Constant fold canonicalize.
14627SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14628 const SDLoc &SL, EVT VT,
14629 const APFloat &C) const {
14630 // Flush denormals to 0 if not enabled.
14631 if (C.isDenormal()) {
14632 DenormalMode Mode =
14633 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14634 if (Mode == DenormalMode::getPreserveSign()) {
14635 return DAG.getConstantFP(
14636 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14637 }
14638
14639 if (Mode != DenormalMode::getIEEE())
14640 return SDValue();
14641 }
14642
14643 if (C.isNaN()) {
14644 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14645 if (C.isSignaling()) {
14646 // Quiet a signaling NaN.
14647 // FIXME: Is this supposed to preserve payload bits?
14648 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14649 }
14650
14651 // Make sure it is the canonical NaN bitpattern.
14652 //
14653 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14654 // immediate?
14655 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14656 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14657 }
14658
14659 // Already canonical.
14660 return DAG.getConstantFP(C, SL, VT);
14661}
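// Worked example: with f32 denormals set to preserve-sign, canonicalizing the
// constant with bit pattern 0x00000001 (the smallest positive denormal) yields
// +0.0, and canonicalizing a signaling NaN yields the default quiet NaN
// (0x7fc00000 for f32); constants that are already canonical are returned
// unchanged.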
14662
14664 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14665}
14666
14667SDValue
14668SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14669 DAGCombinerInfo &DCI) const {
14670 SelectionDAG &DAG = DCI.DAG;
14671 SDValue N0 = N->getOperand(0);
14672 EVT VT = N->getValueType(0);
14673
14674 // fcanonicalize undef -> qnan
14675 if (N0.isUndef()) {
14676 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14677 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14678 }
14679
14680 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14681 EVT VT = N->getValueType(0);
14682 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14683 }
14684
14685 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14686 // (fcanonicalize k)
14687 //
14688 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14689
14690 // TODO: This could be better with wider vectors that will be split to v2f16,
14691 // and to consider uses since there aren't that many packed operations.
14692 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14693 isTypeLegal(MVT::v2f16)) {
14694 SDLoc SL(N);
14695 SDValue NewElts[2];
14696 SDValue Lo = N0.getOperand(0);
14697 SDValue Hi = N0.getOperand(1);
14698 EVT EltVT = Lo.getValueType();
14699
14701 for (unsigned I = 0; I != 2; ++I) {
14702 SDValue Op = N0.getOperand(I);
14703 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14704 NewElts[I] =
14705 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14706 } else if (Op.isUndef()) {
14707 // Handled below based on what the other operand is.
14708 NewElts[I] = Op;
14709 } else {
14710 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14711 }
14712 }
14713
14714 // If one half is undef, and one is constant, prefer a splat vector rather
14715 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14716 // cheaper to use and may be free with a packed operation.
14717 if (NewElts[0].isUndef()) {
14718 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14720 ? NewElts[1]
14721 : DAG.getConstantFP(0.0f, SL, EltVT);
14722 }
14723
14724 if (NewElts[1].isUndef()) {
14725 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14726 ? NewElts[0]
14727 : DAG.getConstantFP(0.0f, SL, EltVT);
14728 }
14729
14730 return DAG.getBuildVector(VT, SL, NewElts);
14731 }
14732 }
14733
14734 return SDValue();
14735}
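// Examples of the v2f16 build_vector handling above:
//   fcanonicalize (build_vector x:f16, undef)   -> build_vector (fcanonicalize x), 0.0
//   fcanonicalize (build_vector 2.0:f16, undef) -> build_vector 2.0, 2.0
// i.e. an undef lane is replaced by a splat of a constant neighbour, or by 0.0
// when the other lane is a register, rather than by the default qNaN.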
14736
14737static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14738 switch (Opc) {
14739 case ISD::FMAXNUM:
14740 case ISD::FMAXNUM_IEEE:
14741 case ISD::FMAXIMUMNUM:
14742 return AMDGPUISD::FMAX3;
14743 case ISD::FMAXIMUM:
14744 return AMDGPUISD::FMAXIMUM3;
14745 case ISD::SMAX:
14746 return AMDGPUISD::SMAX3;
14747 case ISD::UMAX:
14748 return AMDGPUISD::UMAX3;
14749 case ISD::FMINNUM:
14750 case ISD::FMINNUM_IEEE:
14751 case ISD::FMINIMUMNUM:
14752 return AMDGPUISD::FMIN3;
14753 case ISD::FMINIMUM:
14754 return AMDGPUISD::FMINIMUM3;
14755 case ISD::SMIN:
14756 return AMDGPUISD::SMIN3;
14757 case ISD::UMIN:
14758 return AMDGPUISD::UMIN3;
14759 default:
14760 llvm_unreachable("Not a min/max opcode");
14761 }
14762}
14763
14764SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14765 const SDLoc &SL, SDValue Src,
14766 SDValue MinVal,
14767 SDValue MaxVal,
14768 bool Signed) const {
14769
14770 // med3 comes from
14771 // min(max(x, K0), K1), K0 < K1
14772 // max(min(x, K0), K1), K1 < K0
14773 //
14774 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14775 // min/max op.
14776 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14777 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14778
14779 if (!MinK || !MaxK)
14780 return SDValue();
14781
14782 if (Signed) {
14783 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14784 return SDValue();
14785 } else {
14786 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14787 return SDValue();
14788 }
14789
14790 EVT VT = MinK->getValueType(0);
14791 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14792 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14793 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14794
14795 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14796 // not available, but this is unlikely to be profitable as constants
14797 // will often need to be materialized & extended, especially on
14798 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14799 return SDValue();
14800}
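// Worked example of the integer med3 folds driven from performMinMaxCombine:
//   smin (smax x, -3), 7  -> smed3 x, -3, 7   (signed, requires -3 < 7)
//   umin (umax x, 4), 16  -> umed3 x, 4, 16   (unsigned, requires 4 < 16)
// The K0 < K1 checks above reject the degenerate case where the clamp range
// collapses to a constant.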
14801
14802 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14803 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14804 return C;
14805
14806 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14807 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14808 return C;
14809 }
14810
14811 return nullptr;
14812}
14813
14814SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14815 const SDLoc &SL, SDValue Op0,
14816 SDValue Op1) const {
14817 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14818 if (!K1)
14819 return SDValue();
14820
14821 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14822 if (!K0)
14823 return SDValue();
14824
14825 // Ordered >= (although NaN inputs should have folded away by now).
14826 if (K0->getValueAPF() > K1->getValueAPF())
14827 return SDValue();
14828
14829 // med3 with a nan input acts like
14830 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14831 //
14832 // So the result depends on whether the IEEE mode bit is enabled or not with a
14833 // signaling nan input.
14834 // ieee=1
14835 // s0 snan: yields s2
14836 // s1 snan: yields s2
14837 // s2 snan: qnan
14838
14839 // s0 qnan: min(s1, s2)
14840 // s1 qnan: min(s0, s2)
14841 // s2 qnan: min(s0, s1)
14842
14843 // ieee=0
14844 // s0 snan: min(s1, s2)
14845 // s1 snan: min(s0, s2)
14846 // s2 snan: qnan
14847
14848 // s0 qnan: min(s1, s2)
14849 // s1 qnan: min(s0, s2)
14850 // s2 qnan: min(s0, s1)
14851 const MachineFunction &MF = DAG.getMachineFunction();
14852 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14853
14854 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
14855 // regardless of whether the input is a signaling nan if op0 is fmaximum or
14856 // fmaximumnum. We can only form it from fmaxnum_ieee if IEEE=1.
14857 EVT VT = Op0.getValueType();
14858 if (Info->getMode().DX10Clamp) {
14859 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14860 // hardware fmed3 behavior converting to a min.
14861 // FIXME: Should this be allowing -0.0?
14862 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14863 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14864 }
14865
14866 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14867 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14868 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14869 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14870 // then give the other result, which is different from med3 with a NaN
14871 // input.
14872 SDValue Var = Op0.getOperand(0);
14873 if (!DAG.isKnownNeverSNaN(Var))
14874 return SDValue();
14875
14876 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14877
14878 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14879 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14880 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14881 SDValue(K0, 0), SDValue(K1, 0));
14882 }
14883 }
14884
14885 return SDValue();
14886}
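// Example: with dx10_clamp enabled,
//   fminnum (fmaxnum x, 0.0), 1.0 -> AMDGPUISD::CLAMP x
// Otherwise, for f32 (or f16 with v_med3_f16), when x is known not to be a
// signaling NaN and each bound is an inline constant or has other uses:
//   fminnum (fmaxnum x, K0), K1   -> AMDGPUISD::FMED3 x, K0, K1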
14887
14888/// \return true if the subtarget supports minimum3 and maximum3 with the given
14889/// base min/max opcode \p Opc for type \p VT.
14890static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14891 EVT VT) {
14892 switch (Opc) {
14893 case ISD::FMINNUM:
14894 case ISD::FMAXNUM:
14895 case ISD::FMINNUM_IEEE:
14896 case ISD::FMAXNUM_IEEE:
14897 case ISD::FMINIMUMNUM:
14898 case ISD::FMAXIMUMNUM:
14901 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14902 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14903 case ISD::FMINIMUM:
14904 case ISD::FMAXIMUM:
14905 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14906 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14907 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14908 case ISD::SMAX:
14909 case ISD::SMIN:
14910 case ISD::UMAX:
14911 case ISD::UMIN:
14912 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14913 default:
14914 return false;
14915 }
14916
14917 llvm_unreachable("not a min/max opcode");
14918}
14919
14920SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14921 DAGCombinerInfo &DCI) const {
14922 SelectionDAG &DAG = DCI.DAG;
14923
14924 EVT VT = N->getValueType(0);
14925 unsigned Opc = N->getOpcode();
14926 SDValue Op0 = N->getOperand(0);
14927 SDValue Op1 = N->getOperand(1);
14928
14929 // Only do this if the inner op has one use since this will just increase
14930 // register pressure for no benefit.
14931
14932 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14933 // max(max(a, b), c) -> max3(a, b, c)
14934 // min(min(a, b), c) -> min3(a, b, c)
14935 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14936 SDLoc DL(N);
14937 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14938 Op0.getOperand(0), Op0.getOperand(1), Op1);
14939 }
14940
14941 // Try commuted.
14942 // max(a, max(b, c)) -> max3(a, b, c)
14943 // min(a, min(b, c)) -> min3(a, b, c)
14944 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14945 SDLoc DL(N);
14946 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14947 Op0, Op1.getOperand(0), Op1.getOperand(1));
14948 }
14949 }
14950
14951 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14952 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14953 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14954 if (SDValue Med3 = performIntMed3ImmCombine(
14955 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14956 return Med3;
14957 }
14958 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14959 if (SDValue Med3 = performIntMed3ImmCombine(
14960 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14961 return Med3;
14962 }
14963
14964 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14965 if (SDValue Med3 = performIntMed3ImmCombine(
14966 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14967 return Med3;
14968 }
14969 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14970 if (SDValue Med3 = performIntMed3ImmCombine(
14971 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14972 return Med3;
14973 }
14974
14975 // if !is_snan(x):
14976 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14977 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14978 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14979 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14980 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14981 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14982 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14983 (Opc == AMDGPUISD::FMIN_LEGACY &&
14984 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14985 (VT == MVT::f32 || VT == MVT::f64 ||
14986 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14987 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14988 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14989 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14990 Op0.hasOneUse()) {
14991 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14992 return Res;
14993 }
14994
14995 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14996 // for some types, but at a higher cost since it's implemented with a 3
14997 // operand form.
14998 const SDNodeFlags Flags = N->getFlags();
14999 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15000 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15001 unsigned NewOpc =
15002 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15003 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15004 }
15005
15006 return SDValue();
15007}
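// Examples for the combines above, assuming the subtarget supports the
// three-operand form for the type:
//   max (max a, b), c -> AMDGPUISD::FMAX3 a, b, c
//   min a, (min b, c) -> AMDGPUISD::FMIN3 a, b, c
//   smin (smax x, K0), K1 with K0 < K1 -> AMDGPUISD::SMED3 x, K0, K1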
15008
15009 static bool isClampZeroToOne(SDValue A, SDValue B) {
15010 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15011 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15012 // FIXME: Should this be allowing -0.0?
15013 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15014 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15015 }
15016 }
15017
15018 return false;
15019}
15020
15021// FIXME: Should only worry about snans for version with chain.
15022SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15023 DAGCombinerInfo &DCI) const {
15024 EVT VT = N->getValueType(0);
15025 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15026 // NaNs. With a NaN input, the order of the operands may change the result.
15027
15028 SelectionDAG &DAG = DCI.DAG;
15029 SDLoc SL(N);
15030
15031 SDValue Src0 = N->getOperand(0);
15032 SDValue Src1 = N->getOperand(1);
15033 SDValue Src2 = N->getOperand(2);
15034
15035 if (isClampZeroToOne(Src0, Src1)) {
15036 // const_a, const_b, x -> clamp is safe in all cases including signaling
15037 // nans.
15038 // FIXME: Should this be allowing -0.0?
15039 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15040 }
15041
15042 const MachineFunction &MF = DAG.getMachineFunction();
15043 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15044
15045 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15046 // handling no dx10-clamp?
15047 if (Info->getMode().DX10Clamp) {
15048 // If NaNs are clamped to 0, we are free to reorder the inputs.
15049
15050 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15051 std::swap(Src0, Src1);
15052
15053 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15054 std::swap(Src1, Src2);
15055
15056 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15057 std::swap(Src0, Src1);
15058
15059 if (isClampZeroToOne(Src1, Src2))
15060 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15061 }
15062
15063 return SDValue();
15064}
15065
15066SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15067 DAGCombinerInfo &DCI) const {
15068 SDValue Src0 = N->getOperand(0);
15069 SDValue Src1 = N->getOperand(1);
15070 if (Src0.isUndef() && Src1.isUndef())
15071 return DCI.DAG.getUNDEF(N->getValueType(0));
15072 return SDValue();
15073}
15074
15075// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15076 // expanded into a set of cmp/select instructions.
15077 static bool shouldExpandVectorDynExt(unsigned EltSize,
15078 unsigned NumElem,
15079 bool IsDivergentIdx,
15080 const GCNSubtarget *Subtarget) {
15081 if (UseDivergentRegisterIndexing)
15082 return false;
15083
15084 unsigned VecSize = EltSize * NumElem;
15085
15086 // Sub-dword vectors of 2 dwords or less have a better implementation.
15087 if (VecSize <= 64 && EltSize < 32)
15088 return false;
15089
15090 // Always expand the remaining sub-dword operations; otherwise they will be
15091 // lowered via memory.
15092 if (EltSize < 32)
15093 return true;
15094
15095 // Always do this if var-idx is divergent, otherwise it will become a loop.
15096 if (IsDivergentIdx)
15097 return true;
15098
15099 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15100 unsigned NumInsts = NumElem /* Number of compares */ +
15101 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15102
15103 // On some architectures (GFX9) movrel is not available and it's better
15104 // to expand.
15105 if (Subtarget->useVGPRIndexMode())
15106 return NumInsts <= 16;
15107
15108 // If movrel is available, use it instead of expanding for vector of 8
15109 // elements.
15110 if (Subtarget->hasMovrel())
15111 return NumInsts <= 15;
15112
15113 return true;
15114}
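// Example of the cost model above: a v8i32 access with a divergent index is
// always expanded (8 compares plus 8 v_cndmask_b32). With a uniform index the
// same vector gives NumInsts == 16, which is still expanded in VGPR-index
// mode but kept as an indirect (movrel) access otherwise, since 16 > 15.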
15115
15116 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15117 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15118 if (isa<ConstantSDNode>(Idx))
15119 return false;
15120
15121 SDValue Vec = N->getOperand(0);
15122 EVT VecVT = Vec.getValueType();
15123 EVT EltVT = VecVT.getVectorElementType();
15124 unsigned EltSize = EltVT.getSizeInBits();
15125 unsigned NumElem = VecVT.getVectorNumElements();
15126
15127 return ::shouldExpandVectorDynExt(
15128 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15129}
15130
15131SDValue
15132SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15133 DAGCombinerInfo &DCI) const {
15134 SDValue Vec = N->getOperand(0);
15135 SelectionDAG &DAG = DCI.DAG;
15136
15137 EVT VecVT = Vec.getValueType();
15138 EVT VecEltVT = VecVT.getVectorElementType();
15139 EVT ResVT = N->getValueType(0);
15140
15141 unsigned VecSize = VecVT.getSizeInBits();
15142 unsigned VecEltSize = VecEltVT.getSizeInBits();
15143
15144 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15146 SDLoc SL(N);
15147 SDValue Idx = N->getOperand(1);
15148 SDValue Elt =
15149 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15150 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15151 }
15152
15153 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15154 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15155 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15156 // depending on the shift operand. See e.g. performSraCombine().
15157 // This combine ensures that the optimisation is compatible with v2i32
15158 // legalised AND.
15159 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15160 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15161
15163 if (!C || C->getZExtValue() != 0x1f)
15164 return SDValue();
15165
15166 SDLoc SL(N);
15167 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15168 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15169 Vec->getOperand(0), N->getOperand(1));
15170 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15171 DAG.ReplaceAllUsesWith(N, A.getNode());
15172 }
15173
15174 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15175 // =>
15176 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15177 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15178 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15179 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15180 SDLoc SL(N);
15181 SDValue Idx = N->getOperand(1);
15182 unsigned Opc = Vec.getOpcode();
15183
15184 switch (Opc) {
15185 default:
15186 break;
15187 // TODO: Support other binary operations.
15188 case ISD::FADD:
15189 case ISD::FSUB:
15190 case ISD::FMUL:
15191 case ISD::ADD:
15192 case ISD::UMIN:
15193 case ISD::UMAX:
15194 case ISD::SMIN:
15195 case ISD::SMAX:
15196 case ISD::FMAXNUM:
15197 case ISD::FMINNUM:
15198 case ISD::FMAXNUM_IEEE:
15199 case ISD::FMINNUM_IEEE:
15200 case ISD::FMAXIMUM:
15201 case ISD::FMINIMUM: {
15202 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15203 Vec.getOperand(0), Idx);
15204 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15205 Vec.getOperand(1), Idx);
15206
15207 DCI.AddToWorklist(Elt0.getNode());
15208 DCI.AddToWorklist(Elt1.getNode());
15209 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15210 }
15211 }
15212 }
15213
15214 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15216 SDLoc SL(N);
15217 SDValue Idx = N->getOperand(1);
15218 SDValue V;
15219 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15220 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15221 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15222 if (I == 0)
15223 V = Elt;
15224 else
15225 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15226 }
15227 return V;
15228 }
15229
15230 if (!DCI.isBeforeLegalize())
15231 return SDValue();
15232
15233 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15234 // elements. This exposes more load reduction opportunities by replacing
15235 // multiple small extract_vector_elements with a single 32-bit extract.
15236 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15237 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15238 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15239 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15240
15241 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15242 unsigned EltIdx = BitIndex / 32;
15243 unsigned LeftoverBitIdx = BitIndex % 32;
15244 SDLoc SL(N);
15245
15246 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15247 DCI.AddToWorklist(Cast.getNode());
15248
15249 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15250 DAG.getConstant(EltIdx, SL, MVT::i32));
15251 DCI.AddToWorklist(Elt.getNode());
15252 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15253 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15254 DCI.AddToWorklist(Srl.getNode());
15255
15256 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15257 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15258 DCI.AddToWorklist(Trunc.getNode());
15259
15260 if (VecEltVT == ResVT) {
15261 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15262 }
15263
15264 assert(ResVT.isScalarInteger());
15265 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15266 }
15267
15268 return SDValue();
15269}
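// Example of the sub-dword rewrite above: extracting element 5 of a loaded
// v8i8 becomes: bitcast the vector to v2i32, extract dword 1, shift right by
// 8 bits, and truncate to i8. Several small extracts of the same dword then
// collapse into a single 32-bit extract.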
15270
15271SDValue
15272SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15273 DAGCombinerInfo &DCI) const {
15274 SDValue Vec = N->getOperand(0);
15275 SDValue Idx = N->getOperand(2);
15276 EVT VecVT = Vec.getValueType();
15277 EVT EltVT = VecVT.getVectorElementType();
15278
15279 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15280 // => BUILD_VECTOR n x select (e, const-idx)
15282 return SDValue();
15283
15284 SelectionDAG &DAG = DCI.DAG;
15285 SDLoc SL(N);
15286 SDValue Ins = N->getOperand(1);
15287 EVT IdxVT = Idx.getValueType();
15288
15290 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15291 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15292 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15293 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15294 Ops.push_back(V);
15295 }
15296
15297 return DAG.getBuildVector(VecVT, SL, Ops);
15298}
15299
15300/// Return the source of an fp_extend from f16 to f32, or a converted FP
15301 /// constant.
15302 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15303 if (Src.getOpcode() == ISD::FP_EXTEND &&
15304 Src.getOperand(0).getValueType() == MVT::f16) {
15305 return Src.getOperand(0);
15306 }
15307
15308 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15309 APFloat Val = CFP->getValueAPF();
15310 bool LosesInfo = true;
15311 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15312 if (!LosesInfo)
15313 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15314 }
15315
15316 return SDValue();
15317}
15318
15319SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15320 DAGCombinerInfo &DCI) const {
15321 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15322 "combine only useful on gfx8");
15323
15324 SDValue TruncSrc = N->getOperand(0);
15325 EVT VT = N->getValueType(0);
15326 if (VT != MVT::f16)
15327 return SDValue();
15328
15329 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15330 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15331 return SDValue();
15332
15333 SelectionDAG &DAG = DCI.DAG;
15334 SDLoc SL(N);
15335
15336 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15337 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15338 // casting back.
15339
15340 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15341 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15342 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15343 if (!A)
15344 return SDValue();
15345
15346 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15347 if (!B)
15348 return SDValue();
15349
15350 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15351 if (!C)
15352 return SDValue();
15353
15354 // This changes signaling nan behavior. If an input is a signaling nan, it
15355 // would have been quieted by the fpext originally. We don't care because
15356 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15357 // we would be worse off than just doing the promotion.
15358 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15359 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15360 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15361 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15362}
15363
15364unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15365 const SDNode *N0,
15366 const SDNode *N1) const {
15367 EVT VT = N0->getValueType(0);
15368
15369 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15370 // support denormals ever.
15371 if (((VT == MVT::f32 &&
15372 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15373 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15374 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15375 isOperationLegal(ISD::FMAD, VT))
15376 return ISD::FMAD;
15377
15378 const TargetOptions &Options = DAG.getTarget().Options;
15379 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15380 (N0->getFlags().hasAllowContract() &&
15381 N1->getFlags().hasAllowContract())) &&
15382 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15383 return ISD::FMA;
15384 }
15385
15386 return 0;
15387}
15388
15389// For a reassociatable opcode perform:
15390// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15391SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15392 SelectionDAG &DAG) const {
15393 EVT VT = N->getValueType(0);
15394 if (VT != MVT::i32 && VT != MVT::i64)
15395 return SDValue();
15396
15397 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15398 return SDValue();
15399
15400 unsigned Opc = N->getOpcode();
15401 SDValue Op0 = N->getOperand(0);
15402 SDValue Op1 = N->getOperand(1);
15403
15404 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15405 return SDValue();
15406
15407 if (Op0->isDivergent())
15408 std::swap(Op0, Op1);
15409
15410 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15411 return SDValue();
15412
15413 SDValue Op2 = Op1.getOperand(1);
15414 Op1 = Op1.getOperand(0);
15415 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15416 return SDValue();
15417
15418 if (Op1->isDivergent())
15419 std::swap(Op1, Op2);
15420
15421 SDLoc SL(N);
15422 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15423 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15424}
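// Example: for (add x, (add y, z)) with x and y uniform and z divergent, the
// expression is rebuilt as (add (add x, y), z), so the uniform part can stay
// on the SALU and only the final add needs a VALU instruction.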
15425
15426static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15427 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15428 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15429 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15430 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15431 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15432}
15433
15434// Fold
15435// y = lshr i64 x, 32
15436// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15437// with Const.hi == -1
15438// To
15439 // res = mad_u64_u32 y.lo, Const.lo, x.lo
15440 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15441 SDValue MulLHS, SDValue MulRHS,
15442 SDValue AddRHS) {
15443 if (MulRHS.getOpcode() == ISD::SRL)
15444 std::swap(MulLHS, MulRHS);
15445
15446 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15447 return SDValue();
15448
15449 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15450 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15451 MulLHS.getOperand(0) != AddRHS)
15452 return SDValue();
15453
15454 auto *Const = dyn_cast<ConstantSDNode>(MulRHS);
15455 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15456 return SDValue();
15457
15458 SDValue ConstMul =
15459 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15460 return getMad64_32(DAG, SL, MVT::i64,
15461 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15462 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15463}
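// Worked example: with Const = 0xffffffff00000005 (Const.hi == -1),
//   add (mul i64 (lshr i64 x, 32), Const), x
// becomes
//   mad_u64_u32 x.hi, 5, zext(x.lo)
// because x.hi * Const + x == x.hi * Const.lo + x.lo (mod 2^64) whenever the
// high 32 bits of Const are all ones.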
15464
15465// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15466// multiplies, if any.
15467//
15468// Full 64-bit multiplies that feed into an addition are lowered here instead
15469// of using the generic expansion. The generic expansion ends up with
15470// a tree of ADD nodes that prevents us from using the "add" part of the
15471// MAD instruction. The expansion produced here results in a chain of ADDs
15472// instead of a tree.
15473SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15474 DAGCombinerInfo &DCI) const {
15475 assert(N->isAnyAdd());
15476
15477 SelectionDAG &DAG = DCI.DAG;
15478 EVT VT = N->getValueType(0);
15479 SDLoc SL(N);
15480 SDValue LHS = N->getOperand(0);
15481 SDValue RHS = N->getOperand(1);
15482
15483 if (VT.isVector())
15484 return SDValue();
15485
15486 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15487 // result in scalar registers for uniform values.
15488 if (!N->isDivergent() && Subtarget->hasSMulHi())
15489 return SDValue();
15490
15491 unsigned NumBits = VT.getScalarSizeInBits();
15492 if (NumBits <= 32 || NumBits > 64)
15493 return SDValue();
15494
15495 if (LHS.getOpcode() != ISD::MUL) {
15496 assert(RHS.getOpcode() == ISD::MUL);
15497 std::swap(LHS, RHS);
15498 }
15499
15500 // Avoid the fold if it would unduly increase the number of multiplies due to
15501 // multiple uses, except on hardware with full-rate multiply-add (which is
15502 // part of full-rate 64-bit ops).
15503 if (!Subtarget->hasFullRate64Ops()) {
15504 unsigned NumUsers = 0;
15505 for (SDNode *User : LHS->users()) {
15506 // There is a use that does not feed into addition, so the multiply can't
15507 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15508 if (!User->isAnyAdd())
15509 return SDValue();
15510
15511 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15512 // MUL + 3xADD + 3xADDC over 3xMAD.
15513 ++NumUsers;
15514 if (NumUsers >= 3)
15515 return SDValue();
15516 }
15517 }
15518
15519 SDValue MulLHS = LHS.getOperand(0);
15520 SDValue MulRHS = LHS.getOperand(1);
15521 SDValue AddRHS = RHS;
15522
15523 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15524 return FoldedMAD;
15525
15526 // Always check whether operands are small unsigned values, since that
15527 // knowledge is useful in more cases. Check for small signed values only if
15528 // doing so can unlock a shorter code sequence.
15529 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15530 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15531
15532 bool MulSignedLo = false;
15533 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15534 MulSignedLo =
15535 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15536 }
15537
15538 // The operands and final result all have the same number of bits. If
15539 // operands need to be extended, they can be extended with garbage. The
15540 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15541 // truncated away in the end.
15542 if (VT != MVT::i64) {
15543 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15544 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15545 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15546 }
15547
15548 // The basic code generated is conceptually straightforward. Pseudo code:
15549 //
15550 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15551 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15552 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15553 //
15554 // The second and third lines are optional, depending on whether the factors
15555 // are {sign,zero}-extended or not.
15556 //
15557 // The actual DAG is noisier than the pseudo code, but only due to
15558 // instructions that disassemble values into low and high parts, and
15559 // assemble the final result.
15560 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15561
15562 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15563 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15564 SDValue Accum =
15565 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15566
15567 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15568 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15569
15570 if (!MulLHSUnsigned32) {
15571 auto MulLHSHi =
15572 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15573 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15574 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15575 }
15576
15577 if (!MulRHSUnsigned32) {
15578 auto MulRHSHi =
15579 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15580 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15581 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15582 }
15583
15584 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15585 Accum = DAG.getBitcast(MVT::i64, Accum);
15586 }
15587
15588 if (VT != MVT::i64)
15589 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15590 return Accum;
15591}
15592
15593SDValue
15594SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15595 DAGCombinerInfo &DCI) const {
15596 SDValue RHS = N->getOperand(1);
15597 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15598 if (!CRHS)
15599 return SDValue();
15600
15601 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15602 // common.
15603 uint64_t Val = CRHS->getZExtValue();
15604 if (countr_zero(Val) >= 32) {
15605 SelectionDAG &DAG = DCI.DAG;
15606 SDLoc SL(N);
15607 SDValue LHS = N->getOperand(0);
15608
15609 // Avoid carry machinery if we know the low half of the add does not
15610 // contribute to the final result.
15611 //
15612 // add i64:x, K if computeTrailingZeros(K) >= 32
15613 // => build_pair (add x.hi, K.hi), x.lo
15614
15615 // Breaking the 64-bit add here with this strange constant is unlikely
15616 // to interfere with addressing mode patterns.
15617
15618 SDValue Hi = getHiHalf64(LHS, DAG);
15619 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15620 unsigned Opcode = N->getOpcode();
15621 if (Opcode == ISD::PTRADD)
15622 Opcode = ISD::ADD;
15623 SDValue AddHi =
15624 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15625
15626 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15627 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15628 }
15629
15630 return SDValue();
15631}
15632
15633 // Collect the ultimate src of each of the mul node's operands, and confirm
15634 // each operand is 8 bits wide.
15635static std::optional<ByteProvider<SDValue>>
15636handleMulOperand(const SDValue &MulOperand) {
15637 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15638 if (!Byte0 || Byte0->isConstantZero()) {
15639 return std::nullopt;
15640 }
15641 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15642 if (Byte1 && !Byte1->isConstantZero()) {
15643 return std::nullopt;
15644 }
15645 return Byte0;
15646}
15647
15648static unsigned addPermMasks(unsigned First, unsigned Second) {
15649 unsigned FirstCs = First & 0x0c0c0c0c;
15650 unsigned SecondCs = Second & 0x0c0c0c0c;
15651 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15652 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15653
15654 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15655 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15656 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15657 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15658
15659 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15660}
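// Example: combining the masks 0x0c0c0c02 (byte 2 of its source into lane 0,
// other lanes zero) and 0x0c0c010c (byte 1 into lane 1) yields 0x0c0c0102.
// 0x0c is the selector used for a constant zero byte, and the asserts above
// check that each output lane is populated by at most one of the two masks.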
15661
15662 struct DotSrc {
15663 SDValue SrcOp;
15664 int64_t PermMask;
15665 int64_t DWordOffset;
15666};
15667
15668 static void placeSources(ByteProvider<SDValue> &Src0,
15669 ByteProvider<SDValue> &Src1,
15670 SmallVectorImpl<DotSrc> &Src0s,
15671 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15672
15673 assert(Src0.Src.has_value() && Src1.Src.has_value());
15674 // Src0s and Src1s are empty, just place arbitrarily.
15675 if (Step == 0) {
15676 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15677 Src0.SrcOffset / 4});
15678 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15679 Src1.SrcOffset / 4});
15680 return;
15681 }
15682
15683 for (int BPI = 0; BPI < 2; BPI++) {
15684 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15685 if (BPI == 1) {
15686 BPP = {Src1, Src0};
15687 }
15688 unsigned ZeroMask = 0x0c0c0c0c;
15689 unsigned FMask = 0xFF << (8 * (3 - Step));
15690
15691 unsigned FirstMask =
15692 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15693 unsigned SecondMask =
15694 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15695 // Attempt to find the Src vector which contains our SDValue; if found, add
15696 // our perm mask to the existing one. If we are unable to find a match for
15697 // the first SDValue, attempt to find a match for the second.
15698 int FirstGroup = -1;
15699 for (int I = 0; I < 2; I++) {
15700 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15701 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15702 return IterElt.SrcOp == *BPP.first.Src &&
15703 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15704 };
15705
15706 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15707 if (Match != Srcs.end()) {
15708 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15709 FirstGroup = I;
15710 break;
15711 }
15712 }
15713 if (FirstGroup != -1) {
15714 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15715 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15716 return IterElt.SrcOp == *BPP.second.Src &&
15717 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15718 };
15719 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15720 if (Match != Srcs.end()) {
15721 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15722 } else
15723 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15724 return;
15725 }
15726 }
15727
15728 // If we have made it here, then we could not find a match in Src0s or Src1s
15729 // for either Src0 or Src1, so just place them arbitrarily.
15730
15731 unsigned ZeroMask = 0x0c0c0c0c;
15732 unsigned FMask = 0xFF << (8 * (3 - Step));
15733
15734 Src0s.push_back(
15735 {*Src0.Src,
15736 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15737 Src0.SrcOffset / 4});
15738 Src1s.push_back(
15739 {*Src1.Src,
15740 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15741 Src1.SrcOffset / 4});
15742}
15743
15744 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15745 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15746 bool IsAny) {
15747
15748 // If we just have one source, just permute it accordingly.
15749 if (Srcs.size() == 1) {
15750 auto *Elt = Srcs.begin();
15751 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15752
15753 // v_perm will produce the original value
15754 if (Elt->PermMask == 0x3020100)
15755 return EltOp;
15756
15757 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15758 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15759 }
15760
15761 auto *FirstElt = Srcs.begin();
15762 auto *SecondElt = std::next(FirstElt);
15763
15764 SmallVector<SDValue, 2> Perms;
15765
15766 // If we have multiple sources in the chain, combine them via perms (using
15767 // calculated perm mask) and Ors.
15768 while (true) {
15769 auto FirstMask = FirstElt->PermMask;
15770 auto SecondMask = SecondElt->PermMask;
15771
15772 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15773 unsigned FirstPlusFour = FirstMask | 0x04040404;
15774 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15775 // original 0x0C.
15776 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15777
15778 auto PermMask = addPermMasks(FirstMask, SecondMask);
15779 auto FirstVal =
15780 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15781 auto SecondVal =
15782 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15783
15784 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15785 SecondVal,
15786 DAG.getConstant(PermMask, SL, MVT::i32)));
15787
15788 FirstElt = std::next(SecondElt);
15789 if (FirstElt == Srcs.end())
15790 break;
15791
15792 SecondElt = std::next(FirstElt);
15793 // If we only have a FirstElt, then just combine that into the cumulative
15794 // source node.
15795 if (SecondElt == Srcs.end()) {
15796 auto EltOp =
15797 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15798
15799 Perms.push_back(
15800 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15801 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15802 break;
15803 }
15804 }
15805
15806 assert(Perms.size() == 1 || Perms.size() == 2);
15807 return Perms.size() == 2
15808 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15809 : Perms[0];
15810}
15811
15812static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15813 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15814 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15815 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15816 EntryMask += ZeroMask;
15817 }
15818}
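// Example: a chain of length 2 built its masks assuming four steps, e.g.
// 0x01020c0c with the two selected bytes in the high lanes. fixMasks shifts
// this down to 0x00000102 and adds 0x0c0c0000, giving 0x0c0c0102, so the two
// unused high lanes read as zero instead of stale bytes.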
15819
15820static bool isMul(const SDValue Op) {
15821 auto Opcode = Op.getOpcode();
15822
15823 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15824 Opcode == AMDGPUISD::MUL_I24);
15825}
15826
15827 static std::optional<bool>
15828 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15829 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15830 const SDValue &S1Op, const SelectionDAG &DAG) {
15831 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15832 // of the dot4 is irrelevant.
15833 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15834 return false;
15835
15836 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15837 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15838 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15839 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15840 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15841 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15842
15843 assert(!(S0IsUnsigned && S0IsSigned));
15844 assert(!(S1IsUnsigned && S1IsSigned));
15845
15846 // There are 9 possible permutations of
15847 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15848
15849 // In two permutations, the sign bits are known to be the same for both Ops,
15850 // so simply return Signed / Unsigned corresponding to the MSB
15851
15852 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15853 return S0IsSigned;
15854
15855 // In another two permutations, the sign bits are known to be opposite. In
15856 // this case return std::nullopt to indicate a bad match.
15857
15858 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15859 return std::nullopt;
15860
15861 // In the remaining five permutations, we don't know the value of the sign
15862 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15863 // the upper bits must be extension bits. Thus, the only ways for the sign
15864 // bit to be unknown are if it was sign extended from an unknown value, or if
15865 // it was any extended. In either case, it is correct to use the signed
15866 // version of the signedness semantics of dot4.
15867
15868 // In two such permutations, we know the sign bit is set for
15869 // one op, and the other is unknown. It is okay to use the signed version of
15870 // dot4.
15871 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15872 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15873 return true;
15874
15875 // In one such permutation, we don't know either of the sign bits. It is okay
15876 // to use the signed version of dot4.
15877 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15878 return true;
15879
15880 // In two such permutations, we know the sign bit is unset for
15881 // one op, and the other is unknown. Return std::nullopt to indicate a
15882 // bad match.
15883 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15884 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15885 return std::nullopt;
15886
15887 llvm_unreachable("Fully covered condition");
15888}
15889
15890SDValue SITargetLowering::performAddCombine(SDNode *N,
15891 DAGCombinerInfo &DCI) const {
15892 SelectionDAG &DAG = DCI.DAG;
15893 EVT VT = N->getValueType(0);
15894 SDLoc SL(N);
15895 SDValue LHS = N->getOperand(0);
15896 SDValue RHS = N->getOperand(1);
15897
15898 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15899 if (Subtarget->hasMad64_32()) {
15900 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15901 return Folded;
15902 }
15903 }
15904
15905 if (SDValue V = reassociateScalarOps(N, DAG)) {
15906 return V;
15907 }
15908
15909 if (VT == MVT::i64) {
15910 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15911 return Folded;
15912 }
15913
15914 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15915 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15916 SDValue TempNode(N, 0);
15917 std::optional<bool> IsSigned;
15918 SmallVector<DotSrc, 4> Src0s;
15919 SmallVector<DotSrc, 4> Src1s;
15920 SmallVector<SDValue, 4> Src2s;
15921
15922 // Match the v_dot4 tree, while collecting src nodes.
15923 int ChainLength = 0;
15924 for (int I = 0; I < 4; I++) {
15925 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15926 if (MulIdx == -1)
15927 break;
15928 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15929 if (!Src0)
15930 break;
15931 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15932 if (!Src1)
15933 break;
15934
15935 auto IterIsSigned = checkDot4MulSignedness(
15936 TempNode->getOperand(MulIdx), *Src0, *Src1,
15937 TempNode->getOperand(MulIdx)->getOperand(0),
15938 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15939 if (!IterIsSigned)
15940 break;
15941 if (!IsSigned)
15942 IsSigned = *IterIsSigned;
15943 if (*IterIsSigned != *IsSigned)
15944 break;
15945 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15946 auto AddIdx = 1 - MulIdx;
15947 // Allow the special case where add (add (mul24, 0), mul24) became
15948 // add (mul24, mul24).
15949 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15950 Src2s.push_back(TempNode->getOperand(AddIdx));
15951 auto Src0 =
15952 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15953 if (!Src0)
15954 break;
15955 auto Src1 =
15956 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15957 if (!Src1)
15958 break;
15959 auto IterIsSigned = checkDot4MulSignedness(
15960 TempNode->getOperand(AddIdx), *Src0, *Src1,
15961 TempNode->getOperand(AddIdx)->getOperand(0),
15962 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15963 if (!IterIsSigned)
15964 break;
15965 assert(IsSigned);
15966 if (*IterIsSigned != *IsSigned)
15967 break;
15968 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15969 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15970 ChainLength = I + 2;
15971 break;
15972 }
15973
15974 TempNode = TempNode->getOperand(AddIdx);
15975 Src2s.push_back(TempNode);
15976 ChainLength = I + 1;
15977 if (TempNode->getNumOperands() < 2)
15978 break;
15979 LHS = TempNode->getOperand(0);
15980 RHS = TempNode->getOperand(1);
15981 }
15982
15983 if (ChainLength < 2)
15984 return SDValue();
15985
15986 // Masks were constructed with the assumption that we would find a chain of
15987 // length 4. If not, then we need to zero out the unused high bytes (via a
15988 // perm mask of 0x0c) so they do not affect the dot calculation.
15989 if (ChainLength < 4) {
15990 fixMasks(Src0s, ChainLength);
15991 fixMasks(Src1s, ChainLength);
15992 }
15993
15994 SDValue Src0, Src1;
15995
15996 // If we are just using a single source for both, and have permuted the
15997 // bytes consistently, we can just use the sources without permuting
15998 // (commutation).
15999 bool UseOriginalSrc = false;
16000 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16001 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16002 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16003 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16004 SmallVector<unsigned, 4> SrcBytes;
16005 auto Src0Mask = Src0s.begin()->PermMask;
16006 SrcBytes.push_back(Src0Mask & 0xFF000000);
16007 bool UniqueEntries = true;
16008 for (auto I = 1; I < 4; I++) {
16009 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16010
16011 if (is_contained(SrcBytes, NextByte)) {
16012 UniqueEntries = false;
16013 break;
16014 }
16015 SrcBytes.push_back(NextByte);
16016 }
16017
16018 if (UniqueEntries) {
16019 UseOriginalSrc = true;
16020
16021 auto *FirstElt = Src0s.begin();
16022 auto FirstEltOp =
16023 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16024
16025 auto *SecondElt = Src1s.begin();
16026 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16027 SecondElt->DWordOffset);
16028
16029 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16030 MVT::getIntegerVT(32));
16031 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16032 MVT::getIntegerVT(32));
16033 }
16034 }
16035
16036 if (!UseOriginalSrc) {
16037 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16038 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16039 }
16040
16041 assert(IsSigned);
16042 SDValue Src2 =
16043 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16044
16045 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16046 : Intrinsic::amdgcn_udot4,
16047 SL, MVT::i64);
16048
16049 assert(!VT.isVector());
16050 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16051 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16052
16053 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16054 }
16055
16056 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16057 return SDValue();
16058
16059 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16060 // add x, sext (setcc) => usubo_carry x, 0, setcc
16061  unsigned Opc = LHS.getOpcode();
16062  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16063      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16064    std::swap(RHS, LHS);
16065
16066 Opc = RHS.getOpcode();
16067 switch (Opc) {
16068 default:
16069 break;
16070 case ISD::ZERO_EXTEND:
16071 case ISD::SIGN_EXTEND:
16072 case ISD::ANY_EXTEND: {
16073 auto Cond = RHS.getOperand(0);
16074 // If this won't be a real VOPC output, we would still need to insert an
16075 // extra instruction anyway.
16076 if (!isBoolSGPR(Cond))
16077 break;
16078 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16079      SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16080      Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16081      return DAG.getNode(Opc, SL, VTList, Args);
16082 }
16083 case ISD::UADDO_CARRY: {
16084 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16085 if (!isNullConstant(RHS.getOperand(1)))
16086 break;
16087 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16088 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16089 }
16090 }
16091 return SDValue();
16092}
16093
16094SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16095 DAGCombinerInfo &DCI) const {
16096 SelectionDAG &DAG = DCI.DAG;
16097 SDLoc DL(N);
16098 EVT VT = N->getValueType(0);
16099 SDValue N0 = N->getOperand(0);
16100 SDValue N1 = N->getOperand(1);
16101
16102 // The following folds transform PTRADDs into regular arithmetic in cases
16103 // where the PTRADD wouldn't be folded as an immediate offset into memory
16104 // instructions anyway. They are target-specific in that other targets might
16105 // prefer to not lose information about the pointer arithmetic.
16106
16107 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16108 // Adapted from DAGCombiner::visitADDLikeCommutative.
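  // Illustration: (ptradd %p, (shl (sub 0, %i), 3)) becomes
  // (sub %p, (shl %i, 3)), i.e. %p - 8 * %i, so no explicit negation is
  // materialized.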
16109 SDValue V, K;
16110 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16111 SDNodeFlags ShlFlags = N1->getFlags();
16112 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16113 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16114 // preserved.
16115 SDNodeFlags NewShlFlags =
16116        ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16117            ? SDNodeFlags::NoSignedWrap
16118            : SDNodeFlags();
16119 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16120 DCI.AddToWorklist(Inner.getNode());
16121 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16122 }
16123
16124 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16125 // performAddCombine.
16126 if (N1.getOpcode() == ISD::MUL) {
16127 if (Subtarget->hasMad64_32()) {
16128 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16129 return Folded;
16130 }
16131 }
16132
16133 // If the 32 low bits of the constant are all zero, there is nothing to fold
16134 // into an immediate offset, so it's better to eliminate the unnecessary
16135 // addition for the lower 32 bits than to preserve the PTRADD.
16136 // Analogous to a fold in performAddCombine.
16137 if (VT == MVT::i64) {
16138 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16139 return Folded;
16140 }
16141
16142 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16143 return SDValue();
16144
16145 SDValue X = N0;
16146 SDValue Y = N1.getOperand(0);
16147 SDValue Z = N1.getOperand(1);
16148 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16149 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16150
16151 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16152 Y->isDivergent() != Z->isDivergent()) {
16153 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16154 // y are uniform and z isn't.
16155 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16156 // z are uniform and y isn't.
16157 // The goal is to push uniform operands up in the computation, so that they
16158 // can be handled with scalar operations. We can't use reassociateScalarOps
16159 // for this since it requires two identical commutative operations to
16160 // reassociate.
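    // Illustration: for (ptradd %base, (add %uniform_off, %divergent_idx)) the
    // inner (ptradd %base, %uniform_off) is uniform and can be selected to
    // scalar instructions; only the outer add of %divergent_idx needs the VALU.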
16161 if (Y->isDivergent())
16162 std::swap(Y, Z);
16163 // If both additions in the original were NUW, reassociation preserves that.
16164 SDNodeFlags ReassocFlags =
16165 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16166 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16167 DCI.AddToWorklist(UniformInner.getNode());
16168 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16169 }
16170
16171 return SDValue();
16172}
16173
16174SDValue SITargetLowering::performSubCombine(SDNode *N,
16175 DAGCombinerInfo &DCI) const {
16176 SelectionDAG &DAG = DCI.DAG;
16177 EVT VT = N->getValueType(0);
16178
16179 if (VT == MVT::i64) {
16180 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16181 return Folded;
16182 }
16183
16184 if (VT != MVT::i32)
16185 return SDValue();
16186
16187 SDLoc SL(N);
16188 SDValue LHS = N->getOperand(0);
16189 SDValue RHS = N->getOperand(1);
16190
16191 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16192 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16193 unsigned Opc = RHS.getOpcode();
16194 switch (Opc) {
16195 default:
16196 break;
16197 case ISD::ZERO_EXTEND:
16198 case ISD::SIGN_EXTEND:
16199 case ISD::ANY_EXTEND: {
16200 auto Cond = RHS.getOperand(0);
16201 // If this won't be a real VOPC output, we would still need to insert an
16202 // extra instruction anyway.
16203 if (!isBoolSGPR(Cond))
16204 break;
16205 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16206    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16207    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16208    return DAG.getNode(Opc, SL, VTList, Args);
16209 }
16210 }
16211
16212 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16213 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16214 if (!isNullConstant(LHS.getOperand(1)))
16215 return SDValue();
16216 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16217 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16218 }
16219 return SDValue();
16220}
16221
16222SDValue
16223SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16224 DAGCombinerInfo &DCI) const {
16225
16226 if (N->getValueType(0) != MVT::i32)
16227 return SDValue();
16228
16229 if (!isNullConstant(N->getOperand(1)))
16230 return SDValue();
16231
16232 SelectionDAG &DAG = DCI.DAG;
16233 SDValue LHS = N->getOperand(0);
16234
16235 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16236 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16237 unsigned LHSOpc = LHS.getOpcode();
16238 unsigned Opc = N->getOpcode();
16239 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16240 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16241 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16242 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16243 }
16244 return SDValue();
16245}
16246
16247SDValue SITargetLowering::performFAddCombine(SDNode *N,
16248 DAGCombinerInfo &DCI) const {
16249 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16250 return SDValue();
16251
16252 SelectionDAG &DAG = DCI.DAG;
16253 EVT VT = N->getValueType(0);
16254
16255 SDLoc SL(N);
16256 SDValue LHS = N->getOperand(0);
16257 SDValue RHS = N->getOperand(1);
16258
16259 // These should really be instruction patterns, but writing patterns with
16260 // source modifiers is a pain.
16261
16262 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16263 if (LHS.getOpcode() == ISD::FADD) {
16264 SDValue A = LHS.getOperand(0);
16265 if (A == LHS.getOperand(1)) {
16266 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16267 if (FusedOp != 0) {
16268 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16269 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16270 }
16271 }
16272 }
16273
16274 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16275 if (RHS.getOpcode() == ISD::FADD) {
16276 SDValue A = RHS.getOperand(0);
16277 if (A == RHS.getOperand(1)) {
16278 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16279 if (FusedOp != 0) {
16280 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16281 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16282 }
16283 }
16284 }
16285
16286 return SDValue();
16287}
16288
16289SDValue SITargetLowering::performFSubCombine(SDNode *N,
16290 DAGCombinerInfo &DCI) const {
16291 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16292 return SDValue();
16293
16294 SelectionDAG &DAG = DCI.DAG;
16295 SDLoc SL(N);
16296 EVT VT = N->getValueType(0);
16297 assert(!VT.isVector());
16298
16299 // Try to get the fneg to fold into the source modifier. This undoes generic
16300 // DAG combines and folds them into the mad.
16301 //
16302 // Only do this if we are not trying to support denormals. v_mad_f32 does
16303 // not support denormals ever.
16304 SDValue LHS = N->getOperand(0);
16305 SDValue RHS = N->getOperand(1);
16306 if (LHS.getOpcode() == ISD::FADD) {
16307 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16308 SDValue A = LHS.getOperand(0);
16309 if (A == LHS.getOperand(1)) {
16310 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16311 if (FusedOp != 0) {
16312 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16313 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16314
16315 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16316 }
16317 }
16318 }
16319
16320 if (RHS.getOpcode() == ISD::FADD) {
16321 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16322
16323 SDValue A = RHS.getOperand(0);
16324 if (A == RHS.getOperand(1)) {
16325 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16326 if (FusedOp != 0) {
16327 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16328 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16329 }
16330 }
16331 }
16332
16333 return SDValue();
16334}
16335
16336SDValue SITargetLowering::performFDivCombine(SDNode *N,
16337 DAGCombinerInfo &DCI) const {
16338 SelectionDAG &DAG = DCI.DAG;
16339 SDLoc SL(N);
16340 EVT VT = N->getValueType(0);
16341 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16342 return SDValue();
16343
16344 SDValue LHS = N->getOperand(0);
16345 SDValue RHS = N->getOperand(1);
16346
16347 SDNodeFlags Flags = N->getFlags();
16348 SDNodeFlags RHSFlags = RHS->getFlags();
16349 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16350 !RHS->hasOneUse())
16351 return SDValue();
16352
16353 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16354 bool IsNegative = false;
16355 if (CLHS->isExactlyValue(1.0) ||
16356 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16357 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16358 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16359 if (RHS.getOpcode() == ISD::FSQRT) {
16360 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16361 SDValue Rsq =
16362 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16363 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16364 }
16365 }
16366 }
16367
16368 return SDValue();
16369}
16370
16371SDValue SITargetLowering::performFMulCombine(SDNode *N,
16372 DAGCombinerInfo &DCI) const {
16373 SelectionDAG &DAG = DCI.DAG;
16374 EVT VT = N->getValueType(0);
16375 EVT ScalarVT = VT.getScalarType();
16376 EVT IntVT = VT.changeElementType(MVT::i32);
16377
16378 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16379 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16380 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16381 return SDValue();
16382 }
16383
16384 SDValue LHS = N->getOperand(0);
16385 SDValue RHS = N->getOperand(1);
16386
16387  // It is cheaper to materialize an i32 inline constant than an f16 or f64
16388  // (or even a non-inline f32) value; ldexp makes this possible, as shown
16389  // below:
16390 //
16391 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16392 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16393 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
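  // Illustration: with A = 0.5 (2^-1) and B = 8.0 (2^3),
  //   fmul x, (select y, 0.5, 8.0) -> ldexp(x, (select i32 y, -1, 3))
  // so only small i32 exponents need to be materialized.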
16394 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16395 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16396 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16397 if (!TrueNode)
16398 return SDValue();
16399 const ConstantFPSDNode *FalseNode =
16400 isConstOrConstSplatFP(RHS.getOperand(2));
16401 if (!FalseNode)
16402 return SDValue();
16403
16404 if (TrueNode->isNegative() != FalseNode->isNegative())
16405 return SDValue();
16406
16407 // For f32, only non-inline constants should be transformed.
16408 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16409 if (ScalarVT == MVT::f32 &&
16410 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16411 TII->isInlineConstant(FalseNode->getValueAPF()))
16412 return SDValue();
16413
16414 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16415 if (TrueNodeExpVal == INT_MIN)
16416 return SDValue();
16417 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16418 if (FalseNodeExpVal == INT_MIN)
16419 return SDValue();
16420
16421 SDLoc SL(N);
16422 SDValue SelectNode =
16423 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16424 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16425 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16426
16427 LHS = TrueNode->isNegative()
16428 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16429 : LHS;
16430
16431 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16432 }
16433
16434 return SDValue();
16435}
16436
16437SDValue SITargetLowering::performFMACombine(SDNode *N,
16438 DAGCombinerInfo &DCI) const {
16439 SelectionDAG &DAG = DCI.DAG;
16440 EVT VT = N->getValueType(0);
16441 SDLoc SL(N);
16442
16443 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16444 return SDValue();
16445
16446  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16447  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
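  // That is, both f32 multiplies must read different halves of the same pair of
  // v2f16 vectors (checked below via Vec1..Vec4 and Idx1 != Idx2), so the whole
  // expression is a 2-wide dot product plus the accumulator z.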
16448 SDValue Op1 = N->getOperand(0);
16449 SDValue Op2 = N->getOperand(1);
16450 SDValue FMA = N->getOperand(2);
16451
16452 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16453 Op2.getOpcode() != ISD::FP_EXTEND)
16454 return SDValue();
16455
16456 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16457 // regardless of the denorm mode setting. Therefore,
16458 // fp-contract is sufficient to allow generating fdot2.
16459 const TargetOptions &Options = DAG.getTarget().Options;
16460 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16461 (N->getFlags().hasAllowContract() &&
16462 FMA->getFlags().hasAllowContract())) {
16463 Op1 = Op1.getOperand(0);
16464 Op2 = Op2.getOperand(0);
16465    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16466        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16467      return SDValue();
16468
16469 SDValue Vec1 = Op1.getOperand(0);
16470 SDValue Idx1 = Op1.getOperand(1);
16471 SDValue Vec2 = Op2.getOperand(0);
16472
16473 SDValue FMAOp1 = FMA.getOperand(0);
16474 SDValue FMAOp2 = FMA.getOperand(1);
16475 SDValue FMAAcc = FMA.getOperand(2);
16476
16477 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16478 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16479 return SDValue();
16480
16481 FMAOp1 = FMAOp1.getOperand(0);
16482 FMAOp2 = FMAOp2.getOperand(0);
16483    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16484        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16485      return SDValue();
16486
16487 SDValue Vec3 = FMAOp1.getOperand(0);
16488 SDValue Vec4 = FMAOp2.getOperand(0);
16489 SDValue Idx2 = FMAOp1.getOperand(1);
16490
16491 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16492 // Idx1 and Idx2 cannot be the same.
16493 Idx1 == Idx2)
16494 return SDValue();
16495
16496 if (Vec1 == Vec2 || Vec3 == Vec4)
16497 return SDValue();
16498
16499 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16500 return SDValue();
16501
16502 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16503 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16504 DAG.getTargetConstant(0, SL, MVT::i1));
16505 }
16506 }
16507 return SDValue();
16508}
16509
16510SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16511 DAGCombinerInfo &DCI) const {
16512 SelectionDAG &DAG = DCI.DAG;
16513 SDLoc SL(N);
16514
16515 SDValue LHS = N->getOperand(0);
16516 SDValue RHS = N->getOperand(1);
16517 EVT VT = LHS.getValueType();
16518 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16519
16520 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16521  if (!CRHS) {
16522    CRHS = dyn_cast<ConstantSDNode>(LHS);
16523    if (CRHS) {
16524 std::swap(LHS, RHS);
16525 CC = getSetCCSwappedOperands(CC);
16526 }
16527 }
16528
16529 if (CRHS) {
16530 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16531 isBoolSGPR(LHS.getOperand(0))) {
16532 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16533 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16534 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16535 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
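      // Here sext from i1 yields 0 for a false cc and -1 for a true cc, which
      // is why comparing the result against -1 or 0 reduces to cc itself or to
      // its negation.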
16536 if ((CRHS->isAllOnes() &&
16537 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16538 (CRHS->isZero() &&
16539 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16540 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16541 DAG.getAllOnesConstant(SL, MVT::i1));
16542 if ((CRHS->isAllOnes() &&
16543 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16544 (CRHS->isZero() &&
16545 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16546 return LHS.getOperand(0);
16547 }
16548
16549 const APInt &CRHSVal = CRHS->getAPIntValue();
16550 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16551 LHS.getOpcode() == ISD::SELECT &&
16552 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16553 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16554 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16555 isBoolSGPR(LHS.getOperand(0))) {
16556 // Given CT != FT:
16557 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16558 // setcc (select cc, CT, CF), CF, ne => cc
16559 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16560 // setcc (select cc, CT, CF), CT, eq => cc
16561 const APInt &CT = LHS.getConstantOperandAPInt(1);
16562 const APInt &CF = LHS.getConstantOperandAPInt(2);
16563
16564 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16565 (CT == CRHSVal && CC == ISD::SETNE))
16566 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16567 DAG.getAllOnesConstant(SL, MVT::i1));
16568 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16569 (CT == CRHSVal && CC == ISD::SETEQ))
16570 return LHS.getOperand(0);
16571 }
16572 }
16573
16574 if (VT != MVT::f32 && VT != MVT::f64 &&
16575 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16576 return SDValue();
16577
16578 // Match isinf/isfinite pattern
16579 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16580 // (fcmp one (fabs x), inf) -> (fp_class x,
16581  //                           (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
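  // FP_CLASS tests whether the class of x matches any bit in the mask, so the
  // +/-infinity mask implements isinf and the normal/subnormal/zero mask
  // implements isfinite (NaN is in neither mask).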
16582 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16583 LHS.getOpcode() == ISD::FABS) {
16584 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16585 if (!CRHS)
16586 return SDValue();
16587
16588 const APFloat &APF = CRHS->getValueAPF();
16589 if (APF.isInfinity() && !APF.isNegative()) {
16590      const unsigned IsInfMask =
16591          SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16592      const unsigned IsFiniteMask =
16593          SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16594          SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16595          SIInstrFlags::P_SUBNORMAL;
16596      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16597 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16598 DAG.getConstant(Mask, SL, MVT::i32));
16599 }
16600 }
16601
16602 return SDValue();
16603}
16604
16605SDValue
16606SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16607 DAGCombinerInfo &DCI) const {
16608 SelectionDAG &DAG = DCI.DAG;
16609 SDLoc SL(N);
16610 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16611
16612 SDValue Src = N->getOperand(0);
16613 SDValue Shift = N->getOperand(0);
16614
16615 // TODO: Extend type shouldn't matter (assuming legal types).
16616 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16617 Shift = Shift.getOperand(0);
16618
16619 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16620 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16621 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16622 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16623 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16624 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
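    // In general, cvt_f32_ubyteN (srl x, 8*M) selects byte N+M of x and
    // cvt_f32_ubyteN (shl x, 8*M) selects byte N-M, which is what the
    // ShiftOffset arithmetic below computes.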
16625 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16626 SDValue Shifted = DAG.getZExtOrTrunc(
16627 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16628
16629 unsigned ShiftOffset = 8 * Offset;
16630 if (Shift.getOpcode() == ISD::SHL)
16631 ShiftOffset -= C->getZExtValue();
16632 else
16633 ShiftOffset += C->getZExtValue();
16634
16635 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16636 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16637 MVT::f32, Shifted);
16638 }
16639 }
16640 }
16641
16642 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16643 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16644 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16645 // We simplified Src. If this node is not dead, visit it again so it is
16646 // folded properly.
16647 if (N->getOpcode() != ISD::DELETED_NODE)
16648 DCI.AddToWorklist(N);
16649 return SDValue(N, 0);
16650 }
16651
16652 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16653 if (SDValue DemandedSrc =
16654 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16655 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16656
16657 return SDValue();
16658}
16659
16660SDValue SITargetLowering::performClampCombine(SDNode *N,
16661 DAGCombinerInfo &DCI) const {
16662 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16663 if (!CSrc)
16664 return SDValue();
16665
16666 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16667 const APFloat &F = CSrc->getValueAPF();
16668 APFloat Zero = APFloat::getZero(F.getSemantics());
16669 if (F < Zero ||
16670 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16671 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16672 }
16673
16674 APFloat One(F.getSemantics(), "1.0");
16675 if (F > One)
16676 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16677
16678 return SDValue(CSrc, 0);
16679}
16680
16681SDValue SITargetLowering::performSelectCombine(SDNode *N,
16682 DAGCombinerInfo &DCI) const {
16683
16684 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16685 // integer).
16686 // Detect when CMP and SELECT use the same constant and fold them to avoid
16687 // loading the constant twice. Specifically handles patterns like:
16688 // %cmp = icmp eq i32 %val, 4242
16689 // %sel = select i1 %cmp, i32 4242, i32 %other
16690 // It can be optimized to reuse %val instead of 4242 in select.
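  // After the fold the example reads:
  //   %sel = select i1 %cmp, i32 %val, i32 %other
  // which is equivalent because %cmp being true guarantees %val == 4242.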
16691 SDValue Cond = N->getOperand(0);
16692 SDValue TrueVal = N->getOperand(1);
16693 SDValue FalseVal = N->getOperand(2);
16694
16695 // Check if condition is a comparison.
16696 if (Cond.getOpcode() != ISD::SETCC)
16697 return SDValue();
16698
16699 SDValue LHS = Cond.getOperand(0);
16700 SDValue RHS = Cond.getOperand(1);
16701 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16702
16703 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16704 bool isInteger = LHS.getValueType().isInteger();
16705
16706 // Handle simple floating-point and integer types only.
16707 if (!isFloatingPoint && !isInteger)
16708 return SDValue();
16709
16710 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16711 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16712 if (!isEquality && !isNonEquality)
16713 return SDValue();
16714
16715 SDValue ArgVal, ConstVal;
16716 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16717 (isInteger && isa<ConstantSDNode>(RHS))) {
16718 ConstVal = RHS;
16719 ArgVal = LHS;
16720 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16721 (isInteger && isa<ConstantSDNode>(LHS))) {
16722 ConstVal = LHS;
16723 ArgVal = RHS;
16724 } else {
16725 return SDValue();
16726 }
16727
16728 // Skip optimization for inlinable immediates.
16729 if (isFloatingPoint) {
16730 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16731 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16732 return SDValue();
16733  } else {
16734    if (AMDGPU::isInlinableIntLiteral(
16735            cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16736 return SDValue();
16737 }
16738
16739 // For equality and non-equality comparisons, patterns:
16740 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16741 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16742 if (!(isEquality && TrueVal == ConstVal) &&
16743 !(isNonEquality && FalseVal == ConstVal))
16744 return SDValue();
16745
16746 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16747 SDValue SelectRHS =
16748 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16749 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16750 SelectLHS, SelectRHS);
16751}
16752
16753SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16754                                            DAGCombinerInfo &DCI) const {
16755 switch (N->getOpcode()) {
16756 case ISD::ADD:
16757 case ISD::SUB:
16758 case ISD::SHL:
16759 case ISD::SRL:
16760 case ISD::SRA:
16761 case ISD::AND:
16762 case ISD::OR:
16763 case ISD::XOR:
16764 case ISD::MUL:
16765 case ISD::SETCC:
16766 case ISD::SELECT:
16767 case ISD::SMIN:
16768 case ISD::SMAX:
16769 case ISD::UMIN:
16770 case ISD::UMAX:
16771 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16772 return Res;
16773 break;
16774 default:
16775 break;
16776 }
16777
16778 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16779 return SDValue();
16780
16781 switch (N->getOpcode()) {
16782 case ISD::ADD:
16783 return performAddCombine(N, DCI);
16784 case ISD::PTRADD:
16785 return performPtrAddCombine(N, DCI);
16786 case ISD::SUB:
16787 return performSubCombine(N, DCI);
16788 case ISD::UADDO_CARRY:
16789 case ISD::USUBO_CARRY:
16790 return performAddCarrySubCarryCombine(N, DCI);
16791 case ISD::FADD:
16792 return performFAddCombine(N, DCI);
16793 case ISD::FSUB:
16794 return performFSubCombine(N, DCI);
16795 case ISD::FDIV:
16796 return performFDivCombine(N, DCI);
16797 case ISD::FMUL:
16798 return performFMulCombine(N, DCI);
16799 case ISD::SETCC:
16800 return performSetCCCombine(N, DCI);
16801 case ISD::SELECT:
16802 if (auto Res = performSelectCombine(N, DCI))
16803 return Res;
16804 break;
16805 case ISD::FMAXNUM:
16806 case ISD::FMINNUM:
16807 case ISD::FMAXNUM_IEEE:
16808 case ISD::FMINNUM_IEEE:
16809 case ISD::FMAXIMUM:
16810 case ISD::FMINIMUM:
16811 case ISD::FMAXIMUMNUM:
16812 case ISD::FMINIMUMNUM:
16813 case ISD::SMAX:
16814 case ISD::SMIN:
16815 case ISD::UMAX:
16816  case ISD::UMIN:
16817  case AMDGPUISD::FMIN_LEGACY:
16818  case AMDGPUISD::FMAX_LEGACY:
16819    return performMinMaxCombine(N, DCI);
16820 case ISD::FMA:
16821 return performFMACombine(N, DCI);
16822 case ISD::AND:
16823 return performAndCombine(N, DCI);
16824 case ISD::OR:
16825 return performOrCombine(N, DCI);
16826  case ISD::FSHR: {
16827    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16828    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16829 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16830 return matchPERM(N, DCI);
16831 }
16832 break;
16833 }
16834 case ISD::XOR:
16835 return performXorCombine(N, DCI);
16836 case ISD::ZERO_EXTEND:
16837    return performZeroExtendCombine(N, DCI);
16838  case ISD::SIGN_EXTEND_INREG:
16839    return performSignExtendInRegCombine(N, DCI);
16840  case AMDGPUISD::FP_CLASS:
16841    return performClassCombine(N, DCI);
16842 case ISD::FCANONICALIZE:
16843 return performFCanonicalizeCombine(N, DCI);
16844 case AMDGPUISD::RCP:
16845 return performRcpCombine(N, DCI);
16846 case ISD::FLDEXP:
16847 case AMDGPUISD::FRACT:
16848  case AMDGPUISD::RSQ:
16849  case AMDGPUISD::RCP_LEGACY:
16850  case AMDGPUISD::RCP_IFLAG:
16851  case AMDGPUISD::RSQ_CLAMP: {
16852 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16853 SDValue Src = N->getOperand(0);
16854 if (Src.isUndef())
16855 return Src;
16856 break;
16857 }
16858 case ISD::SINT_TO_FP:
16859 case ISD::UINT_TO_FP:
16860 return performUCharToFloatCombine(N, DCI);
16861 case ISD::FCOPYSIGN:
16862    return performFCopySignCombine(N, DCI);
16863  case AMDGPUISD::CVT_F32_UBYTE0:
16864  case AMDGPUISD::CVT_F32_UBYTE1:
16865  case AMDGPUISD::CVT_F32_UBYTE2:
16866  case AMDGPUISD::CVT_F32_UBYTE3:
16867    return performCvtF32UByteNCombine(N, DCI);
16868 case AMDGPUISD::FMED3:
16869    return performFMed3Combine(N, DCI);
16870  case AMDGPUISD::CVT_PKRTZ_F16_F32:
16871    return performCvtPkRTZCombine(N, DCI);
16872 case AMDGPUISD::CLAMP:
16873 return performClampCombine(N, DCI);
16874 case ISD::SCALAR_TO_VECTOR: {
16875 SelectionDAG &DAG = DCI.DAG;
16876 EVT VT = N->getValueType(0);
16877
16878 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16879 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16880 SDLoc SL(N);
16881 SDValue Src = N->getOperand(0);
16882 EVT EltVT = Src.getValueType();
16883 if (EltVT != MVT::i16)
16884 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16885
16886 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16887 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16888 }
16889
16890 break;
16891  }
16892  case ISD::EXTRACT_VECTOR_ELT:
16893    return performExtractVectorEltCombine(N, DCI);
16894  case ISD::INSERT_VECTOR_ELT:
16895    return performInsertVectorEltCombine(N, DCI);
16896 case ISD::FP_ROUND:
16897 return performFPRoundCombine(N, DCI);
16898 case ISD::LOAD: {
16899 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16900 return Widened;
16901 [[fallthrough]];
16902 }
16903 default: {
16904 if (!DCI.isBeforeLegalize()) {
16905 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16906 return performMemSDNodeCombine(MemNode, DCI);
16907 }
16908
16909 break;
16910 }
16911 }
16912
16913  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16914}
16915
16916/// Helper function for adjustWritemask
16917static unsigned SubIdx2Lane(unsigned Idx) {
16918 switch (Idx) {
16919 default:
16920 return ~0u;
16921 case AMDGPU::sub0:
16922 return 0;
16923 case AMDGPU::sub1:
16924 return 1;
16925 case AMDGPU::sub2:
16926 return 2;
16927 case AMDGPU::sub3:
16928 return 3;
16929 case AMDGPU::sub4:
16930 return 4; // Possible with TFE/LWE
16931 }
16932}
16933
16934/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16935SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16936 SelectionDAG &DAG) const {
16937 unsigned Opcode = Node->getMachineOpcode();
16938
16939 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16940 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16941 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16942 return Node; // not implemented for D16
16943
16944 SDNode *Users[5] = {nullptr};
16945 unsigned Lane = 0;
16946 unsigned DmaskIdx =
16947 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16948 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16949 unsigned NewDmask = 0;
16950 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16951 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16952 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16953 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16954 unsigned TFCLane = 0;
16955 bool HasChain = Node->getNumValues() > 1;
16956
16957 if (OldDmask == 0) {
16958    // These are folded out, but on the off chance it happens, don't assert.
16959 return Node;
16960 }
16961
16962 unsigned OldBitsSet = llvm::popcount(OldDmask);
16963 // Work out which is the TFE/LWE lane if that is enabled.
16964 if (UsesTFC) {
16965 TFCLane = OldBitsSet;
16966 }
16967
16968 // Try to figure out the used register components
16969 for (SDUse &Use : Node->uses()) {
16970
16971 // Don't look at users of the chain.
16972 if (Use.getResNo() != 0)
16973 continue;
16974
16975 SDNode *User = Use.getUser();
16976
16977 // Abort if we can't understand the usage
16978 if (!User->isMachineOpcode() ||
16979 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16980 return Node;
16981
16982 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16983 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16984 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16985 // set, etc.
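    // For example, with OldDmask == 0b1010 (components y and w), Lane 0 maps to
    // component 1 (y) and Lane 1 maps to component 3 (w).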
16986 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16987 if (Lane == ~0u)
16988 return Node;
16989
16990 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16991 if (UsesTFC && Lane == TFCLane) {
16992 Users[Lane] = User;
16993 } else {
16994 // Set which texture component corresponds to the lane.
16995 unsigned Comp;
16996 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16997 Comp = llvm::countr_zero(Dmask);
16998 Dmask &= ~(1 << Comp);
16999 }
17000
17001 // Abort if we have more than one user per component.
17002 if (Users[Lane])
17003 return Node;
17004
17005 Users[Lane] = User;
17006 NewDmask |= 1 << Comp;
17007 }
17008 }
17009
17010 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17011 bool NoChannels = !NewDmask;
17012 if (NoChannels) {
17013 if (!UsesTFC) {
17014 // No uses of the result and not using TFC. Then do nothing.
17015 return Node;
17016 }
17017 // If the original dmask has one channel - then nothing to do
17018 if (OldBitsSet == 1)
17019 return Node;
17020 // Use an arbitrary dmask - required for the instruction to work
17021 NewDmask = 1;
17022 }
17023 // Abort if there's no change
17024 if (NewDmask == OldDmask)
17025 return Node;
17026
17027 unsigned BitsSet = llvm::popcount(NewDmask);
17028
17029 // Check for TFE or LWE - increase the number of channels by one to account
17030 // for the extra return value
17031 // This will need adjustment for D16 if this is also included in
17032 // adjustWriteMask (this function) but at present D16 are excluded.
17033 unsigned NewChannels = BitsSet + UsesTFC;
17034
17035 int NewOpcode =
17036 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17037 assert(NewOpcode != -1 &&
17038 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17039 "failed to find equivalent MIMG op");
17040
17041  // Adjust the writemask in the node
17042  SmallVector<SDValue, 12> Ops;
17043  llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17044 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17045 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17046
17047 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17048
17049 MVT ResultVT = NewChannels == 1
17050 ? SVT
17051 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17052 : NewChannels == 5 ? 8
17053 : NewChannels);
17054 SDVTList NewVTList =
17055 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17056
17057 MachineSDNode *NewNode =
17058 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17059
17060 if (HasChain) {
17061 // Update chain.
17062 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17063 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17064 }
17065
17066 if (NewChannels == 1) {
17067 assert(Node->hasNUsesOfValue(1, 0));
17068 SDNode *Copy =
17069 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17070 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17071 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17072 return nullptr;
17073 }
17074
17075 // Update the users of the node with the new indices
17076 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17077 SDNode *User = Users[i];
17078 if (!User) {
17079 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17080 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17081 if (i || !NoChannels)
17082 continue;
17083 } else {
17084 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17085 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17086 if (NewUser != User) {
17087 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17088 DAG.RemoveDeadNode(User);
17089 }
17090 }
17091
17092 switch (Idx) {
17093 default:
17094 break;
17095 case AMDGPU::sub0:
17096 Idx = AMDGPU::sub1;
17097 break;
17098 case AMDGPU::sub1:
17099 Idx = AMDGPU::sub2;
17100 break;
17101 case AMDGPU::sub2:
17102 Idx = AMDGPU::sub3;
17103 break;
17104 case AMDGPU::sub3:
17105 Idx = AMDGPU::sub4;
17106 break;
17107 }
17108 }
17109
17110 DAG.RemoveDeadNode(Node);
17111 return nullptr;
17112}
17113
17114static bool isFrameIndexOp(SDValue Op) {
17115  if (Op.getOpcode() == ISD::AssertZext)
17116 Op = Op.getOperand(0);
17117
17118 return isa<FrameIndexSDNode>(Op);
17119}
17120
17121/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17122/// with frame index operands.
17123/// LLVM assumes that inputs to these instructions are registers.
17124SDNode *
17125SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17126                                                SelectionDAG &DAG) const {
17127 if (Node->getOpcode() == ISD::CopyToReg) {
17128 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17129 SDValue SrcVal = Node->getOperand(2);
17130
17131 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17132 // to try understanding copies to physical registers.
17133 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17134      SDLoc SL(Node);
17135      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17136      SDValue VReg = DAG.getRegister(
17137 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17138
17139 SDNode *Glued = Node->getGluedNode();
17140 SDValue ToVReg = DAG.getCopyToReg(
17141 Node->getOperand(0), SL, VReg, SrcVal,
17142 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17143 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17144 VReg, ToVReg.getValue(1));
17145 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17146 DAG.RemoveDeadNode(Node);
17147 return ToResultReg.getNode();
17148 }
17149 }
17150  }
17151  SmallVector<SDValue, 8> Ops;
17152  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17153 if (!isFrameIndexOp(Node->getOperand(i))) {
17154 Ops.push_back(Node->getOperand(i));
17155 continue;
17156 }
17157
17158 SDLoc DL(Node);
17159 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17160 Node->getOperand(i).getValueType(),
17161 Node->getOperand(i)),
17162 0));
17163 }
17164
17165 return DAG.UpdateNodeOperands(Node, Ops);
17166}
17167
17168/// Fold the instructions after selecting them.
17169/// Returns null if users were already updated.
17170SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17171                                          SelectionDAG &DAG) const {
17172  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17173  unsigned Opcode = Node->getMachineOpcode();
17174
17175 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17176 !TII->isGather4(Opcode) &&
17177 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17178 return adjustWritemask(Node, DAG);
17179 }
17180
17181  if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17182    legalizeTargetIndependentNode(Node, DAG);
17183    return Node;
17184 }
17185
17186 switch (Opcode) {
17187 case AMDGPU::V_DIV_SCALE_F32_e64:
17188 case AMDGPU::V_DIV_SCALE_F64_e64: {
17189 // Satisfy the operand register constraint when one of the inputs is
17190 // undefined. Ordinarily each undef value will have its own implicit_def of
17191 // a vreg, so force these to use a single register.
17192 SDValue Src0 = Node->getOperand(1);
17193 SDValue Src1 = Node->getOperand(3);
17194 SDValue Src2 = Node->getOperand(5);
17195
17196 if ((Src0.isMachineOpcode() &&
17197 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17198 (Src0 == Src1 || Src0 == Src2))
17199 break;
17200
17201 MVT VT = Src0.getValueType().getSimpleVT();
17202 const TargetRegisterClass *RC =
17203 getRegClassFor(VT, Src0.getNode()->isDivergent());
17204
17206 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17207
17208 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17209 Src0, SDValue());
17210
17211 // src0 must be the same register as src1 or src2, even if the value is
17212 // undefined, so make sure we don't violate this constraint.
17213 if (Src0.isMachineOpcode() &&
17214 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17215 if (Src1.isMachineOpcode() &&
17216 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17217 Src0 = Src1;
17218 else if (Src2.isMachineOpcode() &&
17219 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17220 Src0 = Src2;
17221 else {
17222 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17223 Src0 = UndefReg;
17224 Src1 = UndefReg;
17225 }
17226 } else
17227 break;
17228
17229    SmallVector<SDValue, 9> Ops(Node->ops());
17230    Ops[1] = Src0;
17231 Ops[3] = Src1;
17232 Ops[5] = Src2;
17233 Ops.push_back(ImpDef.getValue(1));
17234 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17235 }
17236 default:
17237 break;
17238 }
17239
17240 return Node;
17241}
17242
17243// Any MIMG instructions that use tfe or lwe require an initialization of the
17244// result register that will be written in the case of a memory access failure.
17245// The required code is also added to tie this init code to the result of the
17246// img instruction.
17247void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17248  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17249  const SIRegisterInfo &TRI = TII->getRegisterInfo();
17250 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17251 MachineBasicBlock &MBB = *MI.getParent();
17252
17253 int DstIdx =
17254 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17255 unsigned InitIdx = 0;
17256
17257 if (TII->isImage(MI)) {
17258 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17259 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17260 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17261
17262 if (!TFE && !LWE) // intersect_ray
17263 return;
17264
17265 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17266 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17267 unsigned D16Val = D16 ? D16->getImm() : 0;
17268
17269 if (!TFEVal && !LWEVal)
17270 return;
17271
17272 // At least one of TFE or LWE are non-zero
17273 // We have to insert a suitable initialization of the result value and
17274 // tie this to the dest of the image instruction.
17275
17276 // Calculate which dword we have to initialize to 0.
17277 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17278
17279 // check that dmask operand is found.
17280 assert(MO_Dmask && "Expected dmask operand in instruction");
17281
17282 unsigned dmask = MO_Dmask->getImm();
17283 // Determine the number of active lanes taking into account the
17284 // Gather4 special case
17285 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17286
17287 bool Packed = !Subtarget->hasUnpackedD16VMem();
17288
17289 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17290
17291 // Abandon attempt if the dst size isn't large enough
17292 // - this is in fact an error but this is picked up elsewhere and
17293 // reported correctly.
17294 uint32_t DstSize =
17295 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17296 if (DstSize < InitIdx)
17297 return;
17298 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17299 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17300 } else {
17301 return;
17302 }
17303
17304 const DebugLoc &DL = MI.getDebugLoc();
17305
17306 // Create a register for the initialization value.
17307 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17308 unsigned NewDst = 0; // Final initialized value will be in here
17309
17310 // If PRTStrictNull feature is enabled (the default) then initialize
17311 // all the result registers to 0, otherwise just the error indication
17312 // register (VGPRn+1)
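  // In other words: with PRTStrictNull every result dword starts out as zero,
  // while without it only the trailing TFE/LWE status dword is cleared.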
17313 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17314 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17315
17316 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17317 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17318 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17319 // Initialize dword
17320 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17321 // clang-format off
17322 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17323 .addImm(0);
17324 // clang-format on
17325 // Insert into the super-reg
17326 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17327 .addReg(PrevDst)
17328        .addReg(SubReg)
17329        .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17330
17331 PrevDst = NewDst;
17332 }
17333
17334 // Add as an implicit operand
17335 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17336
17337 // Tie the just added implicit operand to the dst
17338 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17339}
17340
17341/// Assign the register class depending on the number of
17342/// bits set in the writemask
17343void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17344                                                     SDNode *Node) const {
17345  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17346
17347  MachineFunction *MF = MI.getParent()->getParent();
17348  MachineRegisterInfo &MRI = MF->getRegInfo();
17349  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17350
17351 if (TII->isVOP3(MI.getOpcode())) {
17352 // Make sure constant bus requirements are respected.
17353 TII->legalizeOperandsVOP3(MRI, MI);
17354
17355 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17356 // This saves a chain-copy of registers and better balance register
17357 // use between vgpr and agpr as agpr tuples tend to be big.
17358 if (!MI.getDesc().operands().empty()) {
17359 unsigned Opc = MI.getOpcode();
17360 bool HasAGPRs = Info->mayNeedAGPRs();
17361 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17362 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17363 for (auto I :
17364 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17365 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17366 if (I == -1)
17367 break;
17368 if ((I == Src2Idx) && (HasAGPRs))
17369 break;
17370 MachineOperand &Op = MI.getOperand(I);
17371 if (!Op.isReg() || !Op.getReg().isVirtual())
17372 continue;
17373 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17374 if (!TRI->hasAGPRs(RC))
17375 continue;
17376 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17377 if (!Src || !Src->isCopy() ||
17378 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17379 continue;
17380 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17381 // All uses of agpr64 and agpr32 can also accept vgpr except for
17382 // v_accvgpr_read, but we do not produce agpr reads during selection,
17383 // so no use checks are needed.
17384 MRI.setRegClass(Op.getReg(), NewRC);
17385 }
17386
17387 if (TII->isMAI(MI)) {
17388 // The ordinary src0, src1, src2 were legalized above.
17389 //
17390 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17391 // as a separate instruction.
17392 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17393 AMDGPU::OpName::scale_src0);
17394 if (Src0Idx != -1) {
17395 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17396 AMDGPU::OpName::scale_src1);
17397 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17398 TII->usesConstantBus(MRI, MI, Src1Idx))
17399 TII->legalizeOpWithMove(MI, Src1Idx);
17400 }
17401 }
17402
17403 if (!HasAGPRs)
17404 return;
17405
17406 // Resolve the rest of AV operands to AGPRs.
17407 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17408 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17409 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17410 if (TRI->isVectorSuperClass(RC)) {
17411 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17412 MRI.setRegClass(Src2->getReg(), NewRC);
17413 if (Src2->isTied())
17414 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17415 }
17416 }
17417 }
17418 }
17419
17420 return;
17421 }
17422
17423 if (TII->isImage(MI))
17424 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17425}
17426
17427static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17428                              uint64_t Val) {
17429 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17430 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17431}
17432
17433MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17434                                                const SDLoc &DL,
17435                                                SDValue Ptr) const {
17436  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17437
17438 // Build the half of the subregister with the constants before building the
17439 // full 128-bit register. If we are building multiple resource descriptors,
17440 // this will allow CSEing of the 2-component register.
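  // That is, the constant (0, DefaultRsrcDataFormat >> 32) pair is built as its
  // own v2i32 REG_SEQUENCE so that multiple descriptors can CSE it.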
17441 const SDValue Ops0[] = {
17442 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17443 buildSMovImm32(DAG, DL, 0),
17444 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17445 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17446 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17447
17448 SDValue SubRegHi = SDValue(
17449 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17450
17451 // Combine the constants and the pointer.
17452 const SDValue Ops1[] = {
17453 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17454 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17455 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17456
17457 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17458}
17459
17460/// Return a resource descriptor with the 'Add TID' bit enabled
17461/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17462/// of the resource descriptor) to create an offset, which is added to
17463/// the resource pointer.
17464MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17465                                           SDValue Ptr, uint32_t RsrcDword1,
17466 uint64_t RsrcDword2And3) const {
17467 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17468 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17469 if (RsrcDword1) {
17470 PtrHi =
17471 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17472 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17473 0);
17474 }
17475
17476 SDValue DataLo =
17477 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17478 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17479
17480 const SDValue Ops[] = {
17481 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17482 PtrLo,
17483 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17484 PtrHi,
17485 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17486 DataLo,
17487 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17488 DataHi,
17489 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17490
17491 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17492}
17493
17494//===----------------------------------------------------------------------===//
17495// SI Inline Assembly Support
17496//===----------------------------------------------------------------------===//
17497
17498std::pair<unsigned, const TargetRegisterClass *>
17500 StringRef Constraint,
17501 MVT VT) const {
17502 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17503
17504 const TargetRegisterClass *RC = nullptr;
17505 if (Constraint.size() == 1) {
17506    // Check if we cannot determine the bit size of the given value type. This
17507    // can happen, for example, when we have an empty struct (size 0):
17508    //   `call void asm "", "v"({} poison)`
17509 if (VT == MVT::Other)
17510 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17511 const unsigned BitWidth = VT.getSizeInBits();
17512 switch (Constraint[0]) {
17513 default:
17514 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17515 case 's':
17516 case 'r':
17517 switch (BitWidth) {
17518 case 16:
17519 RC = &AMDGPU::SReg_32RegClass;
17520 break;
17521 case 64:
17522 RC = &AMDGPU::SGPR_64RegClass;
17523 break;
17524      default:
17525        RC = TRI->getSGPRClassForBitWidth(BitWidth);
17526        if (!RC)
17527 return std::pair(0U, nullptr);
17528 break;
17529 }
17530 break;
17531 case 'v':
17532 switch (BitWidth) {
17533 case 16:
17534 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17535 : &AMDGPU::VGPR_32_Lo256RegClass;
17536 break;
17537 default:
17538 RC = Subtarget->has1024AddressableVGPRs()
17539 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17540 : TRI->getVGPRClassForBitWidth(BitWidth);
17541 if (!RC)
17542 return std::pair(0U, nullptr);
17543 break;
17544 }
17545 break;
17546 case 'a':
17547 if (!Subtarget->hasMAIInsts())
17548 break;
17549 switch (BitWidth) {
17550 case 16:
17551 RC = &AMDGPU::AGPR_32RegClass;
17552 break;
17553 default:
17554 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17555 if (!RC)
17556 return std::pair(0U, nullptr);
17557 break;
17558 }
17559 break;
17560 }
17561 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17562 const unsigned BitWidth = VT.getSizeInBits();
17563 switch (BitWidth) {
17564 case 16:
17565 RC = &AMDGPU::AV_32RegClass;
17566 break;
17567 default:
17568 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17569 if (!RC)
17570 return std::pair(0U, nullptr);
17571 break;
17572 }
17573 }
17574
17575 // We actually support i128, i16 and f16 as inline parameters
17576 // even if they are not reported as legal
17577 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17578 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17579 return std::pair(0U, RC);
17580
17581 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17582 if (Kind != '\0') {
17583 if (Kind == 'v') {
17584 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17585 } else if (Kind == 's') {
17586 RC = &AMDGPU::SGPR_32RegClass;
17587 } else if (Kind == 'a') {
17588 RC = &AMDGPU::AGPR_32RegClass;
17589 }
17590
17591 if (RC) {
17592 if (NumRegs > 1) {
17593 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17594 return std::pair(0U, nullptr);
17595
17596 uint32_t Width = NumRegs * 32;
17597 // Prohibit constraints for register ranges with a width that does not
17598 // match the required type.
17599 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17600 return std::pair(0U, nullptr);
17601
17602      MCRegister Reg = RC->getRegister(Idx);
17603      if (SIRegisterInfo::isVGPRClass(RC))
17604        RC = TRI->getVGPRClassForBitWidth(Width);
17605 else if (SIRegisterInfo::isSGPRClass(RC))
17606 RC = TRI->getSGPRClassForBitWidth(Width);
17607 else if (SIRegisterInfo::isAGPRClass(RC))
17608 RC = TRI->getAGPRClassForBitWidth(Width);
17609 if (RC) {
17610 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17611 if (!Reg) {
17612 // The register class does not contain the requested register,
17613 // e.g., because it is an SGPR pair that would violate alignment
17614 // requirements.
17615 return std::pair(0U, nullptr);
17616 }
17617 return std::pair(Reg, RC);
17618 }
17619 }
17620
17621 // Check for lossy scalar/vector conversions.
17622 if (VT.isVector() && VT.getSizeInBits() != 32)
17623 return std::pair(0U, nullptr);
17624 if (Idx < RC->getNumRegs())
17625 return std::pair(RC->getRegister(Idx), RC);
17626 return std::pair(0U, nullptr);
17627 }
17628 }
17629
17630 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17631 if (Ret.first)
17632 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17633
17634 return Ret;
17635}
17636
17637static bool isImmConstraint(StringRef Constraint) {
17638 if (Constraint.size() == 1) {
17639 switch (Constraint[0]) {
17640 default:
17641 break;
17642 case 'I':
17643 case 'J':
17644 case 'A':
17645 case 'B':
17646 case 'C':
17647 return true;
17648 }
17649 } else if (Constraint == "DA" || Constraint == "DB") {
17650 return true;
17651 }
17652 return false;
17653}
17654
17655SITargetLowering::ConstraintType
17656SITargetLowering::getConstraintType(StringRef Constraint) const {
17657  if (Constraint.size() == 1) {
17658 switch (Constraint[0]) {
17659 default:
17660 break;
17661 case 's':
17662 case 'v':
17663 case 'a':
17664 return C_RegisterClass;
17665 }
17666 } else if (Constraint.size() == 2) {
17667 if (Constraint == "VA")
17668 return C_RegisterClass;
17669 }
17670 if (isImmConstraint(Constraint)) {
17671 return C_Other;
17672 }
17673 return TargetLowering::getConstraintType(Constraint);
17674}
17675
17676static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17677  if (Size != 64) {
17678    Val = Val & maskTrailingOnes<uint64_t>(Size);
17679  }
17680 return Val;
17681}
17682
17683void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17684                                                    StringRef Constraint,
17685 std::vector<SDValue> &Ops,
17686 SelectionDAG &DAG) const {
17687 if (isImmConstraint(Constraint)) {
17688 uint64_t Val;
17689 if (getAsmOperandConstVal(Op, Val) &&
17690 checkAsmConstraintVal(Op, Constraint, Val)) {
17691 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17692 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17693 }
17694  } else {
17695    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17696  }
17697}
17698
17699bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17700  unsigned Size = Op.getScalarValueSizeInBits();
17701 if (Size > 64)
17702 return false;
17703
17704 if (Size == 16 && !Subtarget->has16BitInsts())
17705 return false;
17706
17707  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17708    Val = C->getSExtValue();
17709 return true;
17710  }
17711  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17712    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17713 return true;
17714  }
17715  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17716    if (Size != 16 || Op.getNumOperands() != 2)
17717 return false;
17718 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17719 return false;
17720 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17721 Val = C->getSExtValue();
17722 return true;
17723 }
17724 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17725 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17726 return true;
17727 }
17728 }
17729
17730 return false;
17731}
17732
17733bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17734                                             uint64_t Val) const {
17735 if (Constraint.size() == 1) {
17736 switch (Constraint[0]) {
17737    case 'I':
17738      return AMDGPU::isInlinableIntLiteral(Val);
17739    case 'J':
17740 return isInt<16>(Val);
17741 case 'A':
17742 return checkAsmConstraintValA(Op, Val);
17743 case 'B':
17744 return isInt<32>(Val);
17745 case 'C':
17746      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17747             AMDGPU::isInlinableIntLiteral(Val);
17748    default:
17749 break;
17750 }
17751 } else if (Constraint.size() == 2) {
17752 if (Constraint == "DA") {
17753 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17754 int64_t LoBits = static_cast<int32_t>(Val);
17755 return checkAsmConstraintValA(Op, HiBits, 32) &&
17756 checkAsmConstraintValA(Op, LoBits, 32);
17757 }
17758 if (Constraint == "DB") {
17759 return true;
17760 }
17761 }
17762 llvm_unreachable("Invalid asm constraint");
17763}
17764
17766 unsigned MaxSize) const {
17767 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17768 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17769 if (Size == 16) {
17770 MVT VT = Op.getSimpleValueType();
17771 switch (VT.SimpleTy) {
17772 default:
17773 return false;
17774 case MVT::i16:
17775 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17776 case MVT::f16:
17777 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17778 case MVT::bf16:
17779 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17780 case MVT::v2i16:
17781 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17782 case MVT::v2f16:
17783 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17784 case MVT::v2bf16:
17785 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17786 }
17787 }
17788 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17789 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17790 return true;
17791 return false;
17792}
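// Editorial note (illustrative, not part of the original source): for a 32-bit
// operand the AMDGPU inline constants cover the integers -16..64 plus a small
// set of floating-point values, so Val = 64 passes the check above while
// Val = 65 would require a literal encoding and fails it.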
17793
17794static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17795 switch (UnalignedClassID) {
17796 case AMDGPU::VReg_64RegClassID:
17797 return AMDGPU::VReg_64_Align2RegClassID;
17798 case AMDGPU::VReg_96RegClassID:
17799 return AMDGPU::VReg_96_Align2RegClassID;
17800 case AMDGPU::VReg_128RegClassID:
17801 return AMDGPU::VReg_128_Align2RegClassID;
17802 case AMDGPU::VReg_160RegClassID:
17803 return AMDGPU::VReg_160_Align2RegClassID;
17804 case AMDGPU::VReg_192RegClassID:
17805 return AMDGPU::VReg_192_Align2RegClassID;
17806 case AMDGPU::VReg_224RegClassID:
17807 return AMDGPU::VReg_224_Align2RegClassID;
17808 case AMDGPU::VReg_256RegClassID:
17809 return AMDGPU::VReg_256_Align2RegClassID;
17810 case AMDGPU::VReg_288RegClassID:
17811 return AMDGPU::VReg_288_Align2RegClassID;
17812 case AMDGPU::VReg_320RegClassID:
17813 return AMDGPU::VReg_320_Align2RegClassID;
17814 case AMDGPU::VReg_352RegClassID:
17815 return AMDGPU::VReg_352_Align2RegClassID;
17816 case AMDGPU::VReg_384RegClassID:
17817 return AMDGPU::VReg_384_Align2RegClassID;
17818 case AMDGPU::VReg_512RegClassID:
17819 return AMDGPU::VReg_512_Align2RegClassID;
17820 case AMDGPU::VReg_1024RegClassID:
17821 return AMDGPU::VReg_1024_Align2RegClassID;
17822 case AMDGPU::AReg_64RegClassID:
17823 return AMDGPU::AReg_64_Align2RegClassID;
17824 case AMDGPU::AReg_96RegClassID:
17825 return AMDGPU::AReg_96_Align2RegClassID;
17826 case AMDGPU::AReg_128RegClassID:
17827 return AMDGPU::AReg_128_Align2RegClassID;
17828 case AMDGPU::AReg_160RegClassID:
17829 return AMDGPU::AReg_160_Align2RegClassID;
17830 case AMDGPU::AReg_192RegClassID:
17831 return AMDGPU::AReg_192_Align2RegClassID;
17832 case AMDGPU::AReg_256RegClassID:
17833 return AMDGPU::AReg_256_Align2RegClassID;
17834 case AMDGPU::AReg_512RegClassID:
17835 return AMDGPU::AReg_512_Align2RegClassID;
17836 case AMDGPU::AReg_1024RegClassID:
17837 return AMDGPU::AReg_1024_Align2RegClassID;
17838 default:
17839 return -1;
17840 }
17841}
17842
17843// Figure out which registers should be reserved for stack access. Only after
17844// the function is legalized do we know all of the non-spill stack objects or if
17845// calls are present.
17849 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17850 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17851 const SIInstrInfo *TII = ST.getInstrInfo();
17852
17853 if (Info->isEntryFunction()) {
17854 // Callable functions have fixed registers used for stack access.
17856 }
17857
17858 // TODO: Move this logic to getReservedRegs()
17859 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17860 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17861 Register SReg = ST.isWave32()
17862 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17863 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17864 &AMDGPU::SGPR_64RegClass);
17865 Info->setSGPRForEXECCopy(SReg);
17866
17867 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17868 Info->getStackPtrOffsetReg()));
17869 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17870 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17871
17872 // We need to worry about replacing the default register with itself in case
17873 // of MIR testcases missing the MFI.
17874 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17875 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17876
17877 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17878 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17879
17880 Info->limitOccupancy(MF);
17881
17882 if (ST.isWave32() && !MF.empty()) {
17883 for (auto &MBB : MF) {
17884 for (auto &MI : MBB) {
17885 TII->fixImplicitOperands(MI);
17886 }
17887 }
17888 }
17889
17890 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17891 // classes if required. Ideally the register class constraints would differ
17892 // per-subtarget, but there's no easy way to achieve that right now. This is
17893 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17894 // from using them as the register class for legal types.
17895 if (ST.needsAlignedVGPRs()) {
17896 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17897 const Register Reg = Register::index2VirtReg(I);
17898 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17899 if (!RC)
17900 continue;
17901 int NewClassID = getAlignedAGPRClassID(RC->getID());
17902 if (NewClassID != -1)
17903 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17904 }
17905 }
17906
17908}
17909
17911 KnownBits &Known,
17912 const APInt &DemandedElts,
17913 const SelectionDAG &DAG,
17914 unsigned Depth) const {
17915 Known.resetAll();
17916 unsigned Opc = Op.getOpcode();
17917 switch (Opc) {
17919 unsigned IID = Op.getConstantOperandVal(0);
17920 switch (IID) {
17921 case Intrinsic::amdgcn_mbcnt_lo:
17922 case Intrinsic::amdgcn_mbcnt_hi: {
17923 const GCNSubtarget &ST =
17925 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17926 // most 31 + src1.
17927 Known.Zero.setBitsFrom(
17928 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17929 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17930 Known = KnownBits::add(Known, Known2);
17931 return;
17932 }
17933 }
17934 break;
17935 }
17936 }
17938 Op, Known, DemandedElts, DAG, Depth);
17939}
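// Editorial note (illustrative, not part of the original source): a worked
// example of the mbcnt reasoning above: on a wave32 target, mbcnt_lo itself
// contributes at most 31 (bits 5 and above are zero), so if src1 is the
// constant 0 the upper 27 bits of the 32-bit result are known to be zero.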
17940
17942 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17944
17945 // Set the high bits to zero based on the maximum allowed scratch size per
17946 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17947 // calculation won't overflow, so assume the sign bit is never set.
17948 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17949}
17950
17952 GISelValueTracking &VT, KnownBits &Known,
17953 unsigned Dim) {
17954 unsigned MaxValue =
17955 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17956 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17957}
17958
17960 KnownBits &Known, const APInt &DemandedElts,
17961 unsigned BFEWidth, bool SExt, unsigned Depth) {
17963 const MachineOperand &Src1 = MI.getOperand(2);
17964
17965 unsigned Src1Cst = 0;
17966 if (Src1.isImm()) {
17967 Src1Cst = Src1.getImm();
17968 } else if (Src1.isReg()) {
17969 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17970 if (!Cst)
17971 return;
17972 Src1Cst = Cst->Value.getZExtValue();
17973 } else {
17974 return;
17975 }
17976
17977 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17978 // Width is always [22:16].
17979 const unsigned Offset =
17980 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17981 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17982
17983 if (Width >= BFEWidth) // Ill-formed.
17984 return;
17985
17986 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17987 Depth + 1);
17988
17989 Known = Known.extractBits(Width, Offset);
17990
17991 if (SExt)
17992 Known = Known.sext(BFEWidth);
17993 else
17994 Known = Known.zext(BFEWidth);
17995}
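// Editorial note (illustrative, not part of the original source): a worked
// example of the decoding above: for S_BFE_U32 with Src1 = 0x00080008 the
// offset is 8 and the width is 8, so the known bits of the result are bits
// [15:8] of the first source, zero-extended back to 32 bits.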
17996
17998 GISelValueTracking &VT, Register R, KnownBits &Known,
17999 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18000 unsigned Depth) const {
18001 Known.resetAll();
18002 const MachineInstr *MI = MRI.getVRegDef(R);
18003 switch (MI->getOpcode()) {
18004 case AMDGPU::S_BFE_I32:
18005 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18006 /*SExt=*/true, Depth);
18007 case AMDGPU::S_BFE_U32:
18008 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18009 /*SExt=*/false, Depth);
18010 case AMDGPU::S_BFE_I64:
18011 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18012 /*SExt=*/true, Depth);
18013 case AMDGPU::S_BFE_U64:
18014 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18015 /*SExt=*/false, Depth);
18016 case AMDGPU::G_INTRINSIC:
18017 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18018 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18019 switch (IID) {
18020 case Intrinsic::amdgcn_workitem_id_x:
18021 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18022 break;
18023 case Intrinsic::amdgcn_workitem_id_y:
18024 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18025 break;
18026 case Intrinsic::amdgcn_workitem_id_z:
18027 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18028 break;
18029 case Intrinsic::amdgcn_mbcnt_lo:
18030 case Intrinsic::amdgcn_mbcnt_hi: {
18031 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18032 // most 31 + src1.
18033 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18034 ? getSubtarget()->getWavefrontSizeLog2()
18035 : 5);
18036 KnownBits Known2;
18037 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18038 Depth + 1);
18039 Known = KnownBits::add(Known, Known2);
18040 break;
18041 }
18042 case Intrinsic::amdgcn_groupstaticsize: {
18043 // We can report everything over the maximum size as 0. We can't report
18044 // based on the actual size because we don't know if it's accurate or not
18045 // at any given point.
18046 Known.Zero.setHighBits(
18047 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18048 break;
18049 }
18050 }
18051 break;
18052 }
18053 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18054 Known.Zero.setHighBits(24);
18055 break;
18056 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18057 Known.Zero.setHighBits(16);
18058 break;
18059 case AMDGPU::G_AMDGPU_SMED3:
18060 case AMDGPU::G_AMDGPU_UMED3: {
18061 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18062
18063 KnownBits Known2;
18064 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18065 if (Known2.isUnknown())
18066 break;
18067
18068 KnownBits Known1;
18069 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18070 if (Known1.isUnknown())
18071 break;
18072
18073 KnownBits Known0;
18074 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18075 if (Known0.isUnknown())
18076 break;
18077
18078 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18079 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18080 Known.One = Known0.One & Known1.One & Known2.One;
18081 break;
18082 }
18083 }
18084}
18085
18088 unsigned Depth) const {
18089 const MachineInstr *MI = MRI.getVRegDef(R);
18090 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18091 // FIXME: Can this move to generic code? What about the case where the call
18092 // site specifies a lower alignment?
18093 Intrinsic::ID IID = GI->getIntrinsicID();
18095 AttributeList Attrs =
18096 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18097 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18098 return *RetAlign;
18099 }
18100 return Align(1);
18101}
18102
18105 const Align CacheLineAlign = Align(64);
18106
18107 // Pre-GFX10 targets did not benefit from loop alignment.
18108 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18109 getSubtarget()->hasInstFwdPrefetchBug())
18110 return PrefAlign;
18111
18112 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
18113 // By default the prefetcher keeps one cache line behind and reads two ahead.
18114 // We can modify it with S_INST_PREFETCH for larger loops to keep two lines
18115 // behind and read one ahead.
18116 // Therefore we can benefit from aligning loop headers if the loop fits in
18117 // 192 bytes. If the loop fits in 64 bytes it never spans more than two
18118 // cache lines and does not need alignment.
18119 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
18120 // prefetch settings; if it is at most 192 bytes we need two lines behind.
18121
18123 const MachineBasicBlock *Header = ML->getHeader();
18124 if (Header->getAlignment() != PrefAlign)
18125 return Header->getAlignment(); // Already processed.
18126
18127 unsigned LoopSize = 0;
18128 for (const MachineBasicBlock *MBB : ML->blocks()) {
18129 // If an inner loop block is aligned, assume on average half of the
18130 // alignment size is added as nops.
18131 if (MBB != Header)
18132 LoopSize += MBB->getAlignment().value() / 2;
18133
18134 for (const MachineInstr &MI : *MBB) {
18135 LoopSize += TII->getInstSizeInBytes(MI);
18136 if (LoopSize > 192)
18137 return PrefAlign;
18138 }
18139 }
18140
18141 if (LoopSize <= 64)
18142 return PrefAlign;
18143
18144 if (LoopSize <= 128)
18145 return CacheLineAlign;
18146
18147 // If any of the parent loops is surrounded by prefetch instructions, do not
18148 // insert new ones for the inner loop, as that would reset the parent's settings.
18149 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18150 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18151 auto I = Exit->getFirstNonDebugInstr();
18152 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18153 return CacheLineAlign;
18154 }
18155 }
18156
18157 MachineBasicBlock *Pre = ML->getLoopPreheader();
18158 MachineBasicBlock *Exit = ML->getExitBlock();
18159
18160 if (Pre && Exit) {
18161 auto PreTerm = Pre->getFirstTerminator();
18162 if (PreTerm == Pre->begin() ||
18163 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18164 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18165 .addImm(1); // prefetch 2 lines behind PC
18166
18167 auto ExitHead = Exit->getFirstNonDebugInstr();
18168 if (ExitHead == Exit->end() ||
18169 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18170 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18171 .addImm(2); // prefetch 1 line behind PC
18172 }
18173
18174 return CacheLineAlign;
18175}
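// Editorial note (illustrative, not part of the original source): applying the
// policy above to example sizes: a 48-byte loop keeps the default alignment, a
// 100-byte loop is aligned to the 64-byte cache line with no prefetch change,
// and a 160-byte loop is aligned and additionally bracketed with
// S_INST_PREFETCH (two lines behind before the loop, one line behind after it).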
18176
18178static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18179 assert(N->getOpcode() == ISD::CopyFromReg);
18180 do {
18181 // Follow the chain until we find an INLINEASM node.
18182 N = N->getOperand(0).getNode();
18183 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18184 return true;
18185 } while (N->getOpcode() == ISD::CopyFromReg);
18186 return false;
18187}
18188
18191 UniformityInfo *UA) const {
18192 switch (N->getOpcode()) {
18193 case ISD::CopyFromReg: {
18194 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18195 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18196 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18197 Register Reg = R->getReg();
18198
18199 // FIXME: Why does this need to consider isLiveIn?
18200 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18201 return !TRI->isSGPRReg(MRI, Reg);
18202
18203 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18204 return UA->isDivergent(V);
18205
18207 return !TRI->isSGPRReg(MRI, Reg);
18208 }
18209 case ISD::LOAD: {
18210 const LoadSDNode *L = cast<LoadSDNode>(N);
18211 unsigned AS = L->getAddressSpace();
18212 // A flat load may access private memory.
18214 }
18215 case ISD::CALLSEQ_END:
18216 return true;
18218 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18220 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18239 // Target-specific read-modify-write atomics are sources of divergence.
18240 return true;
18241 default:
18242 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18243 // Generic read-modify-write atomics are sources of divergence.
18244 return A->readMem() && A->writeMem();
18245 }
18246 return false;
18247 }
18248}
18249
18251 EVT VT) const {
18252 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18253 case MVT::f32:
18255 case MVT::f64:
18256 case MVT::f16:
18258 default:
18259 return false;
18260 }
18261}
18262
18264 LLT Ty, const MachineFunction &MF) const {
18265 switch (Ty.getScalarSizeInBits()) {
18266 case 32:
18267 return !denormalModeIsFlushAllF32(MF);
18268 case 64:
18269 case 16:
18270 return !denormalModeIsFlushAllF64F16(MF);
18271 default:
18272 return false;
18273 }
18274}
18275
18277 const APInt &DemandedElts,
18278 const SelectionDAG &DAG,
18279 bool SNaN,
18280 unsigned Depth) const {
18281 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18282 const MachineFunction &MF = DAG.getMachineFunction();
18284
18285 if (Info->getMode().DX10Clamp)
18286 return true; // Clamped to 0.
18287 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18288 }
18289
18291 DAG, SNaN, Depth);
18292}
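// Editorial note (illustrative, not part of the original source): as an
// example of the DX10Clamp case above, with DX10 clamping enabled
// AMDGPUISD::CLAMP maps a NaN input to 0.0, so the clamped value can be
// reported as never-NaN without inspecting the operand.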
18293
18294 // On older subtargets, global FP atomic instructions have a hardcoded FP mode:
18295 // they do not support FP32 denormals and only support v2f16/f64 denormals.
18297 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18298 return true;
18299
18301 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18302 if (DenormMode == DenormalMode::getPreserveSign())
18303 return true;
18304
18305 // TODO: Remove this.
18306 return RMW->getFunction()
18307 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18308 .getValueAsBool();
18309}
18310
18312 LLVMContext &Ctx = RMW->getContext();
18313 StringRef MemScope =
18314 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18315
18316 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18317 << "Hardware instruction generated for atomic "
18318 << RMW->getOperationName(RMW->getOperation())
18319 << " operation at memory scope " << MemScope;
18320}
18321
18322static bool isV2F16OrV2BF16(Type *Ty) {
18323 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18324 Type *EltTy = VT->getElementType();
18325 return VT->getNumElements() == 2 &&
18326 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18327 }
18328
18329 return false;
18330}
18331
18332static bool isV2F16(Type *Ty) {
18334 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18335}
18336
18337static bool isV2BF16(Type *Ty) {
18339 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18340}
18341
18342/// \return true if atomicrmw integer ops work for the type.
18343static bool isAtomicRMWLegalIntTy(Type *Ty) {
18344 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18345 unsigned BW = IT->getBitWidth();
18346 return BW == 32 || BW == 64;
18347 }
18348
18349 return false;
18350}
18351
18352/// \return true if this atomicrmw xchg type can be selected.
18353static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18354 Type *Ty = RMW->getType();
18355 if (isAtomicRMWLegalIntTy(Ty))
18356 return true;
18357
18358 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18359 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18360 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18361 return BW == 32 || BW == 64;
18362 }
18363
18364 if (Ty->isFloatTy() || Ty->isDoubleTy())
18365 return true;
18366
18368 return VT->getNumElements() == 2 &&
18369 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18370 }
18371
18372 return false;
18373}
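// Editorial note (illustrative, not part of the original source): per the
// checks above, an atomicrmw xchg is selectable for i32/i64, pointers with
// 32- or 64-bit representations, float, double, and two-element vectors of
// 16-bit elements such as <2 x half> or <2 x bfloat>.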
18374
18375/// \returns true if it's valid to emit a native instruction for \p RMW, based
18376/// on the properties of the target memory.
18377static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18378 const AtomicRMWInst *RMW,
18379 bool HasSystemScope) {
18380 // The remote/fine-grained access logic is different from the integer
18381 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18382 // fine-grained access does not work, even for a device local allocation.
18383 //
18384 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18385 // allocations work.
18386 if (HasSystemScope) {
18388 RMW->hasMetadata("amdgpu.no.remote.memory"))
18389 return true;
18390 if (Subtarget.hasEmulatedSystemScopeAtomics())
18391 return true;
18393 return true;
18394
18395 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18396}
18397
18398/// \return Action to perform on AtomicRMWInsts for integer operations.
18405
18406/// Return if a flat address space atomicrmw can access private memory.
18408 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18409 return !MD ||
18411}
18412
18420
18423 unsigned AS = RMW->getPointerAddressSpace();
18424 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18426
18427 // 64-bit flat atomics that dynamically reside in private memory will silently
18428 // be dropped.
18429 //
18430 // Note that we will emit a new copy of the original atomic in the expansion,
18431 // which will be incrementally relegalized.
18432 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18433 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18434 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18437
18438 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18440 ORE.emit([=]() {
18441 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18442 });
18443 return Kind;
18444 };
18445
18446 auto SSID = RMW->getSyncScopeID();
18447 bool HasSystemScope =
18448 SSID == SyncScope::System ||
18449 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18450
18451 auto Op = RMW->getOperation();
18452 switch (Op) {
18454 // PCIe supports add and xchg for system atomics.
18455 return isAtomicRMWLegalXChgTy(RMW)
18458 case AtomicRMWInst::Add:
18459 // PCIe supports add and xchg for system atomics.
18461 case AtomicRMWInst::Sub:
18462 case AtomicRMWInst::And:
18463 case AtomicRMWInst::Or:
18464 case AtomicRMWInst::Xor:
18465 case AtomicRMWInst::Max:
18466 case AtomicRMWInst::Min:
18473 if (Subtarget->hasEmulatedSystemScopeAtomics())
18475
18476 // On most subtargets, for atomicrmw operations other than add/xchg,
18477 // whether or not the instructions will behave correctly depends on where
18478 // the address physically resides and what interconnect is used in the
18479 // system configuration. On some targets the instruction will nop,
18480 // and in others synchronization will only occur at degraded device scope.
18481 //
18482 // If the allocation is known local to the device, the instructions should
18483 // work correctly.
18484 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18486
18487 // If fine-grained remote memory works at device scope, we don't need to
18488 // do anything.
18489 if (!HasSystemScope &&
18490 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18492
18493 // If we are targeting a remote allocated address, it depends what kind of
18494 // allocation the address belongs to.
18495 //
18496 // If the allocation is fine-grained (in host memory, or in PCIe peer
18497 // device memory), the operation will fail depending on the target.
18498 //
18499 // Note that fine-grained host memory access does work on APUs or if XGMI is
18500 // used, but we cannot tell whether we are targeting an APU or such a system
18501 // configuration from the ISA version/target-cpu.
18502 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18504
18507 // Atomic sub/or/xor do not work over PCI express, but atomic add does.
18508 // InstCombine canonicalizes these operations with a zero operand to or, so undo that.
18509 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18510 ConstVal && ConstVal->isNullValue())
18512 }
18513
18514 // If the allocation could be in remote, fine-grained memory, the rmw
18515 // instructions may fail. cmpxchg should work, so emit that. On some
18516 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18517 // even work, so you're out of luck anyway.
18518
18519 // In summary:
18520 //
18521 // Cases that may fail:
18522 // - fine-grained pinned host memory
18523 // - fine-grained migratable host memory
18524 // - fine-grained PCIe peer device
18525 //
18526 // Cases that should work, but may be treated overly conservatively.
18527 // - fine-grained host memory on an APU
18528 // - fine-grained XGMI peer device
18530 }
18531
18533 }
18534 case AtomicRMWInst::FAdd: {
18535 Type *Ty = RMW->getType();
18536
18537 // TODO: Handle REGION_ADDRESS
18538 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18539 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18540 // is fixed to round-to-nearest-even.
18541 //
18542 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18543 // round-to-nearest-even.
18544 //
18545 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18546 // suggests it is OK if the floating-point mode may not match the calling
18547 // thread.
18548 if (Ty->isFloatTy()) {
18549 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18551 }
18552
18553 if (Ty->isDoubleTy()) {
18554 // Ignores denormal mode, but we don't consider flushing mandatory.
18555 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18557 }
18558
18559 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18561
18563 }
18564
18565 // LDS atomics respect the denormal mode from the mode register.
18566 //
18567 // Traditionally f32 global/buffer memory atomics would unconditionally
18568 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18569 // flush.
18570 //
18571 // On targets with flat atomic fadd, denormals would flush depending on
18572 // whether the target address resides in LDS or global memory. We consider
18573 // this flat-maybe-flush as will-flush.
18574 if (Ty->isFloatTy() &&
18575 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18578
18579 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18580 // safe. The message phrasing also should be better.
18581 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18582 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18583 // gfx942, gfx12
18584 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18585 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18586 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18587 // gfx90a, gfx942, gfx12
18588 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18589 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18590
18591 // gfx942, gfx12
18592 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18593 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18594 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18595 // gfx90a, gfx942, gfx12
18596 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18597 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18598
18599 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18600 // buffer. gfx12 does have the buffer version.
18601 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18602 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18603 }
18604
18605 // global and flat atomic fadd f64: gfx90a, gfx942.
18606 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18607 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18608
18609 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18610 if (Ty->isFloatTy()) {
18611 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18612 // gfx11+.
18613 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18614 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18615 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18616 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18617 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18618 } else {
18619 // gfx908
18620 if (RMW->use_empty() &&
18621 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18622 isV2F16(Ty))
18623 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18624 }
18625 }
18626
18627 // flat atomic fadd f32: gfx942, gfx11+.
18628 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18629 if (Subtarget->hasFlatAtomicFaddF32Inst())
18630 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18631
18632 // If the address is in the flat address space and the type is float, we
18633 // try to expand it when the target supports both global and LDS atomic
18634 // fadd. This works because the expansion emits an address-space check:
18635 // if the address is in the global address space we emit the global atomic
18636 // fadd, and if it is in the shared address space we emit the LDS atomic
18637 // fadd.
18638 if (Subtarget->hasLDSFPAtomicAddF32()) {
18639 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18641 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18643 }
18644 }
18645 }
18646
18648 }
18650 case AtomicRMWInst::FMax: {
18651 Type *Ty = RMW->getType();
18652
18653 // LDS float and double fmin/fmax were always supported.
18654 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18655 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18657 }
18658
18659 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18660 // For flat and global cases:
18661 // float, double in gfx7. Manual claims denormal support.
18662 // Removed in gfx8.
18663 // float, double restored in gfx10.
18664 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18665 //
18666 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18667 // no f32.
18668 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18669 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18670 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18671 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18672 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18673 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18675 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18676 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18677 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18678 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18679 }
18680 }
18681
18683 }
18686 default:
18688 }
18689
18690 llvm_unreachable("covered atomicrmw op switch");
18691}
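// Editorial note (illustrative, not part of the original source): a sketch of
// an IR input for the logic above (names and metadata ids are hypothetical):
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v
//              syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
//   !0 = !{}
// On subtargets with a matching hardware fadd this may stay as a native atomic
// (with a remark emitted); without such metadata the conservative path expands
// it, for example into a cmpxchg loop.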
18692
18699
18706
18709 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18710 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18712
18713 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18715
18716 const DataLayout &DL = CmpX->getDataLayout();
18717
18718 Type *ValTy = CmpX->getNewValOperand()->getType();
18719
18720 // If a 64-bit flat atomic may alias private, we need to avoid using the
18721 // atomic in the private case.
18722 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18724}
18725
18726const TargetRegisterClass *
18727SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18729 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18730 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18731 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18732 : &AMDGPU::SReg_32RegClass;
18733 if (!TRI->isSGPRClass(RC) && !isDivergent)
18734 return TRI->getEquivalentSGPRClass(RC);
18735 if (TRI->isSGPRClass(RC) && isDivergent)
18736 return TRI->getEquivalentVGPRClass(RC);
18737
18738 return RC;
18739}
18740
18741// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18742// uniform values (as produced by the mask results of control flow intrinsics)
18743// used outside of divergent blocks. The phi users need to also be treated as
18744// always uniform.
18745//
18746// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18747static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18748 unsigned WaveSize) {
18749 // FIXME: We assume we never cast the mask results of a control flow
18750 // intrinsic.
18751 // Early exit if the type won't be consistent as a compile time hack.
18752 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18753 if (!IT || IT->getBitWidth() != WaveSize)
18754 return false;
18755
18756 if (!isa<Instruction>(V))
18757 return false;
18758 if (!Visited.insert(V).second)
18759 return false;
18760 bool Result = false;
18761 for (const auto *U : V->users()) {
18763 if (V == U->getOperand(1)) {
18764 switch (Intrinsic->getIntrinsicID()) {
18765 default:
18766 Result = false;
18767 break;
18768 case Intrinsic::amdgcn_if_break:
18769 case Intrinsic::amdgcn_if:
18770 case Intrinsic::amdgcn_else:
18771 Result = true;
18772 break;
18773 }
18774 }
18775 if (V == U->getOperand(0)) {
18776 switch (Intrinsic->getIntrinsicID()) {
18777 default:
18778 Result = false;
18779 break;
18780 case Intrinsic::amdgcn_end_cf:
18781 case Intrinsic::amdgcn_loop:
18782 Result = true;
18783 break;
18784 }
18785 }
18786 } else {
18787 Result = hasCFUser(U, Visited, WaveSize);
18788 }
18789 if (Result)
18790 break;
18791 }
18792 return Result;
18793}
18794
18796 const Value *V) const {
18797 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18798 if (CI->isInlineAsm()) {
18799 // FIXME: This cannot give a correct answer. This should only trigger in
18800 // the case where inline asm returns mixed SGPR and VGPR results, used
18801 // outside the defining block. We don't have a specific result to
18802 // consider, so this assumes if any value is SGPR, the overall register
18803 // also needs to be SGPR.
18804 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18806 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18807 for (auto &TC : TargetConstraints) {
18808 if (TC.Type == InlineAsm::isOutput) {
18810 const TargetRegisterClass *RC =
18811 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18812 TC.ConstraintVT)
18813 .second;
18814 if (RC && SIRI->isSGPRClass(RC))
18815 return true;
18816 }
18817 }
18818 }
18819 }
18821 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18822}
18823
18825 for (SDUse &Use : N->uses()) {
18827 if (getBasePtrIndex(M) == Use.getOperandNo())
18828 return true;
18829 }
18830 }
18831 return false;
18832}
18833
18835 SDValue N1) const {
18836 if (!N0.hasOneUse())
18837 return false;
18838 // Take the opportunity to keep N0 uniform.
18839 if (N0->isDivergent() || !N1->isDivergent())
18840 return true;
18841 // Check if we have a good chance to form the memory access pattern with the
18842 // base and offset
18843 return (DAG.isBaseWithConstantOffset(N0) &&
18845}
18846
18848 Register N0, Register N1) const {
18849 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18850}
18851
18854 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18856 if (I.getMetadata("amdgpu.noclobber"))
18857 Flags |= MONoClobber;
18858 if (I.getMetadata("amdgpu.last.use"))
18859 Flags |= MOLastUse;
18860 return Flags;
18861}
18862
18864 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18865 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18866 if (User->getOpcode() != ISD::CopyToReg)
18867 return false;
18868 if (!Def->isMachineOpcode())
18869 return false;
18871 if (!MDef)
18872 return false;
18873
18874 unsigned ResNo = User->getOperand(Op).getResNo();
18875 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18876 return false;
18877 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18878 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18879 PhysReg = AMDGPU::SCC;
18880 const TargetRegisterClass *RC =
18881 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18882 Cost = RC->getCopyCost();
18883 return true;
18884 }
18885 return false;
18886}
18887
18889 Instruction *AI) const {
18890 // Given: atomicrmw fadd ptr %addr, float %val ordering
18891 //
18892 // With this expansion we produce the following code:
18893 // [...]
18894 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18895 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18896 //
18897 // atomicrmw.shared:
18898 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18899 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18900 // float %val ordering
18901 // br label %atomicrmw.phi
18902 //
18903 // atomicrmw.check.private:
18904 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18905 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18906 //
18907 // atomicrmw.private:
18908 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18909 // %loaded.private = load float, ptr addrspace(5) %cast.private
18910 // %val.new = fadd float %loaded.private, %val
18911 // store float %val.new, ptr addrspace(5) %cast.private
18912 // br label %atomicrmw.phi
18913 //
18914 // atomicrmw.global:
18915 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18916 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18917 // float %val ordering
18918 // br label %atomicrmw.phi
18919 //
18920 // atomicrmw.phi:
18921 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18922 // [ %loaded.private, %atomicrmw.private ],
18923 // [ %loaded.global, %atomicrmw.global ]
18924 // br label %atomicrmw.end
18925 //
18926 // atomicrmw.end:
18927 // [...]
18928 //
18929 //
18930 // For 64-bit atomics which may reside in private memory, we perform a simpler
18931 // version that only inserts the private check, and uses the flat operation.
18932
18933 IRBuilder<> Builder(AI);
18934 LLVMContext &Ctx = Builder.getContext();
18935
18936 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18937 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18939 Value *Addr = AI->getOperand(PtrOpIdx);
18940
18941 /// TODO: Only need to check private, then emit flat-known-not private (no
18942 /// need for shared block, or cast to global).
18944
18945 Align Alignment;
18946 if (RMW)
18947 Alignment = RMW->getAlign();
18948 else if (CX)
18949 Alignment = CX->getAlign();
18950 else
18951 llvm_unreachable("unhandled atomic operation");
18952
18953 // FullFlatEmulation is true if we need to issue the private, shared, and
18954 // global cases.
18955 //
18956 // If this is false, we are only dealing with the flat-targeting-private case,
18957 // where we only insert a check for private and still use the flat instruction
18958 // for global and shared.
18959
18960 bool FullFlatEmulation =
18961 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18962 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18963 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18964 RMW->getType()->isDoubleTy()));
18965
18966 // If the return value isn't used, do not introduce a false use in the phi.
18967 bool ReturnValueIsUsed = !AI->use_empty();
18968
18969 BasicBlock *BB = Builder.GetInsertBlock();
18970 Function *F = BB->getParent();
18971 BasicBlock *ExitBB =
18972 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18973 BasicBlock *SharedBB = nullptr;
18974
18975 BasicBlock *CheckPrivateBB = BB;
18976 if (FullFlatEmulation) {
18977 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18978 CheckPrivateBB =
18979 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18980 }
18981
18982 BasicBlock *PrivateBB =
18983 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18984 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18985 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18986
18987 std::prev(BB->end())->eraseFromParent();
18988 Builder.SetInsertPoint(BB);
18989
18990 Value *LoadedShared = nullptr;
18991 if (FullFlatEmulation) {
18992 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18993 {Addr}, nullptr, "is.shared");
18994 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18995 Builder.SetInsertPoint(SharedBB);
18996 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18998
18999 Instruction *Clone = AI->clone();
19000 Clone->insertInto(SharedBB, SharedBB->end());
19001 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19002 LoadedShared = Clone;
19003
19004 Builder.CreateBr(PhiBB);
19005 Builder.SetInsertPoint(CheckPrivateBB);
19006 }
19007
19008 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19009 {Addr}, nullptr, "is.private");
19010 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19011
19012 Builder.SetInsertPoint(PrivateBB);
19013
19014 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19016
19017 Value *LoadedPrivate;
19018 if (RMW) {
19019 LoadedPrivate = Builder.CreateAlignedLoad(
19020 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19021
19022 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19023 LoadedPrivate, RMW->getValOperand());
19024
19025 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19026 } else {
19027 auto [ResultLoad, Equal] =
19028 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19029 CX->getNewValOperand(), CX->getAlign());
19030
19031 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19032 ResultLoad, 0);
19033 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19034 }
19035
19036 Builder.CreateBr(PhiBB);
19037
19038 Builder.SetInsertPoint(GlobalBB);
19039
19040 // Continue using a flat instruction if we only emitted the check for private.
19041 Instruction *LoadedGlobal = AI;
19042 if (FullFlatEmulation) {
19043 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19045 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19046 }
19047
19048 AI->removeFromParent();
19049 AI->insertInto(GlobalBB, GlobalBB->end());
19050
19051 // The new atomicrmw may go through another round of legalization later.
19052 if (!FullFlatEmulation) {
19053 // We inserted the runtime check already, make sure we do not try to
19054 // re-expand this.
19055 // TODO: Should union with any existing metadata.
19056 MDBuilder MDB(F->getContext());
19057 MDNode *RangeNotPrivate =
19060 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19061 RangeNotPrivate);
19062 }
19063
19064 Builder.CreateBr(PhiBB);
19065
19066 Builder.SetInsertPoint(PhiBB);
19067
19068 if (ReturnValueIsUsed) {
19069 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19070 AI->replaceAllUsesWith(Loaded);
19071 if (FullFlatEmulation)
19072 Loaded->addIncoming(LoadedShared, SharedBB);
19073 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19074 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19075 Loaded->takeName(AI);
19076 }
19077
19078 Builder.CreateBr(ExitBB);
19079}
19080
19082 unsigned PtrOpIdx) {
19083 Value *PtrOp = I->getOperand(PtrOpIdx);
19086
19087 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19088 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19089 I->getIterator());
19090 I->setOperand(PtrOpIdx, ASCast);
19091}
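// Editorial note (illustrative, not part of the original source): as a sketch
// of the rewrite above, an atomic access through a scratch pointer
//   %v = load atomic i32, ptr addrspace(5) %p seq_cst, align 4
// is given an addrspacecast first,
//   %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
// and the atomic then operates on the flat pointer instead.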
19092
19095
19098
19101 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19102 ConstVal && ConstVal->isNullValue()) {
19103 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19105
19106 // We may still need the private-alias-flat handling below.
19107
19108 // TODO: Skip this for cases where we cannot access remote memory.
19109 }
19110 }
19111
19112 // The non-flat expansions should only perform the de-canonicalization of
19113 // identity values.
19115 return;
19116
19118}
19119
19126
19130
19132 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19133}
19134
19136 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19137 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19138
19140 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19141}
19142
19143LoadInst *
19145 IRBuilder<> Builder(AI);
19146 auto Order = AI->getOrdering();
19147
19148 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19149 // cache must be flushed if the atomic ordering has release semantics. This
19150 // does not necessarily require a fence; a release fence just happens to
19151 // perform that flush. Avoid replacing an atomicrmw that has release semantics.
19152 if (isReleaseOrStronger(Order))
19153 return nullptr;
19154
19155 LoadInst *LI = Builder.CreateAlignedLoad(
19156 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19157 LI->setAtomic(Order, AI->getSyncScopeID());
19158 LI->copyMetadata(*AI);
19159 LI->takeName(AI);
19160 AI->replaceAllUsesWith(LI);
19161 AI->eraseFromParent();
19162 return LI;
19163}
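// Editorial note (illustrative, not part of the original source): as an
// example of the transform above, an idempotent update such as
//   %old = atomicrmw or ptr %p, i32 0 monotonic
// can be rewritten into an atomic load of %p with the same ordering, since the
// store side changes nothing; release (or stronger) orderings are rejected
// above and keep the original atomicrmw.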
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1254
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1251
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer operand is offset by the chain and intrinsic ID operands.
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known never to be any NaN; if SNaN is true, returns whether it is known never to be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
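As a hedged illustration of the APFloat factories listed just above, a minimal sketch; the helper name and its use as a reduction identity are assumptions, not code from this file:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Identity value for a hypothetical floating-point min/max reduction:
// min starts from +inf, max starts from -inf.
static APFloat fpMinMaxIdentity(const fltSemantics &Sem, bool IsMax) {
  return APFloat::getInf(Sem, /*Negative=*/IsMax);
}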
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
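A minimal sketch of the APInt helpers above; the function and its parameters are illustrative assumptions, not code from this file:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// If only the top HiBits bits of a BitWidth-wide value can be nonzero, every
// bit below them is known zero; countr_zero() on the mask gives that count.
static unsigned knownLowZeroBits(unsigned BitWidth, unsigned HiBits) {
  APInt Mask = APInt::getHighBitsSet(BitWidth, HiBits);
  return Mask.isZero() ? BitWidth : Mask.countr_zero();
}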
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
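A small, hedged example of querying an AtomicRMWInst with the accessors and BinOp values listed above; the predicate itself is hypothetical:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// True for the floating-point read-modify-write operations (fadd/fsub/fmin/fmax).
static bool isFPAtomicRMW(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    return true;
  default:
    return false;
  }
}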
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:370
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
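A hedged sketch of building ByteProvider values with the two factories listed above; describeByte and its parameters are assumptions for illustration:

#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Describe one byte of a combine pattern: either byte SrcByte of Op, or a
// byte known to be constant zero when no source was identified.
static ByteProvider<SDValue> describeByte(SDValue Op, int64_t SrcByte,
                                          bool KnownSource) {
  if (!KnownSource)
    return ByteProvider<SDValue>::getConstantZero();
  return ByteProvider<SDValue>::getSrc(Op, SrcByte, /*VectorOffset=*/0);
}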
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
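A minimal, hedged sketch around the CCState allocation queries above; claimIfFree is a hypothetical helper, not part of this file:

#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Hand out Reg for an incoming argument only if calling-convention analysis
// has not already assigned it (or an alias); otherwise return an invalid reg.
static MCRegister claimIfFree(CCState &CCInfo, MCRegister Reg) {
  if (CCInfo.isAllocated(Reg))
    return MCRegister();
  return CCInfo.AllocateReg(Reg);
}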
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value corresponding to the given virtual register.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
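A short, hedged example of the MVT queries and factories above; elementIntegerType is an illustrative helper, and the header path assumes the current llvm/CodeGenTypes layout:

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

// Map a vector MVT to an integer MVT of one element's width, e.g. v4i16 -> i16.
static MVT elementIntegerType(MVT VecVT) {
  assert(VecVT.isVector() && "expected a vector type");
  return MVT::getIntegerVT(unsigned(VecVT.getScalarSizeInBits()));
}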
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
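A hedged sketch of the MachineInstrBuilder chain above in use; the opcode parameter and helper name are assumptions, not code from this file:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Insert "DstReg = <MovOpc> Imm" before iterator I, where MovOpc stands in for
// some target-specific move-immediate opcode.
static void emitMoveImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const TargetInstrInfo &TII,
                        unsigned MovOpc, Register DstReg, int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(MovOpc), DstReg).addImm(Imm);
}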
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
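A hedged example of constructing a MachineMemOperand with the flags above, using the MachineFunction::getMachineMemOperand overload listed earlier; the helper, its choice of type and alignment, and the LLT header path are illustrative assumptions:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Describe a 4-byte, 4-aligned load whose result never changes, so later
// passes may freely reorder or CSE it.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));
}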
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
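A small, hedged predicate over the MemSDNode accessors above; the name and parameters are assumptions:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// True if the memory node accesses address space AS with at least MinAlign
// alignment, a common pre-check before custom-lowering a load or store.
static bool isAlignedAccessInAS(const MemSDNode *N, unsigned AS,
                                Align MinAlign) {
  return N->getAddressSpace() == AS && N->getAlign() >= MinAlign;
}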
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
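A hedged illustration of inspecting SDValue operands with the accessors above; the matcher is hypothetical and performs no rewriting:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match (add x, undef) purely by inspection.
static bool isAddWithUndefRHS(SDValue V) {
  return V.getOpcode() == ISD::ADD && V.getNumOperands() == 2 &&
         V.getOperand(1).isUndef();
}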
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can be turned into a fence followed by an atomic load.
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the stride value (bits [61:48] of the resource descriptor) and added to the base address.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known never to be any NaN; if SNaN is true, returns whether it is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp/select instructions.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
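A hedged sketch of a simple chained load, assuming DAG, DL, an incoming Chain, and a pointer SDValue Ptr; the address space in the MachinePointerInfo is illustrative only:
  SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
                           MachinePointerInfo(AMDGPUAS::GLOBAL_ADDRESS));
  SDValue Loaded   = Ld.getValue(0); // the loaded value
  SDValue OutChain = Ld.getValue(1); // the output chain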
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
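For illustration, assuming DAG, DL, and an i32 SDValue Mask:
  SDValue Inverted = DAG.getNOT(DL, Mask, MVT::i32); // emitted as (xor Mask, -1)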
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
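A minimal sketch, assuming DAG, DL, and two i32 SDValues Lo and Hi:
  SDValue Pair = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});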
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
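A hedged sketch, assuming DAG, DL, and a pointer SDValue BasePtr:
  // Address of the second 32-bit half of a 64-bit in-memory value.
  SDValue HiAddr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(4), DL);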
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
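A minimal sketch, assuming DAG, DL, Chain, a value SDValue Val, and a pointer SDValue Ptr; the empty MachinePointerInfo and the 4-byte alignment are placeholders:
  SDValue St = DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo(), Align(4));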
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
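For illustration, assuming DAG, DL, and an f32 SDValue Scalar:
  SDValue Splat = DAG.getSplatBuildVector(MVT::v4f32, DL, Scalar); // <Scalar, Scalar, Scalar, Scalar>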
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
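A common use is alignment reasoning; a hedged sketch assuming DAG and a 64-bit address SDValue Addr:
  // True if the low two bits of Addr are known zero, i.e. Addr is 4-byte aligned.
  bool Aligned4 = DAG.MaskedValueIsZero(Addr, APInt(64, 3));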
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
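A minimal sketch, assuming DAG, DL, and an i64 SDValue Val64:
  auto [LoHalf, HiHalf] = DAG.SplitScalar(Val64, DL, MVT::i32, MVT::i32);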
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
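A generic usage sketch (the strings and mapping are illustrative, not taken from this file), assuming a StringRef Name:
  unsigned AS = StringSwitch<unsigned>(Name)
                    .Case("global", AMDGPUAS::GLOBAL_ADDRESS)
                    .Case("local", AMDGPUAS::LOCAL_ADDRESS)
                    .Default(AMDGPUAS::FLAT_ADDRESS);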
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlign - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to the description of the machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
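Illustrative calls as they would appear inside a target's TargetLowering constructor; the opcode/type pairs are examples, not a description of this target's actual configuration:
  setOperationAction(ISD::SELECT, MVT::i1, Promote);  // legalize by promoting the type
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);   // legalize by expanding the node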
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target and, if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
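An illustrative call from a TargetLowering constructor (the type pair is an example, not this target's actual setting):
  setTruncStoreAction(MVT::i64, MVT::i32, Expand); // no native i64 -> i32 truncating store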
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
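This is typically paired with a Promote action; a hedged sketch for a TargetLowering constructor, with illustrative types:
  setOperationAction(ISD::SELECT, MVT::f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); // perform f32 selects on the i32 bit pattern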
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
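An illustrative call from a TargetLowering constructor; the opcodes are examples only:
  setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::AND});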
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
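A small sketch of the typical use of Log2_32, turning a power-of-two byte size into a shift amount:
  unsigned ShiftAmt = Log2_32(4); // == 2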
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
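A sketch showing Lo_32 and Hi_32 splitting a 64-bit immediate:
  uint64_t Imm = 0x1122334455667788ULL;
  uint32_t Lo  = Lo_32(Imm); // 0x55667788
  uint32_t Hi  = Hi_32(Imm); // 0x11223344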
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
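A sketch of commonAlignment combining a base alignment with a byte offset:
  Align A = commonAlignment(Align(16), 4);  // Align(4): offset 4 reduces a 16-byte aligned base
  Align B = commonAlignment(Align(16), 32); // Align(16): offset 32 preserves the base alignment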
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
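A hedged sketch, assuming an LLVMContext &Ctx and an EVT VT with a known fixed size, building the half-width integer type:
  EVT HalfIntVT = EVT::getIntegerVT(Ctx, VT.getSizeInBits().getFixedValue() / 2);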
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs