1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
62static cl::opt<bool> UseDivergentRegisterIndexing(
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
68 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
69 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
70}
71
72static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
73 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
74 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
75}
76
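// Return the first 32-bit SGPR that has not yet been allocated by the calling
// convention state; used when reserving SGPRs for special argument inputs.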
77static unsigned findFirstFreeSGPR(CCState &CCInfo) {
78 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
79 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
80 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
81 return AMDGPU::SGPR0 + Reg;
82 }
83 }
84 llvm_unreachable("Cannot allocate sgpr");
85}
86
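// Register-class setup and operation legalization actions for the SI/GCN
// target: the constructor below declares which value types live in SGPRs vs.
// VGPRs and which operations are Legal, Custom, Promoted, or Expanded.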
87SITargetLowering::SITargetLowering(const TargetMachine &TM,
88 const GCNSubtarget &STI)
89 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
90 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
91 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
92
93 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
94 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
95
96 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
100
101 addRegisterClass(MVT::f64, V64RegClass);
102 addRegisterClass(MVT::v2f32, V64RegClass);
103 addRegisterClass(MVT::Untyped, V64RegClass);
104
105 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
106 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
107
108 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
110
111 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
112 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
113
114 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
115 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
116
117 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
121 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
122
123 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
124 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
125
126 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
130 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
131
132 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
133 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
134
135 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
136 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
137
138 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
139 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
140
141 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
142 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
143
144 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
148 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
149
150 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
151 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
152
153 if (Subtarget->has16BitInsts()) {
154 if (Subtarget->useRealTrue16Insts()) {
155 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
156 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
157 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
158 } else {
159 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
160 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
161 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
162 }
163
164 // Unless there are also VOP3P operations, no operations are really legal.
165 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
170 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
171 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
173 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
174 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
176 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
177 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
178 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
179 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
180 }
181
182 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
183 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
184
185 computeRegisterProperties(Subtarget->getRegisterInfo());
186
187 // The boolean content concept here is too inflexible. Compares only ever
188 // really produce a 1-bit result. Any copy/extend from these will turn into a
189 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
190 // it's what most targets use.
193
194 // We need to custom lower vector stores from local memory
195 setOperationAction(ISD::LOAD,
196 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
197 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
198 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
199 MVT::i1, MVT::v32i32},
200 Custom);
201
202 setOperationAction(ISD::STORE,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 if (isTypeLegal(MVT::bf16)) {
210 for (unsigned Opc :
212 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
213 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
214 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
215 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
216 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
217 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
218 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
219 ISD::SETCC}) {
220 // FIXME: The promoted-to type shouldn't need to be explicit
221 setOperationAction(Opc, MVT::bf16, Promote);
222 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
223 }
224
226
227 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
228 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
229
230 setOperationAction(ISD::FABS, MVT::bf16, Legal);
231 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
233
234 // We only need to custom lower because we can't specify an action for bf16
235 // sources.
238 }
239
240 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
241 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
242 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
243 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
244 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
245 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
246 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
247 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
248 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
249 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
250 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
251 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
252 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
253 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
254 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
255 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
256
257 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
258 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
259 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
263 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
264
265 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
266
269 setOperationAction(ISD::SELECT, MVT::f64, Promote);
270 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
271
272 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
273
275 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
276
278 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
279 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
280
282 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
283 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
284 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
285 Expand);
287 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
288 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
289 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
290 Expand);
291
293 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
294 MVT::v3i16, MVT::v4i16, MVT::Other},
295 Custom);
296
297 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
298 setOperationAction(ISD::BR_CC,
299 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
300
302
304
306 Expand);
307
308#if 0
310#endif
311
312 // We only support LOAD/STORE and vector manipulation ops for vectors
313 // with > 4 elements.
314 for (MVT VT :
315 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
316 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
317 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
318 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
319 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
320 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
321 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
322 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
323 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
324 switch (Op) {
325 case ISD::LOAD:
326 case ISD::STORE:
328 case ISD::BITCAST:
329 case ISD::UNDEF:
333 case ISD::IS_FPCLASS:
334 break;
339 break;
340 default:
342 break;
343 }
344 }
345 }
346
347 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
348
349 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
350 // is expanded to avoid having two separate loops in case the index is a VGPR.
351
352 // Most operations are naturally 32-bit vector operations. We only support
353 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
354 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
355 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
356 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
357
358 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
359 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
360
361 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
362 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
363
364 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
365 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
366 }
367
368 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
369 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
370 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
371
372 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
373 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
374
375 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
376 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
377
378 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
379 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
380 }
381
382 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
383 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
384 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
385
386 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
387 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
388
389 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
390 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
391
392 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
393 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
394 }
395
396 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
397 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
398 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
399
400 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
401 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
402
403 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
404 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
405
406 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
407 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
408 }
409
410 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
411 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
412 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
413
414 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
415 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
416
417 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
418 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
419
420 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
421 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
422 }
423
425 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
426 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
427 Custom);
428
429 if (Subtarget->hasPkMovB32()) {
430 // TODO: 16-bit element vectors should be legal with even aligned elements.
431 // TODO: Can be legal with wider source types than the result with
432 // subregister extracts.
433 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
434 }
435
437 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
438 // instead lower to cndmask in SITargetLowering::LowerSELECT().
440 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
441 // alignbit.
442 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
443
444 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
445 Custom);
446
447 // Avoid stack access for these.
448 // TODO: Generalize to more vector types.
450 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
451 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
452 Custom);
453
454 // Deal with vec3 vector operations when widened to vec4.
456 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
457
458 // Deal with vec5/6/7 vector operations when widened to vec8.
460 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Custom);
465
466 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
467 // and output demarshalling
468 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
469
470 // We can't return success/failure, only the old value,
471 // let LLVM add the comparison
472 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
473 Expand);
474
475 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
476
477 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
478
479 // FIXME: This should be narrowed to i32, but that only happens if i64 is
480 // illegal.
481 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
482 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
483
484 // On SI this is s_memtime; on VI it is s_memrealtime.
485 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
486
487 if (Subtarget->hasSMemRealTime() ||
488 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
489 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
490 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
491
492 if (Subtarget->has16BitInsts()) {
493 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
494 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
495 } else {
496 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
497 }
498
499 if (Subtarget->hasMadMacF32Insts())
500 setOperationAction(ISD::FMAD, MVT::f32, Legal);
501
502 if (!Subtarget->hasBFI())
503 // fcopysign can be done in a single instruction with BFI.
504 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
505
506 if (!Subtarget->hasBCNT(32))
507 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
508
509 if (!Subtarget->hasBCNT(64))
510 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
511
512 if (Subtarget->hasFFBH())
513 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
514
515 if (Subtarget->hasFFBL())
516 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
517
518 // We only really have 32-bit BFE instructions (and 16-bit on VI).
519 //
520 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
521 // effort to match them now. We want this to be false for i64 cases when the
522 // extraction isn't restricted to the upper or lower half. Ideally we would
523 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
524 // span the midpoint are probably relatively rare, so don't worry about them
525 // for now.
526 if (Subtarget->hasBFE())
527 setHasExtractBitsInsn(true);
528
529 // Clamp modifier on add/sub
530 if (Subtarget->hasIntClamp())
531 setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
532
533 if (Subtarget->hasAddNoCarry())
534 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
535 Legal);
536
538 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
539 {MVT::f32, MVT::f64}, Custom);
540
541 // These are really only legal for ieee_mode functions. We should be avoiding
542 // them for functions that don't have ieee_mode enabled, so just say they are
543 // legal.
544 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
545 {MVT::f32, MVT::f64}, Legal);
546
547 if (Subtarget->haveRoundOpsF64())
548 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
549 Legal);
550 else
551 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
552 MVT::f64, Custom);
553
554 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
555 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
556 Legal);
557 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
558
559 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
561
562 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
564
565 // Custom lower these because we can't specify a rule based on an illegal
566 // source bf16.
567 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
568 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
569
570 if (Subtarget->has16BitInsts()) {
573 MVT::i16, Legal);
574
575 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
576
578 MVT::i16, Expand);
579
583 ISD::CTPOP},
584 MVT::i16, Promote);
585
586 setOperationAction(ISD::LOAD, MVT::i16, Custom);
587
588 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
589
590 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
591 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
592 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
593 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
594
598
600
601 // F16 - Constant Actions.
604
605 // F16 - Load/Store Actions.
606 setOperationAction(ISD::LOAD, MVT::f16, Promote);
607 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
608 setOperationAction(ISD::STORE, MVT::f16, Promote);
609 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
610
611 // BF16 - Load/Store Actions.
612 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
613 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
614 setOperationAction(ISD::STORE, MVT::bf16, Promote);
615 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
616
617 // F16 - VOP1 Actions.
619 ISD::FSIN, ISD::FROUND},
620 MVT::f16, Custom);
621
622 // BF16 - VOP1 Actions.
623 if (Subtarget->hasBF16TransInsts())
624 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
625
628
629 // F16 - VOP2 Actions.
630 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
631 Expand);
632 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
633 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
635
636 // F16 - VOP3 Actions.
638 if (STI.hasMadF16())
639 setOperationAction(ISD::FMAD, MVT::f16, Legal);
640
641 for (MVT VT :
642 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
643 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
644 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
645 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
646 switch (Op) {
647 case ISD::LOAD:
648 case ISD::STORE:
650 case ISD::BITCAST:
651 case ISD::UNDEF:
656 case ISD::IS_FPCLASS:
657 break;
661 break;
662 default:
664 break;
665 }
666 }
667 }
668
669 // v_perm_b32 can handle either of these.
670 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
672
673 // XXX - Do these do anything? Vector constants turn into build_vector.
674 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
675
676 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
677 Legal);
678
679 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
680 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
681 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
682 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
683
684 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
685 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
686 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
687 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
688
689 setOperationAction(ISD::AND, MVT::v2i16, Promote);
690 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
691 setOperationAction(ISD::OR, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
693 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
694 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
695
696 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
697 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
698 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
699 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
700 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
701 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
702
703 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
704 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
705 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
706 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
707 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
708 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
709
710 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
712 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
714 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
716
717 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
720 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721
722 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
723 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
724 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
725 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
726 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
727 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
728
729 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
731 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
733 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
734 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
735
736 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
737 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
738 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
739 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
740 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
741 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
742
743 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
744 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
745 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
747 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
748 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
749
750 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
751 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
752 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
753 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
754 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
755 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
756
758 MVT::v2i32, Expand);
759 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
760
762 MVT::v4i32, Expand);
763
765 MVT::v8i32, Expand);
766
767 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
768 Subtarget->hasVOP3PInsts() ? Legal : Custom);
769
770 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
771 // This isn't really legal, but this avoids the legalizer unrolling it (and
772 // allows matching fneg (fabs x) patterns)
773 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
774
775 // Can do this in one BFI plus a constant materialize.
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
780 Custom);
781
783 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
784 MVT::f16, Custom);
785 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
786
787 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
788 ISD::FMAXIMUMNUM},
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
790 Custom);
791
792 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
794 Expand);
795
796 for (MVT Vec16 :
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
801 Vec16, Custom);
803 }
804 }
805
806 if (Subtarget->hasVOP3PInsts()) {
810 MVT::v2i16, Legal);
811
812 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
813 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
814 MVT::v2f16, Legal);
815
817 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
818
820 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
821 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
822 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
823 Custom);
824
825 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
826 // Split vector operations.
831 VT, Custom);
832
833 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
834 // Split vector operations.
836 VT, Custom);
837
839 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
840 {MVT::v2f16, MVT::v4f16}, Custom);
841
842 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
843 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
844 Custom);
845
846 if (Subtarget->hasBF16PackedInsts()) {
847 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
848 // Split vector operations.
850 VT, Custom);
851 }
852
853 if (Subtarget->hasPackedFP32Ops()) {
855 MVT::v2f32, Legal);
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
858 Custom);
859 }
860 }
861
862 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
863
864 if (Subtarget->has16BitInsts()) {
865 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
866 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
867 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
868 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
869 } else {
870 // Legalization hack.
871 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
872
873 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
874 }
875
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
881 Custom);
882
884
885 if (Subtarget->hasVectorMulU64())
886 setOperationAction(ISD::MUL, MVT::i64, Legal);
887 else if (Subtarget->hasScalarSMulU64())
888 setOperationAction(ISD::MUL, MVT::i64, Custom);
889
890 if (Subtarget->hasMad64_32())
891 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
892
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
894 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
895
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
899 } else {
900 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
901 if (Subtarget->hasMinimum3Maximum3F32())
902 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
903
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
905 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
906
907 // If only the vector form is available, we need to widen to a vector.
908 if (!Subtarget->hasMinimum3Maximum3F16())
909 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
910 }
911 }
912
913 if (Subtarget->hasVOP3PInsts()) {
914 // We want to break these into v2f16 pieces, not scalarize.
915 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
917 Custom);
918 }
919
920 if (Subtarget->hasIntMinMax64())
921 setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, MVT::i64,
922 Legal);
923
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
927 MVT::i8},
928 Custom);
929
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
935 Custom);
936
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
942 Custom);
943
944 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
946 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
947 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
948 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
949
950 // TODO: Could move this to custom lowering, could benefit from combines on
951 // extract of relevant bits.
952 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
953
955
956 if (Subtarget->hasBF16ConversionInsts()) {
957 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
959 }
960
961 if (Subtarget->hasBF16PackedInsts()) {
963 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
964 MVT::v2bf16, Legal);
965 }
966
967 if (Subtarget->hasBF16TransInsts()) {
968 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
969 }
970
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
974 Custom);
975 }
976
978 ISD::PTRADD,
980 ISD::SUB,
982 ISD::MUL,
983 ISD::FADD,
984 ISD::FSUB,
985 ISD::FDIV,
986 ISD::FMUL,
987 ISD::FMINNUM,
988 ISD::FMAXNUM,
989 ISD::FMINNUM_IEEE,
990 ISD::FMAXNUM_IEEE,
991 ISD::FMINIMUM,
992 ISD::FMAXIMUM,
993 ISD::FMINIMUMNUM,
994 ISD::FMAXIMUMNUM,
995 ISD::FMA,
996 ISD::SMIN,
997 ISD::SMAX,
998 ISD::UMIN,
999 ISD::UMAX,
1000 ISD::SETCC,
1002 ISD::SMIN,
1003 ISD::SMAX,
1004 ISD::UMIN,
1005 ISD::UMAX,
1006 ISD::AND,
1007 ISD::OR,
1008 ISD::XOR,
1009 ISD::SHL,
1010 ISD::SRL,
1011 ISD::SRA,
1012 ISD::FSHR,
1022
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1025
1026 // All memory operations. Some folding on the pointer operand is done to help
1027 // matching the constant offsets in the addressing modes.
1028 setTargetDAGCombine({ISD::LOAD,
1029 ISD::STORE,
1030 ISD::ATOMIC_LOAD,
1031 ISD::ATOMIC_STORE,
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1034 ISD::ATOMIC_SWAP,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1052
1053 // FIXME: In other contexts we pretend this is a per-function property.
1055
1057}
1058
1059const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1060
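// The wave-wide MODE register holds the floating-point rounding and
// denormal-mode controls.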
1061ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1063 return RCRegs;
1064}
1065
1066//===----------------------------------------------------------------------===//
1067// TargetLowering queries
1068//===----------------------------------------------------------------------===//
1069
1070// v_mad_mix* support a conversion from f16 to f32.
1071//
1072// There is only one special case, when denormals are enabled, which we don't
1073// currently handle, where this is OK to use.
1074bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1075 EVT DestVT, EVT SrcVT) const {
1076 return DestVT.getScalarType() == MVT::f32 &&
1077 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1079 SrcVT.getScalarType() == MVT::f16) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1081 SrcVT.getScalarType() == MVT::bf16)) &&
1082 // TODO: This probably only requires no input flushing?
1083 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1084}
1085
1087 LLT DestTy, LLT SrcTy) const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1090 DestTy.getScalarSizeInBits() == 32 &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1092 // TODO: This probably only requires no input flushing?
1093 denormalModeIsFlushAllF32(*MI.getMF());
1094}
1095
1096bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1097 // SI has some legal vector types, but no legal vector operations. Say no
1098 // shuffles are legal in order to prefer scalarizing some vector operations.
1099 return false;
1100}
1101
1102MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1103 CallingConv::ID CC,
1104 EVT VT) const {
1105 if (CC == CallingConv::AMDGPU_KERNEL)
1106 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1107
1108 if (VT.isVector()) {
1109 EVT ScalarVT = VT.getScalarType();
1110 unsigned Size = ScalarVT.getSizeInBits();
1111 if (Size == 16) {
1112 if (Subtarget->has16BitInsts()) {
1113 if (VT.isInteger())
1114 return MVT::v2i16;
1115 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1116 }
1117 return VT.isInteger() ? MVT::i32 : MVT::f32;
1118 }
1119
1120 if (Size < 16)
1121 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1122 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1123 }
1124
1125 if (VT.getSizeInBits() > 32)
1126 return MVT::i32;
1127
1128 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1129}
1130
1131unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1132 CallingConv::ID CC,
1133 EVT VT) const {
1134 if (CC == CallingConv::AMDGPU_KERNEL)
1135 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1136
1137 if (VT.isVector()) {
1138 unsigned NumElts = VT.getVectorNumElements();
1139 EVT ScalarVT = VT.getScalarType();
1140 unsigned Size = ScalarVT.getSizeInBits();
1141
1142 // FIXME: Should probably promote 8-bit vectors to i16.
1143 if (Size == 16 && Subtarget->has16BitInsts())
1144 return (NumElts + 1) / 2;
1145
1146 if (Size <= 32)
1147 return NumElts;
1148
1149 if (Size > 32)
1150 return NumElts * ((Size + 31) / 32);
1151 } else if (VT.getSizeInBits() > 32)
1152 return (VT.getSizeInBits() + 31) / 32;
1153
1154 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155}
1156
1157unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1158 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1159 unsigned &NumIntermediates, MVT &RegisterVT) const {
1160 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1161 unsigned NumElts = VT.getVectorNumElements();
1162 EVT ScalarVT = VT.getScalarType();
1163 unsigned Size = ScalarVT.getSizeInBits();
1164 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1165 // support, but unless we can properly handle 3-vectors, it will still be
1166 // inconsistent.
1167 if (Size == 16 && Subtarget->has16BitInsts()) {
1168 if (ScalarVT == MVT::bf16) {
1169 RegisterVT = MVT::i32;
1170 IntermediateVT = MVT::v2bf16;
1171 } else {
1172 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1173 IntermediateVT = RegisterVT;
1174 }
1175 NumIntermediates = (NumElts + 1) / 2;
1176 return NumIntermediates;
1177 }
1178
1179 if (Size == 32) {
1180 RegisterVT = ScalarVT.getSimpleVT();
1181 IntermediateVT = RegisterVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size < 16 && Subtarget->has16BitInsts()) {
1187 // FIXME: Should probably form v2i16 pieces
1188 RegisterVT = MVT::i16;
1189 IntermediateVT = ScalarVT;
1190 NumIntermediates = NumElts;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size != 16 && Size <= 32) {
1195 RegisterVT = MVT::i32;
1196 IntermediateVT = ScalarVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1199 }
1200
1201 if (Size > 32) {
1202 RegisterVT = MVT::i32;
1203 IntermediateVT = RegisterVT;
1204 NumIntermediates = NumElts * ((Size + 31) / 32);
1205 return NumIntermediates;
1206 }
1207 }
1208
1209 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1210 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1211}
1212
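// Compute the in-memory type for a buffer/image intrinsic's data, clamping a
// vector result to the number of lanes that are actually accessed.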
1213static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1214 const DataLayout &DL, Type *Ty,
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1217
1218 LLVMContext &Ctx = Ty->getContext();
1219 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1221 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1222 NumElts);
1223 }
1224
1225 return TLI.getValueType(DL, Ty);
1226}
1227
1228// Peek through TFE struct returns to only use the data size.
1229static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1230 const DataLayout &DL, Type *Ty,
1231 unsigned MaxNumLanes) {
1232 auto *ST = dyn_cast<StructType>(Ty);
1233 if (!ST)
1234 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1235
1236 // TFE intrinsics return an aggregate type.
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1239 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1240}
1241
1242/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1243/// in-memory representation. This return value is a custom type because there
1244/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1245/// could cause issues during codegen, these address space 7 pointers will be
1246/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1247/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1248/// for cost modeling, to work. (This also sets us up decently for doing the
1249/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1250MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1251 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1252 return MVT::amdgpuBufferFatPointer;
1253 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1256 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1257}
1258/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1259/// v8i32 when padding is added.
1260/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1261/// also v8i32 with padding.
1262MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1263 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1264 DL.getPointerSizeInBits(AS) == 160) ||
1265 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1266 DL.getPointerSizeInBits(AS) == 192))
1267 return MVT::v8i32;
1268 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1269}
1270
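// Memory access width in bits for the async load/store-to-LDS and cooperative
// atomic intrinsics handled below.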
1271static unsigned getIntrMemWidth(unsigned IntrID) {
1272 switch (IntrID) {
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1276 return 8;
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1282 return 32;
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1288 return 64;
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1294 return 128;
1295 default:
1296 llvm_unreachable("Unknown width");
1297 }
1298}
1299
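// Translate the ordering and syncscope operands of a cooperative atomic
// load/store intrinsic into memory-operand flags and a sync scope ID.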
1300static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1302 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1303 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1304 switch (AtomicOrderingCABI(Ord)) {
1307 break;
1310 break;
1313 break;
1314 default:
1316 break;
1317 }
1318
1319 Info.flags =
1321 Info.flags |= MOCooperative;
1322
1323 MDNode *ScopeMD = cast<MDNode>(
1324 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1325 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1326 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1327}
1328
1330 const CallInst &CI,
1331 MachineFunction &MF,
1332 unsigned IntrID) const {
1333 Info.flags = MachineMemOperand::MONone;
1334 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1335 Info.flags |= MachineMemOperand::MOInvariant;
1336 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1338 Info.flags |= getTargetMMOFlags(CI);
1339
1340 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1342 AttributeSet Attr =
1344 MemoryEffects ME = Attr.getMemoryEffects();
1345 if (ME.doesNotAccessMemory())
1346 return false;
1347
1348 // TODO: Should images get their own address space?
1349 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1350
1351 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1352 if (RsrcIntr->IsImage) {
1353 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1355 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1356 Info.align.reset();
1357 }
1358
1359 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1360 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1361 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1362 // We conservatively set the memory operand of a buffer intrinsic to the
1363 // base resource pointer, so that we can access alias information about
1364 // those pointers. Cases like "this points at the same value
1365 // but with a different offset" are handled in
1366 // areMemAccessesTriviallyDisjoint.
1367 Info.ptrVal = RsrcArg;
1368 }
1369
1370 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1371 if (!IsSPrefetch) {
1372 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1373 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1374 Info.flags |= MachineMemOperand::MOVolatile;
1375 }
1376
1378 if (ME.onlyReadsMemory()) {
1379 if (RsrcIntr->IsImage) {
1380 unsigned MaxNumLanes = 4;
1381
1382 if (!BaseOpcode->Gather4) {
1383 // If this isn't a gather, we may have excess loaded elements in the
1384 // IR type. Check the dmask for the real number of elements loaded.
1385 unsigned DMask =
1386 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1387 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1388 }
1389
1390 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1391 CI.getType(), MaxNumLanes);
1392 } else {
1393 Info.memVT =
1394 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1395 std::numeric_limits<unsigned>::max());
1396 }
1397
1398 // FIXME: What does alignment mean for an image?
1399 Info.opc = ISD::INTRINSIC_W_CHAIN;
1400 Info.flags |= MachineMemOperand::MOLoad;
1401 } else if (ME.onlyWritesMemory()) {
1402 Info.opc = ISD::INTRINSIC_VOID;
1403
1404 Type *DataTy = CI.getArgOperand(0)->getType();
1405 if (RsrcIntr->IsImage) {
1406 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1407 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1408 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1409 DMaskLanes);
1410 } else
1411 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1412
1413 Info.flags |= MachineMemOperand::MOStore;
1414 } else {
1415 // Atomic, no-return sampler, or prefetch
1416 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1418 Info.flags |=
1420
1421 if (!IsSPrefetch)
1422 Info.flags |= MachineMemOperand::MOStore;
1423
1424 switch (IntrID) {
1425 default:
1426 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1427 // Fake memory access type for no return sampler intrinsics
1428 Info.memVT = MVT::i32;
1429 } else {
1430 // XXX - Should this be volatile without known ordering?
1431 Info.flags |= MachineMemOperand::MOVolatile;
1432 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1433 }
1434 break;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1439 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1440 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1441 Info.ptrVal = CI.getArgOperand(1);
1442 return true;
1443 }
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1448 Info.memVT =
1449 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1450 std::numeric_limits<unsigned>::max());
1451 Info.flags &= ~MachineMemOperand::MOStore;
1452 return true;
1453 }
1454 }
1455 }
1456 return true;
1457 }
1458
1459 switch (IntrID) {
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1462 Info.opc = ISD::INTRINSIC_W_CHAIN;
1463 Info.memVT = MVT::getVT(CI.getType());
1464 Info.ptrVal = CI.getOperand(0);
1465 Info.align.reset();
1467
1468 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1469 if (!Vol->isZero())
1470 Info.flags |= MachineMemOperand::MOVolatile;
1471
1472 return true;
1473 }
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1476 Info.opc = ISD::INTRINSIC_W_CHAIN;
1477 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1478 Info.ptrVal = nullptr;
1479 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1481 return true;
1482 }
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1485 Info.opc = ISD::INTRINSIC_W_CHAIN;
1486 Info.memVT = MVT::getVT(CI.getType());
1487 Info.ptrVal = CI.getOperand(0);
1488 Info.align.reset();
1490
1491 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1492 if (!Vol->isZero())
1493 Info.flags |= MachineMemOperand::MOVolatile;
1494
1495 return true;
1496 }
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1500 ? ISD::INTRINSIC_W_CHAIN
1501 : ISD::INTRINSIC_VOID;
1502 Info.memVT = MVT::getVT(CI.getType());
1503 Info.ptrVal = CI.getOperand(0);
1504 Info.memVT = MVT::i64;
1505 Info.size = 8;
1506 Info.align.reset();
1508 return true;
1509 }
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1511 Info.opc = ISD::INTRINSIC_W_CHAIN;
1512 Info.memVT = MVT::getVT(CI.getType());
1513 Info.ptrVal = CI.getOperand(0);
1514 Info.align.reset();
1517 return true;
1518 }
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1522 Info.opc = ISD::INTRINSIC_W_CHAIN;
1523 Info.memVT =
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1525 ? CI.getType()
1527 ->getElementType(0)); // XXX: what is correct VT?
1528
1529 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1530 Info.align.reset();
1531 Info.flags |=
1533 return true;
1534 }
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1541 Info.opc = ISD::INTRINSIC_W_CHAIN;
1542 Info.memVT = MVT::getVT(CI.getType());
1543 Info.ptrVal = CI.getOperand(0);
1544 Info.align.reset();
1548 return true;
1549 }
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1571 Info.opc = ISD::INTRINSIC_W_CHAIN;
1572 Info.memVT = MVT::getVT(CI.getType());
1573 Info.ptrVal = CI.getOperand(0);
1574 Info.align.reset();
1575 Info.flags |= MachineMemOperand::MOLoad;
1576 return true;
1577 }
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1581 Info.opc = ISD::INTRINSIC_W_CHAIN;
1582 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1583 Info.ptrVal = CI.getOperand(0);
1584 Info.align.reset();
1585 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1586 return true;
1587 }
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1591 Info.opc = ISD::INTRINSIC_VOID;
1592 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1593 Info.ptrVal = CI.getArgOperand(0);
1594 Info.align.reset();
1595 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1596 return true;
1597 }
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1604 Info.opc = ISD::INTRINSIC_VOID;
1605
1606 const GCNTargetMachine &TM =
1607 static_cast<const GCNTargetMachine &>(getTargetMachine());
1608
1610 Info.ptrVal = MFI->getGWSPSV(TM);
1611
1612 // This is an abstract access, but we need to specify a type and size.
1613 Info.memVT = MVT::i32;
1614 Info.size = 4;
1615 Info.align = Align(4);
1616
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1618 Info.flags |= MachineMemOperand::MOLoad;
1619 else
1620 Info.flags |= MachineMemOperand::MOStore;
1621 return true;
1622 }
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1631 Info.opc = ISD::INTRINSIC_VOID;
1632 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1633 Info.ptrVal = CI.getArgOperand(1);
1635 return true;
1636 }
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1643 Info.ptrVal = CI.getArgOperand(0);
1645 return true;
1646 }
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1649 Info.opc = ISD::INTRINSIC_VOID;
1650 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1651 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1652 Info.ptrVal = CI.getArgOperand(1);
1654 return true;
1655 }
1656 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1657 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1660 Info.opc = ISD::INTRINSIC_W_CHAIN;
1661
1662 const GCNTargetMachine &TM =
1663 static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
1666 Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668 // This is an abstract access, but we need to specify a type and size.
1669 Info.memVT = MVT::i32;
1670 Info.size = 4;
1671 Info.align = Align(4);
1672
1674 return true;
1675 }
1676 case Intrinsic::amdgcn_s_prefetch_data:
1677 case Intrinsic::amdgcn_flat_prefetch:
1678 case Intrinsic::amdgcn_global_prefetch: {
1679 Info.opc = ISD::INTRINSIC_VOID;
1680 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1681 Info.ptrVal = CI.getArgOperand(0);
1682 Info.flags |= MachineMemOperand::MOLoad;
1683 return true;
1684 }
1685 default:
1686 return false;
1687 }
1688}
1689
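// Collect extra operands an intrinsic needs when it is expanded to a DAG node;
// amdgcn_addrspacecast_nonnull passes its source and destination address
// spaces as additional constant operands.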
1690void SITargetLowering::CollectTargetIntrinsicOperands(
1691 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1692 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1693 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1694 // The DAG's ValueType loses the addrspaces.
1695 // Add them as 2 extra Constant operands "from" and "to".
1696 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1697 unsigned DstAS = I.getType()->getPointerAddressSpace();
1698 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1699 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1700 break;
1701 }
1702 default:
1703 break;
1704 }
1705}
1706
1707bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1708 SmallVectorImpl<Value *> &Ops,
1709 Type *&AccessTy) const {
1710 Value *Ptr = nullptr;
1711 switch (II->getIntrinsicID()) {
1712 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1713 case Intrinsic::amdgcn_cluster_load_b128:
1714 case Intrinsic::amdgcn_cluster_load_b64:
1715 case Intrinsic::amdgcn_cluster_load_b32:
1716 case Intrinsic::amdgcn_ds_append:
1717 case Intrinsic::amdgcn_ds_consume:
1718 case Intrinsic::amdgcn_ds_load_tr8_b64:
1719 case Intrinsic::amdgcn_ds_load_tr16_b128:
1720 case Intrinsic::amdgcn_ds_load_tr4_b64:
1721 case Intrinsic::amdgcn_ds_load_tr6_b96:
1722 case Intrinsic::amdgcn_ds_read_tr4_b64:
1723 case Intrinsic::amdgcn_ds_read_tr6_b96:
1724 case Intrinsic::amdgcn_ds_read_tr8_b64:
1725 case Intrinsic::amdgcn_ds_read_tr16_b64:
1726 case Intrinsic::amdgcn_ds_ordered_add:
1727 case Intrinsic::amdgcn_ds_ordered_swap:
1728 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1729 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1730 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1732 case Intrinsic::amdgcn_flat_load_monitor_b128:
1733 case Intrinsic::amdgcn_flat_load_monitor_b32:
1734 case Intrinsic::amdgcn_flat_load_monitor_b64:
1735 case Intrinsic::amdgcn_global_atomic_csub:
1736 case Intrinsic::amdgcn_global_atomic_fmax_num:
1737 case Intrinsic::amdgcn_global_atomic_fmin_num:
1738 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1739 case Intrinsic::amdgcn_global_load_monitor_b128:
1740 case Intrinsic::amdgcn_global_load_monitor_b32:
1741 case Intrinsic::amdgcn_global_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_load_tr_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b128:
1744 case Intrinsic::amdgcn_global_load_tr4_b64:
1745 case Intrinsic::amdgcn_global_load_tr6_b96:
1746 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1750 Ptr = II->getArgOperand(0);
1751 break;
1752 case Intrinsic::amdgcn_load_to_lds:
1753 case Intrinsic::amdgcn_global_load_lds:
1754 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1758 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1762 Ptr = II->getArgOperand(1);
1763 break;
1764 default:
1765 return false;
1766 }
1767 AccessTy = II->getType();
1768 Ops.push_back(Ptr);
1769 return true;
1770}
1771
1773 unsigned AddrSpace) const {
1774 if (!Subtarget->hasFlatInstOffsets()) {
1775 // Flat instructions do not have offsets, and only have the register
1776 // address.
1777 return AM.BaseOffs == 0 && AM.Scale == 0;
1778 }
1779
1780 decltype(SIInstrFlags::FLAT) FlatVariant =
1784
1785 return AM.Scale == 0 &&
1786 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1787 AM.BaseOffs, AddrSpace, FlatVariant));
1788}
1789
1791 if (Subtarget->hasFlatGlobalInsts())
1793
1794 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1795    // Assume we will use FLAT for all global memory accesses
1796 // on VI.
1797 // FIXME: This assumption is currently wrong. On VI we still use
1798 // MUBUF instructions for the r + i addressing mode. As currently
1799 // implemented, the MUBUF instructions only work on buffer < 4GB.
1800 // It may be possible to support > 4GB buffers with MUBUF instructions,
1801 // by setting the stride value in the resource descriptor which would
1802 // increase the size limit to (stride * 4GB). However, this is risky,
1803 // because it has never been validated.
1805 }
1806
1807 return isLegalMUBUFAddressingMode(AM);
1808}
1809
1810bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1811 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1812 // additionally can do r + r + i with addr64. 32-bit has more addressing
1813 // mode options. Depending on the resource constant, it can also do
1814 // (i64 r0) + (i32 r1) * (i14 i).
1815 //
1816 // Private arrays end up using a scratch buffer most of the time, so also
1817 // assume those use MUBUF instructions. Scratch loads / stores are currently
1818 // implemented as mubuf instructions with offen bit set, so slightly
1819 // different than the normal addr64.
1820 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1821 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1822 return false;
1823
1824 // FIXME: Since we can split immediate into soffset and immediate offset,
1825 // would it make sense to allow any immediate?
1826
1827 switch (AM.Scale) {
1828 case 0: // r + i or just i, depending on HasBaseReg.
1829 return true;
1830 case 1:
1831 return true; // We have r + r or r + i.
1832 case 2:
1833 if (AM.HasBaseReg) {
1834 // Reject 2 * r + r.
1835 return false;
1836 }
1837
1838 // Allow 2 * r as r + r
1839 // Or 2 * r + i is allowed as r + r + i.
1840 return true;
1841 default: // Don't allow n * r
1842 return false;
1843 }
1844}
1845
1847 const AddrMode &AM, Type *Ty,
1848 unsigned AS,
1849 Instruction *I) const {
1850 // No global is ever allowed as a base.
1851 if (AM.BaseGV)
1852 return false;
1853
1854 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1855 return isLegalGlobalAddressingMode(AM);
1856
1857 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1861 // If the offset isn't a multiple of 4, it probably isn't going to be
1862 // correctly aligned.
1863 // FIXME: Can we get the real alignment here?
1864 if (AM.BaseOffs % 4 != 0)
1865 return isLegalMUBUFAddressingMode(AM);
1866
1867 if (!Subtarget->hasScalarSubwordLoads()) {
1868 // There are no SMRD extloads, so if we have to do a small type access we
1869 // will use a MUBUF load.
1870 // FIXME?: We also need to do this if unaligned, but we don't know the
1871 // alignment here.
1872 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1873 return isLegalGlobalAddressingMode(AM);
1874 }
1875
1876 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1877 // SMRD instructions have an 8-bit, dword offset on SI.
1878 if (!isUInt<8>(AM.BaseOffs / 4))
1879 return false;
1880 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1881 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1882 // in 8-bits, it can use a smaller encoding.
1883 if (!isUInt<32>(AM.BaseOffs / 4))
1884 return false;
1885 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1886 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1887 if (!isUInt<20>(AM.BaseOffs))
1888 return false;
1889 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1890 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1891 // for S_BUFFER_* instructions).
1892 if (!isInt<21>(AM.BaseOffs))
1893 return false;
1894 } else {
1895 // On GFX12, all offsets are signed 24-bit in bytes.
1896 if (!isInt<24>(AM.BaseOffs))
1897 return false;
1898 }
1899
1900 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1902 AM.BaseOffs < 0) {
1903 // Scalar (non-buffer) loads can only use a negative offset if
1904 // soffset+offset is non-negative. Since the compiler can only prove that
1905 // in a few special cases, it is safer to claim that negative offsets are
1906 // not supported.
1907 return false;
1908 }
1909
1910 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1911 return true;
1912
1913 if (AM.Scale == 1 && AM.HasBaseReg)
1914 return true;
1915
1916 return false;
1917 }
1918
1919 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1920 return Subtarget->enableFlatScratch()
1922 : isLegalMUBUFAddressingMode(AM);
1923
1924 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1925 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1926 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1927 // field.
1928 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1929 // an 8-bit dword offset but we don't know the alignment here.
1930 if (!isUInt<16>(AM.BaseOffs))
1931 return false;
1932
1933 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1934 return true;
1935
1936 if (AM.Scale == 1 && AM.HasBaseReg)
1937 return true;
1938
1939 return false;
1940 }
1941
1943 // For an unknown address space, this usually means that this is for some
1944 // reason being used for pure arithmetic, and not based on some addressing
1945 // computation. We don't have instructions that compute pointers with any
1946 // addressing modes, so treat them as having no offset like flat
1947 // instructions.
1949 }
1950
1951 // Assume a user alias of global for unknown address spaces.
1952 return isLegalGlobalAddressingMode(AM);
1953}
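// Illustrative sketch (not part of the upstream file): how a pass might query
// the addressing-mode rules above through the generic TargetLowering hook.
// TLI, DL and PtrTy are assumed to be available in the surrounding code; the
// 256-byte offset is a hypothetical value.
TargetLowering::AddrMode AM;
AM.BaseGV = nullptr;  // globals are never legal bases (see above)
AM.HasBaseReg = true; // r + i form
AM.BaseOffs = 256;    // must fit the per-generation offset limits above
AM.Scale = 0;         // scaled indexing is rejected for most address spaces
bool Legal =
    TLI.isLegalAddressingMode(DL, AM, PtrTy, AMDGPUAS::GLOBAL_ADDRESS);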
1954
1956 const MachineFunction &MF) const {
1958 return (MemVT.getSizeInBits() <= 4 * 32);
1959 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1960 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1961 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1962 }
1964 return (MemVT.getSizeInBits() <= 2 * 32);
1965 return true;
1966}
1967
1969 unsigned Size, unsigned AddrSpace, Align Alignment,
1970 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1971 if (IsFast)
1972 *IsFast = 0;
1973
1974 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1975 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1976 // Check if alignment requirements for ds_read/write instructions are
1977 // disabled.
1978 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1979 return false;
1980
1981 Align RequiredAlignment(
1982 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1983 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1984 Alignment < RequiredAlignment)
1985 return false;
1986
1987    // Either the alignment requirements are "enabled", or there is an
1988    // unaligned LDS access related hardware bug even though alignment requirements
1989 // are "disabled". In either case, we need to check for proper alignment
1990 // requirements.
1991 //
1992 switch (Size) {
1993 case 64:
1994 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1995 // address is negative, then the instruction is incorrectly treated as
1996      // out-of-bounds even if base + offsets are in bounds. Split vectorized
1997 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1998 // load later in the SILoadStoreOptimizer.
1999 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2000 return false;
2001
2002      // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2003 // can do a 4 byte aligned, 8 byte access in a single operation using
2004 // ds_read2/write2_b32 with adjacent offsets.
2005 RequiredAlignment = Align(4);
2006
2007 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2008 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2009 // ds_write2_b32 depending on the alignment. In either case with either
2010 // alignment there is no faster way of doing this.
2011
2012        // The numbers returned here and below are not additive; they form a 'speed
2013 // rank'. They are just meant to be compared to decide if a certain way
2014 // of lowering an operation is faster than another. For that purpose
2015        // a naturally aligned operation gets its bitsize to indicate that "it
2016 // operates with a speed comparable to N-bit wide load". With the full
2017 // alignment ds128 is slower than ds96 for example. If underaligned it
2018        // is comparable to the speed of a single dword access, which would then
2019 // mean 32 < 128 and it is faster to issue a wide load regardless.
2020        // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
2021        // wider load which will no longer be aligned, the latter is slower.
2022 if (IsFast)
2023 *IsFast = (Alignment >= RequiredAlignment) ? 64
2024 : (Alignment < Align(4)) ? 32
2025 : 1;
2026 return true;
2027 }
2028
2029 break;
2030 case 96:
2031 if (!Subtarget->hasDS96AndDS128())
2032 return false;
2033
2034      // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
2035 // gfx8 and older.
2036
2037 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2038 // Naturally aligned access is fastest. However, also report it is Fast
2039 // if memory is aligned less than DWORD. A narrow load or store will be
2040        // just as slow as a single ds_read_b96/ds_write_b96, but there will
2041 // be more of them, so overall we will pay less penalty issuing a single
2042 // instruction.
2043
2044 // See comment on the values above.
2045 if (IsFast)
2046 *IsFast = (Alignment >= RequiredAlignment) ? 96
2047 : (Alignment < Align(4)) ? 32
2048 : 1;
2049 return true;
2050 }
2051
2052 break;
2053 case 128:
2054 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2055 return false;
2056
2057      // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
2058      // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
2059 // single operation using ds_read2/write2_b64.
2060 RequiredAlignment = Align(8);
2061
2062 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2063 // Naturally aligned access is fastest. However, also report it is Fast
2064 // if memory is aligned less than DWORD. A narrow load or store will be
2065        // just as slow as a single ds_read_b128/ds_write_b128, but there
2066 // will be more of them, so overall we will pay less penalty issuing a
2067 // single instruction.
2068
2069 // See comment on the values above.
2070 if (IsFast)
2071 *IsFast = (Alignment >= RequiredAlignment) ? 128
2072 : (Alignment < Align(4)) ? 32
2073 : 1;
2074 return true;
2075 }
2076
2077 break;
2078 default:
2079 if (Size > 32)
2080 return false;
2081
2082 break;
2083 }
2084
2085 // See comment on the values above.
2086 // Note that we have a single-dword or sub-dword here, so if underaligned
2087 // it is a slowest possible access, hence returned value is 0.
2088 if (IsFast)
2089 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2090
2091 return Alignment >= RequiredAlignment ||
2092 Subtarget->hasUnalignedDSAccessEnabled();
2093 }
2094
2095 // FIXME: We have to be conservative here and assume that flat operations
2096 // will access scratch. If we had access to the IR function, then we
2097 // could determine if any private memory was used in the function.
2098 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2099 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2100 bool AlignedBy4 = Alignment >= Align(4);
2101 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2102 if (IsFast)
2103 *IsFast = AlignedBy4 ? Size : 1;
2104 return true;
2105 }
2106
2107 if (IsFast)
2108 *IsFast = AlignedBy4;
2109
2110 return AlignedBy4;
2111 }
2112
2113 // So long as they are correct, wide global memory operations perform better
2114 // than multiple smaller memory ops -- even when misaligned
2115 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2116 if (IsFast)
2117 *IsFast = Size;
2118
2119 return Alignment >= Align(4) ||
2120 Subtarget->hasUnalignedBufferAccessEnabled();
2121 }
2122
2123 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2124 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2125 // out-of-bounds behavior, but in the edge case where an access starts
2126  // out-of-bounds and then enters in-bounds, the entire access would be treated
2127 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2128 // natural alignment of buffer accesses.
2129 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2130 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2131 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2132 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2133 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2134 return false;
2135 }
2136
2137 // Smaller than dword value must be aligned.
2138 if (Size < 32)
2139 return false;
2140
2141 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2142 // byte-address are ignored, thus forcing Dword alignment.
2143 // This applies to private, global, and constant memory.
2144 if (IsFast)
2145 *IsFast = 1;
2146
2147 return Size >= 32 && Alignment >= Align(4);
2148}
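// Illustrative sketch (not part of the upstream file) of how the "speed rank"
// values documented above are meant to be used: they are only meaningful
// relative to each other. TLI is assumed to be a SITargetLowering reference
// available in the surrounding code.
unsigned FastWide = 0, FastNarrow = 0;
bool WideOK = TLI.allowsMisalignedMemoryAccessesImpl(
    128, AMDGPUAS::LOCAL_ADDRESS, Align(4), MachineMemOperand::MONone,
    &FastWide);
bool NarrowOK = TLI.allowsMisalignedMemoryAccessesImpl(
    64, AMDGPUAS::LOCAL_ADDRESS, Align(4), MachineMemOperand::MONone,
    &FastNarrow);
// Prefer the single wide access when both are allowed and its rank is at
// least as high as the narrower alternative.
bool UseWide = WideOK && NarrowOK && FastWide >= FastNarrow;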
2149
2151 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2152 unsigned *IsFast) const {
2154 Alignment, Flags, IsFast);
2155}
2156
2158 LLVMContext &Context, const MemOp &Op,
2159 const AttributeList &FuncAttributes) const {
2160 // FIXME: Should account for address space here.
2161
2162 // The default fallback uses the private pointer size as a guess for a type to
2163 // use. Make sure we switch these to 64-bit accesses.
2164
2165 if (Op.size() >= 16 &&
2166 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2167 return MVT::v4i32;
2168
2169 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2170 return MVT::v2i32;
2171
2172 // Use the default.
2173 return MVT::Other;
2174}
2175
2177 const MemSDNode *MemNode = cast<MemSDNode>(N);
2178 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2179}
2180
2185
2187 unsigned DestAS) const {
2188 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2189 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2190 Subtarget->hasGloballyAddressableScratch()) {
2191 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2192 return false;
2193 }
2194
2195 // Flat -> private/local is a simple truncate.
2196 // Flat -> global is no-op
2197 return true;
2198 }
2199
2200 const GCNTargetMachine &TM =
2201 static_cast<const GCNTargetMachine &>(getTargetMachine());
2202 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2203}
2204
2212
2214 Type *Ty) const {
2215 // FIXME: Could be smarter if called for vector constants.
2216 return true;
2217}
2218
2220 unsigned Index) const {
2222 return false;
2223
2224 // TODO: Add more cases that are cheap.
2225 return Index == 0;
2226}
2227
2228bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2229 // TODO: This should be more aggressive, particular for 16-bit element
2230 // vectors. However there are some mixed improvements and regressions.
2231 EVT EltTy = VT.getVectorElementType();
2232 return EltTy.getSizeInBits() % 32 == 0;
2233}
2234
2236 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2237 switch (Op) {
2238 case ISD::LOAD:
2239 case ISD::STORE:
2240 return true;
2241 default:
2242 return false;
2243 }
2244 }
2245
2246 // SimplifySetCC uses this function to determine whether or not it should
2247 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2248 if (VT == MVT::i1 && Op == ISD::SETCC)
2249 return false;
2250
2252}
2253
2254SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2255 const SDLoc &SL,
2256 SDValue Chain,
2257 uint64_t Offset) const {
2258 const DataLayout &DL = DAG.getDataLayout();
2262
2263 auto [InputPtrReg, RC, ArgTy] =
2264 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2265
2266 // We may not have the kernarg segment argument if we have no kernel
2267 // arguments.
2268 if (!InputPtrReg)
2269 return DAG.getConstant(Offset, SL, PtrVT);
2270
2272 SDValue BasePtr = DAG.getCopyFromReg(
2273 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2274
2275 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2276}
2277
2278SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2279 const SDLoc &SL) const {
2282 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2283}
2284
2285SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2286 const SDLoc &SL) const {
2287
2289 std::optional<uint32_t> KnownSize =
2291 if (KnownSize.has_value())
2292 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2293 return SDValue();
2294}
2295
2296SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2297 const SDLoc &SL, SDValue Val,
2298 bool Signed,
2299 const ISD::InputArg *Arg) const {
2300 // First, if it is a widened vector, narrow it.
2301 if (VT.isVector() &&
2303 EVT NarrowedVT =
2306 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2307 DAG.getConstant(0, SL, MVT::i32));
2308 }
2309
2310 // Then convert the vector elements or scalar value.
2311 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2312 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2313 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2314 }
2315
2316 if (MemVT.isFloatingPoint())
2317 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2318 else if (Signed)
2319 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2320 else
2321 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2322
2323 return Val;
2324}
2325
2326SDValue SITargetLowering::lowerKernargMemParameter(
2327 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2328 uint64_t Offset, Align Alignment, bool Signed,
2329 const ISD::InputArg *Arg) const {
2330 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2331
2332 // Try to avoid using an extload by loading earlier than the argument address,
2333 // and extracting the relevant bits. The load should hopefully be merged with
2334 // the previous argument.
2335 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2336 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2337 int64_t AlignDownOffset = alignDown(Offset, 4);
2338 int64_t OffsetDiff = Offset - AlignDownOffset;
2339
2340 EVT IntVT = MemVT.changeTypeToInteger();
2341
2342 // TODO: If we passed in the base kernel offset we could have a better
2343 // alignment than 4, but we don't really need it.
2344 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2345 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2348
2349 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2350 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2351
2352 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2353 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2354 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2355
2356 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2357 }
2358
2359 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2360 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2363
2364 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2365 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2366}
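// Worked example of the align-down trick above (hypothetical values, not
// taken from the upstream file): a 2-byte kernarg at byte offset 6.
uint64_t ArgByteOffset = 6;
uint64_t AlignedDown = alignDown(ArgByteOffset, 4); // 4: load the dword at 4..7
uint64_t Diff = ArgByteOffset - AlignedDown;        // 2: arg starts 2 bytes in
unsigned ShiftBits = Diff * 8;                      // 16: SRL amount on the i32
// The aligned i32 load covers bytes 4..7; shifting right by 16 and truncating
// to i16 recovers bytes 6..7 without needing an extending load.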
2367
2368/// Coerce an argument which was passed in a different ABI type to the original
2369/// expected value type.
2370SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2371 SDValue Val,
2372 CCValAssign &VA,
2373 const SDLoc &SL) const {
2374 EVT ValVT = VA.getValVT();
2375
2376 // If this is an 8 or 16-bit value, it is really passed promoted
2377 // to 32 bits. Insert an assert[sz]ext to capture this, then
2378 // truncate to the right size.
2379 switch (VA.getLocInfo()) {
2380 case CCValAssign::Full:
2381 return Val;
2382 case CCValAssign::BCvt:
2383 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2384 case CCValAssign::SExt:
2385 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2386 DAG.getValueType(ValVT));
2387 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2388 case CCValAssign::ZExt:
2389 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2390 DAG.getValueType(ValVT));
2391 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2392 case CCValAssign::AExt:
2393 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2394 default:
2395 llvm_unreachable("Unknown loc info!");
2396 }
2397}
2398
2399SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2400 CCValAssign &VA, const SDLoc &SL,
2401 SDValue Chain,
2402 const ISD::InputArg &Arg) const {
2403 MachineFunction &MF = DAG.getMachineFunction();
2404 MachineFrameInfo &MFI = MF.getFrameInfo();
2405
2406 if (Arg.Flags.isByVal()) {
2407 unsigned Size = Arg.Flags.getByValSize();
2408 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2409 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2410 }
2411
2412 unsigned ArgOffset = VA.getLocMemOffset();
2413 unsigned ArgSize = VA.getValVT().getStoreSize();
2414
2415 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2416
2417 // Create load nodes to retrieve arguments from the stack.
2418 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2419
2420  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2422 MVT MemVT = VA.getValVT();
2423
2424 switch (VA.getLocInfo()) {
2425 default:
2426 break;
2427 case CCValAssign::BCvt:
2428 MemVT = VA.getLocVT();
2429 break;
2430 case CCValAssign::SExt:
2431 ExtType = ISD::SEXTLOAD;
2432 break;
2433 case CCValAssign::ZExt:
2434 ExtType = ISD::ZEXTLOAD;
2435 break;
2436 case CCValAssign::AExt:
2437 ExtType = ISD::EXTLOAD;
2438 break;
2439 }
2440
2441 SDValue ArgValue = DAG.getExtLoad(
2442 ExtType, SL, VA.getLocVT(), Chain, FIN,
2444
2445 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2446 if (ConvertedVal == ArgValue)
2447 return ConvertedVal;
2448
2449 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2450}
2451
2452SDValue SITargetLowering::lowerWorkGroupId(
2453 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2456 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2457 if (!Subtarget->hasClusters())
2458 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2459
2460 // Clusters are supported. Return the global position in the grid. If clusters
2461  // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2462
2463 // WorkGroupIdXYZ = ClusterId == 0 ?
2464 // ClusterIdXYZ :
2465 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2466 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2467 SDLoc SL(ClusterIdXYZ);
2468 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2469 SDValue One = DAG.getConstant(1, SL, VT);
2470 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2471 SDValue ClusterWorkGroupIdXYZ =
2472 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2473 SDValue GlobalIdXYZ =
2474 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2475 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2476
2477 switch (MFI.getClusterDims().getKind()) {
2480 return GlobalIdXYZ;
2482 return ClusterIdXYZ;
2484 using namespace AMDGPU::Hwreg;
2485 SDValue ClusterIdField =
2486 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2487 SDNode *GetReg =
2488 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2489 SDValue ClusterId(GetReg, 0);
2490 SDValue Zero = DAG.getConstant(0, SL, VT);
2491 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2492 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2493 }
2494 }
2495
2496 llvm_unreachable("nothing should reach here");
2497}
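// Worked instance of the formula above (hypothetical dimensions, not from the
// upstream file): a cluster that is 4 workgroups wide in X (ClusterMaxIdX = 3).
unsigned ClusterIdX = 2;          // third cluster along X
unsigned ClusterMaxIdX = 3;       // workgroups 0..3 within the cluster
unsigned ClusterWorkGroupIdX = 1; // second workgroup inside that cluster
unsigned WorkGroupIdX =
    ClusterIdX * (ClusterMaxIdX + 1) + ClusterWorkGroupIdX; // 2 * 4 + 1 == 9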
2498
2499SDValue SITargetLowering::getPreloadedValue(
2500 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2502 const ArgDescriptor *Reg = nullptr;
2503 const TargetRegisterClass *RC;
2504 LLT Ty;
2505
2507 const ArgDescriptor WorkGroupIDX =
2508 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2509 // If GridZ is not programmed in an entry function then the hardware will set
2510 // it to all zeros, so there is no need to mask the GridY value in the low
2511 // order bits.
2512 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2513 AMDGPU::TTMP7,
2514 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2515 const ArgDescriptor WorkGroupIDZ =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2517 const ArgDescriptor ClusterWorkGroupIDX =
2518 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2519 const ArgDescriptor ClusterWorkGroupIDY =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2521 const ArgDescriptor ClusterWorkGroupIDZ =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2523 const ArgDescriptor ClusterWorkGroupMaxIDX =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2525 const ArgDescriptor ClusterWorkGroupMaxIDY =
2526 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2527 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2528 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2529 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2530 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2531
2532 auto LoadConstant = [&](unsigned N) {
2533 return DAG.getConstant(N, SDLoc(), VT);
2534 };
2535
2536 if (Subtarget->hasArchitectedSGPRs() &&
2538 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2539 bool HasFixedDims = ClusterDims.isFixedDims();
2540
2541 switch (PVID) {
2543 Reg = &WorkGroupIDX;
2544 RC = &AMDGPU::SReg_32RegClass;
2545 Ty = LLT::scalar(32);
2546 break;
2548 Reg = &WorkGroupIDY;
2549 RC = &AMDGPU::SReg_32RegClass;
2550 Ty = LLT::scalar(32);
2551 break;
2553 Reg = &WorkGroupIDZ;
2554 RC = &AMDGPU::SReg_32RegClass;
2555 Ty = LLT::scalar(32);
2556 break;
2558 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2559 return LoadConstant(0);
2560 Reg = &ClusterWorkGroupIDX;
2561 RC = &AMDGPU::SReg_32RegClass;
2562 Ty = LLT::scalar(32);
2563 break;
2565 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2566 return LoadConstant(0);
2567 Reg = &ClusterWorkGroupIDY;
2568 RC = &AMDGPU::SReg_32RegClass;
2569 Ty = LLT::scalar(32);
2570 break;
2572 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2573 return LoadConstant(0);
2574 Reg = &ClusterWorkGroupIDZ;
2575 RC = &AMDGPU::SReg_32RegClass;
2576 Ty = LLT::scalar(32);
2577 break;
2579 if (HasFixedDims)
2580 return LoadConstant(ClusterDims.getDims()[0] - 1);
2581 Reg = &ClusterWorkGroupMaxIDX;
2582 RC = &AMDGPU::SReg_32RegClass;
2583 Ty = LLT::scalar(32);
2584 break;
2586 if (HasFixedDims)
2587 return LoadConstant(ClusterDims.getDims()[1] - 1);
2588 Reg = &ClusterWorkGroupMaxIDY;
2589 RC = &AMDGPU::SReg_32RegClass;
2590 Ty = LLT::scalar(32);
2591 break;
2593 if (HasFixedDims)
2594 return LoadConstant(ClusterDims.getDims()[2] - 1);
2595 Reg = &ClusterWorkGroupMaxIDZ;
2596 RC = &AMDGPU::SReg_32RegClass;
2597 Ty = LLT::scalar(32);
2598 break;
2600 Reg = &ClusterWorkGroupMaxFlatID;
2601 RC = &AMDGPU::SReg_32RegClass;
2602 Ty = LLT::scalar(32);
2603 break;
2604 default:
2605 break;
2606 }
2607 }
2608
2609 if (!Reg)
2610 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2611 if (!Reg) {
2613 // It's possible for a kernarg intrinsic call to appear in a kernel with
2614 // no allocated segment, in which case we do not add the user sgpr
2615 // argument, so just return null.
2616 return DAG.getConstant(0, SDLoc(), VT);
2617 }
2618
2619 // It's undefined behavior if a function marked with the amdgpu-no-*
2620 // attributes uses the corresponding intrinsic.
2621 return DAG.getPOISON(VT);
2622 }
2623
2624 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2625}
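// Illustrative decode of the TTMP6 packing implied by the masks above
// (hypothetical register value, not from the upstream file): each cluster
// field is a 4-bit slice.
uint32_t TTMP6Value = 0x04312021;
uint32_t ClusterWGIdX = TTMP6Value & 0xF;               // 0x1
uint32_t ClusterWGIdY = (TTMP6Value >> 4) & 0xF;        // 0x2
uint32_t ClusterWGIdZ = (TTMP6Value >> 8) & 0xF;        // 0x0
uint32_t ClusterWGMaxIdX = (TTMP6Value >> 12) & 0xF;    // 0x2
uint32_t ClusterWGMaxFlatId = (TTMP6Value >> 24) & 0xF; // 0x4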
2626
2628 CallingConv::ID CallConv,
2629 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2630 FunctionType *FType,
2632 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2633 const ISD::InputArg *Arg = &Ins[I];
2634
2635 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2636 "vector type argument should have been split");
2637
2638 // First check if it's a PS input addr.
2639 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2640 PSInputNum <= 15) {
2641 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2642
2643 // Inconveniently only the first part of the split is marked as isSplit,
2644 // so skip to the end. We only want to increment PSInputNum once for the
2645 // entire split argument.
2646 if (Arg->Flags.isSplit()) {
2647 while (!Arg->Flags.isSplitEnd()) {
2648 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2649 "unexpected vector split in ps argument type");
2650 if (!SkipArg)
2651 Splits.push_back(*Arg);
2652 Arg = &Ins[++I];
2653 }
2654 }
2655
2656 if (SkipArg) {
2657 // We can safely skip PS inputs.
2658 Skipped.set(Arg->getOrigArgIndex());
2659 ++PSInputNum;
2660 continue;
2661 }
2662
2663 Info->markPSInputAllocated(PSInputNum);
2664 if (Arg->Used)
2665 Info->markPSInputEnabled(PSInputNum);
2666
2667 ++PSInputNum;
2668 }
2669
2670 Splits.push_back(*Arg);
2671 }
2672}
2673
2674// Allocate special inputs passed in VGPRs.
2676 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2677 SIMachineFunctionInfo &Info) const {
2678 const LLT S32 = LLT::scalar(32);
2680
2681 if (Info.hasWorkItemIDX()) {
2682 Register Reg = AMDGPU::VGPR0;
2683 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2684
2685 CCInfo.AllocateReg(Reg);
2686 unsigned Mask =
2687 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2688 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2689 }
2690
2691 if (Info.hasWorkItemIDY()) {
2692 assert(Info.hasWorkItemIDX());
2693 if (Subtarget->hasPackedTID()) {
2694 Info.setWorkItemIDY(
2695 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2696 } else {
2697 unsigned Reg = AMDGPU::VGPR1;
2698 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2699
2700 CCInfo.AllocateReg(Reg);
2701 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2702 }
2703 }
2704
2705 if (Info.hasWorkItemIDZ()) {
2706 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2707 if (Subtarget->hasPackedTID()) {
2708 Info.setWorkItemIDZ(
2709 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2710 } else {
2711 unsigned Reg = AMDGPU::VGPR2;
2712 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2713
2714 CCInfo.AllocateReg(Reg);
2715 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2716 }
2717 }
2718}
2719
2720// Try to allocate a VGPR at the end of the argument list, or if no argument
2721// VGPRs are left, allocate a stack slot instead.
2722// If \p Mask is given it indicates the bitfield position in the register.
2723// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2724static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2725 ArgDescriptor Arg = ArgDescriptor()) {
2726 if (Arg.isSet())
2727 return ArgDescriptor::createArg(Arg, Mask);
2728
2729 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2730 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2731 if (RegIdx == ArgVGPRs.size()) {
2732 // Spill to stack required.
2733 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2734
2735 return ArgDescriptor::createStack(Offset, Mask);
2736 }
2737
2738 unsigned Reg = ArgVGPRs[RegIdx];
2739 Reg = CCInfo.AllocateReg(Reg);
2740 assert(Reg != AMDGPU::NoRegister);
2741
2742 MachineFunction &MF = CCInfo.getMachineFunction();
2743 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2744 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2745 return ArgDescriptor::createRegister(Reg, Mask);
2746}
2747
2749 const TargetRegisterClass *RC,
2750 unsigned NumArgRegs) {
2751 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2752 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2753 if (RegIdx == ArgSGPRs.size())
2754 report_fatal_error("ran out of SGPRs for arguments");
2755
2756 unsigned Reg = ArgSGPRs[RegIdx];
2757 Reg = CCInfo.AllocateReg(Reg);
2758 assert(Reg != AMDGPU::NoRegister);
2759
2760 MachineFunction &MF = CCInfo.getMachineFunction();
2761 MF.addLiveIn(Reg, RC);
2763}
2764
2765// If this has a fixed position, we still should allocate the register in the
2766// CCInfo state. Technically we could get away with this for values passed
2767// outside of the normal argument range.
2769 const TargetRegisterClass *RC,
2770 MCRegister Reg) {
2771 Reg = CCInfo.AllocateReg(Reg);
2772 assert(Reg != AMDGPU::NoRegister);
2773 MachineFunction &MF = CCInfo.getMachineFunction();
2774 MF.addLiveIn(Reg, RC);
2775}
2776
2777static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2778 if (Arg) {
2779 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2780 Arg.getRegister());
2781 } else
2782 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2783}
2784
2785static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2786 if (Arg) {
2787 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2788 Arg.getRegister());
2789 } else
2790 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2791}
2792
2793/// Allocate implicit function VGPR arguments at the end of allocated user
2794/// arguments.
2796 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2797 SIMachineFunctionInfo &Info) const {
2798 const unsigned Mask = 0x3ff;
2799 ArgDescriptor Arg;
2800
2801 if (Info.hasWorkItemIDX()) {
2802 Arg = allocateVGPR32Input(CCInfo, Mask);
2803 Info.setWorkItemIDX(Arg);
2804 }
2805
2806 if (Info.hasWorkItemIDY()) {
2807 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2808 Info.setWorkItemIDY(Arg);
2809 }
2810
2811 if (Info.hasWorkItemIDZ())
2812 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2813}
2814
2815/// Allocate implicit function VGPR arguments in fixed registers.
2817 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2818 SIMachineFunctionInfo &Info) const {
2819 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2820 if (!Reg)
2821 report_fatal_error("failed to allocate VGPR for implicit arguments");
2822
2823 const unsigned Mask = 0x3ff;
2824 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2825 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2826 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2827}
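// Illustrative unpacking of the packed workitem IDs set up above (hypothetical
// lane value, not from the upstream file). Each ID occupies 10 bits of the
// single VGPR: X in [9:0], Y in [19:10], Z in [29:20].
uint32_t PackedTID = 0x00A05003; // example per-lane register value
uint32_t WorkItemIdX = PackedTID & 0x3ff;         // 0x003
uint32_t WorkItemIdY = (PackedTID >> 10) & 0x3ff; // 0x014
uint32_t WorkItemIdZ = (PackedTID >> 20) & 0x3ff; // 0x00A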
2828
2830 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2831 SIMachineFunctionInfo &Info) const {
2832 auto &ArgInfo = Info.getArgInfo();
2833 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2834
2835 // TODO: Unify handling with private memory pointers.
2836 if (UserSGPRInfo.hasDispatchPtr())
2837 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2838
2839 if (UserSGPRInfo.hasQueuePtr())
2840 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2841
2842 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2843 // constant offset from the kernarg segment.
2844 if (Info.hasImplicitArgPtr())
2845 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2846
2847 if (UserSGPRInfo.hasDispatchID())
2848 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2849
2850 // flat_scratch_init is not applicable for non-kernel functions.
2851
2852 if (Info.hasWorkGroupIDX())
2853 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2854
2855 if (Info.hasWorkGroupIDY())
2856 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2857
2858 if (Info.hasWorkGroupIDZ())
2859 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2860
2861 if (Info.hasLDSKernelId())
2862 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2863}
2864
2865// Allocate special inputs passed in user SGPRs.
2867 MachineFunction &MF,
2868 const SIRegisterInfo &TRI,
2869 SIMachineFunctionInfo &Info) const {
2870 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2871 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2872 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2873 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2874 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2875 }
2876
2877 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2878 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2879 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2880 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2881 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2882 }
2883
2884 if (UserSGPRInfo.hasDispatchPtr()) {
2885 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2886 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2887 CCInfo.AllocateReg(DispatchPtrReg);
2888 }
2889
2890 if (UserSGPRInfo.hasQueuePtr()) {
2891 Register QueuePtrReg = Info.addQueuePtr(TRI);
2892 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2893 CCInfo.AllocateReg(QueuePtrReg);
2894 }
2895
2896 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2898 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2899 CCInfo.AllocateReg(InputPtrReg);
2900
2901 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2902 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2903 }
2904
2905 if (UserSGPRInfo.hasDispatchID()) {
2906 Register DispatchIDReg = Info.addDispatchID(TRI);
2907 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2908 CCInfo.AllocateReg(DispatchIDReg);
2909 }
2910
2911 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2912 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2913 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2914 CCInfo.AllocateReg(FlatScratchInitReg);
2915 }
2916
2917 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2918 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2919 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2920 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2921 }
2922
2923 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2924 // these from the dispatch pointer.
2925}
2926
2927// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2928// sequential starting from the first argument.
2930 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2932 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2933 Function &F = MF.getFunction();
2934 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2935 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2936 bool InPreloadSequence = true;
2937 unsigned InIdx = 0;
2938 bool AlignedForImplictArgs = false;
2939 unsigned ImplicitArgOffset = 0;
2940 for (auto &Arg : F.args()) {
2941 if (!InPreloadSequence || !Arg.hasInRegAttr())
2942 break;
2943
2944 unsigned ArgIdx = Arg.getArgNo();
2945 // Don't preload non-original args or parts not in the current preload
2946 // sequence.
2947 if (InIdx < Ins.size() &&
2948 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2949 break;
2950
2951 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2952 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2953 InIdx++) {
2954 assert(ArgLocs[ArgIdx].isMemLoc());
2955 auto &ArgLoc = ArgLocs[InIdx];
2956 const Align KernelArgBaseAlign = Align(16);
2957 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2958 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2959 unsigned NumAllocSGPRs =
2960 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2961
2962 // Fix alignment for hidden arguments.
2963 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2964 if (!AlignedForImplictArgs) {
2965 ImplicitArgOffset =
2966 alignTo(LastExplicitArgOffset,
2967 Subtarget->getAlignmentForImplicitArgPtr()) -
2968 LastExplicitArgOffset;
2969 AlignedForImplictArgs = true;
2970 }
2971 ArgOffset += ImplicitArgOffset;
2972 }
2973
2974 // Arg is preloaded into the previous SGPR.
2975 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2976 assert(InIdx >= 1 && "No previous SGPR");
2977 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2978 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2979 continue;
2980 }
2981
2982 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2983 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2984 // Check for free user SGPRs for preloading.
2985 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2986 InPreloadSequence = false;
2987 break;
2988 }
2989
2990 // Preload this argument.
2991 const TargetRegisterClass *RC =
2992 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2993 SmallVectorImpl<MCRegister> *PreloadRegs =
2994 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2995
2996 if (PreloadRegs->size() > 1)
2997 RC = &AMDGPU::SGPR_32RegClass;
2998 for (auto &Reg : *PreloadRegs) {
2999 assert(Reg);
3000 MF.addLiveIn(Reg, RC);
3001 CCInfo.AllocateReg(Reg);
3002 }
3003
3004 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3005 }
3006 }
3007}
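// Worked example of the padding computation above (hypothetical offsets, not
// from the upstream file): the previous argument ended at byte 8 and the next
// inreg argument is placed at byte 16 because of its alignment.
unsigned PrevArgEnd = 8;                              // LastExplicitArgOffset
unsigned NextArgOffset = 16;                          // ArgOffset after alignment
unsigned PaddingBytes = NextArgOffset - PrevArgEnd;   // 8 bytes skipped
unsigned SkippedSGPRs = alignTo(PaddingBytes, 4) / 4; // 2 user SGPRs
// Those SGPRs count against getNumFreeUserSGPRs() even though they hold no
// argument data, which is why large padding can end the preload sequence.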
3008
3010 const SIRegisterInfo &TRI,
3011 SIMachineFunctionInfo &Info) const {
3012 // Always allocate this last since it is a synthetic preload.
3013 if (Info.hasLDSKernelId()) {
3014 Register Reg = Info.addLDSKernelId();
3015 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3016 CCInfo.AllocateReg(Reg);
3017 }
3018}
3019
3020// Allocate special input registers that are initialized per-wave.
3023 CallingConv::ID CallConv,
3024 bool IsShader) const {
3025 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3026 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3027 // Note: user SGPRs are handled by the front-end for graphics shaders
3028 // Pad up the used user SGPRs with dead inputs.
3029
3030 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3031 // before enabling architected SGPRs for workgroup IDs.
3032 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3033
3034 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3035 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3036 // rely on it to reach 16 since if we end up having no stack usage, it will
3037 // not really be added.
3038 unsigned NumRequiredSystemSGPRs =
3039 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3040 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3041 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3042 Register Reg = Info.addReservedUserSGPR();
3043 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3044 CCInfo.AllocateReg(Reg);
3045 }
3046 }
3047
3048 if (!HasArchitectedSGPRs) {
3049 if (Info.hasWorkGroupIDX()) {
3050 Register Reg = Info.addWorkGroupIDX();
3051 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(Reg);
3053 }
3054
3055 if (Info.hasWorkGroupIDY()) {
3056 Register Reg = Info.addWorkGroupIDY();
3057 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3058 CCInfo.AllocateReg(Reg);
3059 }
3060
3061 if (Info.hasWorkGroupIDZ()) {
3062 Register Reg = Info.addWorkGroupIDZ();
3063 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3064 CCInfo.AllocateReg(Reg);
3065 }
3066 }
3067
3068 if (Info.hasWorkGroupInfo()) {
3069 Register Reg = Info.addWorkGroupInfo();
3070 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3071 CCInfo.AllocateReg(Reg);
3072 }
3073
3074 if (Info.hasPrivateSegmentWaveByteOffset()) {
3075 // Scratch wave offset passed in system SGPR.
3076 unsigned PrivateSegmentWaveByteOffsetReg;
3077
3078 if (IsShader) {
3079 PrivateSegmentWaveByteOffsetReg =
3080 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3081
3082 // This is true if the scratch wave byte offset doesn't have a fixed
3083 // location.
3084 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3085 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3086 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3087 }
3088 } else
3089 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3090
3091 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3092 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3093 }
3094
3095 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3096 Info.getNumPreloadedSGPRs() >= 16);
3097}
3098
3100 MachineFunction &MF,
3101 const SIRegisterInfo &TRI,
3103 // Now that we've figured out where the scratch register inputs are, see if
3104  // we should reserve the arguments and use them directly.
3105 MachineFrameInfo &MFI = MF.getFrameInfo();
3106 bool HasStackObjects = MFI.hasStackObjects();
3107 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3108
3109 // Record that we know we have non-spill stack objects so we don't need to
3110 // check all stack objects later.
3111 if (HasStackObjects)
3112 Info.setHasNonSpillStackObjects(true);
3113
3114 // Everything live out of a block is spilled with fast regalloc, so it's
3115 // almost certain that spilling will be required.
3116 if (TM.getOptLevel() == CodeGenOptLevel::None)
3117 HasStackObjects = true;
3118
3119 // For now assume stack access is needed in any callee functions, so we need
3120 // the scratch registers to pass in.
3121 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3122
3123 if (!ST.enableFlatScratch()) {
3124 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3125 // If we have stack objects, we unquestionably need the private buffer
3126 // resource. For the Code Object V2 ABI, this will be the first 4 user
3127 // SGPR inputs. We can reserve those and use them directly.
3128
3129 Register PrivateSegmentBufferReg =
3131 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3132 } else {
3133 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3134 // We tentatively reserve the last registers (skipping the last registers
3135 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3136 // we'll replace these with the ones immediately after those which were
3137 // really allocated. In the prologue copies will be inserted from the
3138 // argument to these reserved registers.
3139
3140 // Without HSA, relocations are used for the scratch pointer and the
3141 // buffer resource setup is always inserted in the prologue. Scratch wave
3142 // offset is still in an input SGPR.
3143 Info.setScratchRSrcReg(ReservedBufferReg);
3144 }
3145 }
3146
3148
3149 // For entry functions we have to set up the stack pointer if we use it,
3150 // whereas non-entry functions get this "for free". This means there is no
3151 // intrinsic advantage to using S32 over S34 in cases where we do not have
3152 // calls but do need a frame pointer (i.e. if we are requested to have one
3153 // because frame pointer elimination is disabled). To keep things simple we
3154 // only ever use S32 as the call ABI stack pointer, and so using it does not
3155 // imply we need a separate frame pointer.
3156 //
3157 // Try to use s32 as the SP, but move it if it would interfere with input
3158 // arguments. This won't work with calls though.
3159 //
3160 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3161 // registers.
3162 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3163 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3164 } else {
3166
3167 if (MFI.hasCalls())
3168 report_fatal_error("call in graphics shader with too many input SGPRs");
3169
3170 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3171 if (!MRI.isLiveIn(Reg)) {
3172 Info.setStackPtrOffsetReg(Reg);
3173 break;
3174 }
3175 }
3176
3177 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3178 report_fatal_error("failed to find register for SP");
3179 }
3180
3181 // hasFP should be accurate for entry functions even before the frame is
3182 // finalized, because it does not rely on the known stack size, only
3183 // properties like whether variable sized objects are present.
3184 if (ST.getFrameLowering()->hasFP(MF)) {
3185 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3186 }
3187}
3188
3191 return !Info->isEntryFunction();
3192}
3193
3195
3197 MachineBasicBlock *Entry,
3198 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3200
3201 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3202 if (!IStart)
3203 return;
3204
3205 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3206 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3207 MachineBasicBlock::iterator MBBI = Entry->begin();
3208 for (const MCPhysReg *I = IStart; *I; ++I) {
3209 const TargetRegisterClass *RC = nullptr;
3210 if (AMDGPU::SReg_64RegClass.contains(*I))
3211 RC = &AMDGPU::SGPR_64RegClass;
3212 else if (AMDGPU::SReg_32RegClass.contains(*I))
3213 RC = &AMDGPU::SGPR_32RegClass;
3214 else
3215 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3216
3217 Register NewVR = MRI->createVirtualRegister(RC);
3218 // Create copy from CSR to a virtual register.
3219 Entry->addLiveIn(*I);
3220 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3221 .addReg(*I);
3222
3223 // Insert the copy-back instructions right before the terminator.
3224 for (auto *Exit : Exits)
3225 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3226 TII->get(TargetOpcode::COPY), *I)
3227 .addReg(NewVR);
3228 }
3229}
3230
3232 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3233 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3234 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3236
3238 const Function &Fn = MF.getFunction();
3241 bool IsError = false;
3242
3243 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3245 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3246 IsError = true;
3247 }
3248
3251 BitVector Skipped(Ins.size());
3252 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3253 *DAG.getContext());
3254
3255 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3256 bool IsKernel = AMDGPU::isKernel(CallConv);
3257 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3258
3259 if (IsGraphics) {
3260 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3261 assert(!UserSGPRInfo.hasDispatchPtr() &&
3262 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3263 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3264 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3265 (void)UserSGPRInfo;
3266 if (!Subtarget->enableFlatScratch())
3267 assert(!UserSGPRInfo.hasFlatScratchInit());
3268 if ((CallConv != CallingConv::AMDGPU_CS &&
3269 CallConv != CallingConv::AMDGPU_Gfx &&
3270 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3271 !Subtarget->hasArchitectedSGPRs())
3272 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3273 !Info->hasWorkGroupIDZ());
3274 }
3275
3276 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3277
3278 if (CallConv == CallingConv::AMDGPU_PS) {
3279 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3280
3281 // At least one interpolation mode must be enabled or else the GPU will
3282 // hang.
3283 //
3284 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3285 // set PSInputAddr, the user wants to enable some bits after the compilation
3286 // based on run-time states. Since we can't know what the final PSInputEna
3287    // will look like, we shouldn't do anything here and the user should take
3288 // responsibility for the correct programming.
3289 //
3290 // Otherwise, the following restrictions apply:
3291 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3292 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3293 // enabled too.
3294 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3295 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3296 CCInfo.AllocateReg(AMDGPU::VGPR0);
3297 CCInfo.AllocateReg(AMDGPU::VGPR1);
3298 Info->markPSInputAllocated(0);
3299 Info->markPSInputEnabled(0);
3300 }
3301 if (Subtarget->isAmdPalOS()) {
3302 // For isAmdPalOS, the user does not enable some bits after compilation
3303 // based on run-time states; the register values being generated here are
3304 // the final ones set in hardware. Therefore we need to apply the
3305 // workaround to PSInputAddr and PSInputEnable together. (The case where
3306 // a bit is set in PSInputAddr but not PSInputEnable is where the
3307 // frontend set up an input arg for a particular interpolation mode, but
3308 // nothing uses that input arg. Really we should have an earlier pass
3309 // that removes such an arg.)
3310 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3311 if ((PsInputBits & 0x7F) == 0 ||
3312 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3313 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3314 }
3315 } else if (IsKernel) {
3316 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3317 } else {
3318 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3319 Ins.end());
3320 }
3321
3322 if (IsKernel)
3323 analyzeFormalArgumentsCompute(CCInfo, Ins);
3324
3325 if (IsEntryFunc) {
3326 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3327 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3328 if (IsKernel && Subtarget->hasKernargPreload())
3329 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3330
3331 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3332 } else if (!IsGraphics) {
3333 // For the fixed ABI, pass workitem IDs in the last argument register.
3334 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3335
3336 // FIXME: Sink this into allocateSpecialInputSGPRs
3337 if (!Subtarget->enableFlatScratch())
3338 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3339
3340 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3341 }
3342
3343 if (!IsKernel) {
3344 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3345 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3346
3347 // This assumes the registers are allocated by CCInfo in ascending order
3348 // with no gaps.
3349 Info->setNumWaveDispatchSGPRs(
3350 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3351 Info->setNumWaveDispatchVGPRs(
3352 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3353 } else if (Info->getNumKernargPreloadedSGPRs()) {
3354 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3355 }
3356
3358
3359 if (IsWholeWaveFunc) {
3361 {MVT::i1, MVT::Other}, Chain);
3362 InVals.push_back(Setup.getValue(0));
3363 Chains.push_back(Setup.getValue(1));
3364 }
3365
3366 // FIXME: This is the minimum kernel argument alignment. We should improve
3367 // this to the maximum alignment of the arguments.
3368 //
3369 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3370 // kern arg offset.
3371 const Align KernelArgBaseAlign = Align(16);
3372
3373 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3374 ++i) {
3375 const ISD::InputArg &Arg = Ins[i];
3376 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3377 InVals.push_back(DAG.getPOISON(Arg.VT));
3378 continue;
3379 }
3380
3381 CCValAssign &VA = ArgLocs[ArgIdx++];
3382 MVT VT = VA.getLocVT();
3383
3384 if (IsEntryFunc && VA.isMemLoc()) {
3385 VT = Ins[i].VT;
3386 EVT MemVT = VA.getLocVT();
3387
3388 const uint64_t Offset = VA.getLocMemOffset();
3389 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
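// For example, an argument at byte offset 4 from the 16-byte-aligned kernarg
// base gets Align(4), while one at offset 16 keeps the full Align(16).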
3390
3391 if (Arg.Flags.isByRef()) {
3392 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3393
3394 const GCNTargetMachine &TM =
3395 static_cast<const GCNTargetMachine &>(getTargetMachine());
3396 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3397 Arg.Flags.getPointerAddrSpace())) {
3400 }
3401
3402 InVals.push_back(Ptr);
3403 continue;
3404 }
3405
3406 SDValue NewArg;
3407 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3408 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3409 // In this case the argument is packed into the previous preload SGPR.
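// For example, assuming a 2-byte argument at kernarg offset 6:
// AlignDownOffset is 4 and OffsetDiff is 2, so the value is recovered below
// by shifting the preloaded dword right by 16 bits and truncating.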
3410 int64_t AlignDownOffset = alignDown(Offset, 4);
3411 int64_t OffsetDiff = Offset - AlignDownOffset;
3412 EVT IntVT = MemVT.changeTypeToInteger();
3413
3414 const SIMachineFunctionInfo *Info =
3417 Register Reg =
3418 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3419
3420 assert(Reg);
3421 Register VReg = MRI.getLiveInVirtReg(Reg);
3422 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3423
3424 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3425 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3426
3427 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3428 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3429 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3430 Ins[i].Flags.isSExt(), &Ins[i]);
3431
3432 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3433 } else {
3434 const SIMachineFunctionInfo *Info =
3437 const SmallVectorImpl<MCRegister> &PreloadRegs =
3438 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3439
3440 SDValue Copy;
3441 if (PreloadRegs.size() == 1) {
3442 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3443 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3444 NewArg = DAG.getCopyFromReg(
3445 Chain, DL, VReg,
3447 TRI->getRegSizeInBits(*RC)));
3448
3449 } else {
3450 // If the kernarg alignment does not match the alignment of the SGPR
3451 // tuple RC that can accommodate this argument, it will be built up
3452 // via copies from the individual SGPRs that the argument was
3453 // preloaded to.
3455 for (auto Reg : PreloadRegs) {
3456 Register VReg = MRI.getLiveInVirtReg(Reg);
3457 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3458 Elts.push_back(Copy);
3459 }
3460 NewArg =
3461 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3462 PreloadRegs.size()),
3463 DL, Elts);
3464 }
3465
3466 // If the argument was preloaded to multiple consecutive 32-bit
3467 // registers because of misalignment between addressable SGPR tuples
3468 // and the argument size, we can still assume, because of kernarg
3469 // segment alignment restrictions, that NewArg's size is the same as
3470 // MemVT and just do a bitcast. If MemVT is less than 32 bits, we add a
3471 // truncate since we cannot preload to less than a single SGPR and the
3472 // MemVT may be smaller.
3473 EVT MemVTInt =
3475 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3476 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3477
3478 NewArg = DAG.getBitcast(MemVT, NewArg);
3479 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3480 Ins[i].Flags.isSExt(), &Ins[i]);
3481 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3482 }
3483 } else {
3484 // Hidden arguments that are in the kernel signature must be preloaded
3485 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3486 // the argument list and is not preloaded.
3487 if (Arg.isOrigArg()) {
3488 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3489 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3491 *OrigArg->getParent(),
3492 "hidden argument in kernel signature was not preloaded",
3493 DL.getDebugLoc()));
3494 }
3495 }
3496
3497 NewArg =
3498 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3499 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3500 }
3501 Chains.push_back(NewArg.getValue(1));
3502
3503 auto *ParamTy =
3504 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3505 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3506 ParamTy &&
3507 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3508 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3509 // On SI, local pointers are just offsets into LDS, so they always
3510 // fit in 16 bits. On CI and newer they could potentially be
3511 // real pointers, so we can't guarantee their size.
3512 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3513 DAG.getValueType(MVT::i16));
3514 }
3515
3516 InVals.push_back(NewArg);
3517 continue;
3518 }
3519 if (!IsEntryFunc && VA.isMemLoc()) {
3520 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3521 InVals.push_back(Val);
3522 if (!Arg.Flags.isByVal())
3523 Chains.push_back(Val.getValue(1));
3524 continue;
3525 }
3526
3527 assert(VA.isRegLoc() && "Parameter must be in a register!");
3528
3529 Register Reg = VA.getLocReg();
3530 const TargetRegisterClass *RC = nullptr;
3531 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3532 RC = &AMDGPU::VGPR_32RegClass;
3533 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3534 RC = &AMDGPU::SGPR_32RegClass;
3535 else
3536 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3537
3538 Reg = MF.addLiveIn(Reg, RC);
3539 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3540
3541 if (Arg.Flags.isSRet()) {
3542 // The return object should be reasonably addressable.
3543
3544 // FIXME: This helps when the return is a real sret. If it is an
3545 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3546 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3547 unsigned NumBits =
3549 Val = DAG.getNode(
3550 ISD::AssertZext, DL, VT, Val,
3551 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3552 }
3553
3554 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3555 InVals.push_back(Val);
3556 }
3557
3558 // Start adding system SGPRs.
3559 if (IsEntryFunc)
3560 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3561
3562 // DAG.getPass() returns nullptr when using the new pass manager.
3563 // TODO: Use DAG.getMFAM() to access the analysis result.
3564 if (DAG.getPass()) {
3565 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3566 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3567 }
3568
3569 unsigned StackArgSize = CCInfo.getStackSize();
3570 Info->setBytesInStackArgArea(StackArgSize);
3571
3572 return Chains.empty() ? Chain
3573 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3574}
3575
3576// TODO: If return values can't fit in registers, we should return as many as
3577// possible in registers before passing on stack.
3579 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3580 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3581 const Type *RetTy) const {
3582 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3583 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3584 // for shaders. Vector types should be explicitly handled by CC.
3585 if (AMDGPU::isEntryFunctionCC(CallConv))
3586 return true;
3587
3589 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3590 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3591 return false;
3592
3593 // We must use the stack if return would require unavailable registers.
3594 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3595 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3596 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3597 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3598 return false;
3599
3600 return true;
3601}
3602
3603SDValue
3605 bool isVarArg,
3607 const SmallVectorImpl<SDValue> &OutVals,
3608 const SDLoc &DL, SelectionDAG &DAG) const {
3612
3613 if (AMDGPU::isKernel(CallConv)) {
3614 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3615 OutVals, DL, DAG);
3616 }
3617
3618 bool IsShader = AMDGPU::isShader(CallConv);
3619
3620 Info->setIfReturnsVoid(Outs.empty());
3621 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3622
3623 // CCValAssign - represent the assignment of the return value to a location.
3625
3626 // CCState - Info about the registers and stack slots.
3627 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3628 *DAG.getContext());
3629
3630 // Analyze outgoing return values.
3631 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3632
3633 SDValue Glue;
3635 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3636
3637 SDValue ReadFirstLane =
3638 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3639 // Copy the result values into the output registers.
3640 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3641 ++I, ++RealRVLocIdx) {
3642 CCValAssign &VA = RVLocs[I];
3643 assert(VA.isRegLoc() && "Can only return in registers!");
3644 // TODO: Partially return in registers if return values don't fit.
3645 SDValue Arg = OutVals[RealRVLocIdx];
3646
3647 // Copied from other backends.
3648 switch (VA.getLocInfo()) {
3649 case CCValAssign::Full:
3650 break;
3651 case CCValAssign::BCvt:
3652 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3653 break;
3654 case CCValAssign::SExt:
3655 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3656 break;
3657 case CCValAssign::ZExt:
3658 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3659 break;
3660 case CCValAssign::AExt:
3661 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3662 break;
3663 default:
3664 llvm_unreachable("Unknown loc info!");
3665 }
3666 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3668 ReadFirstLane, Arg);
3669 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3670 Glue = Chain.getValue(1);
3671 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3672 }
3673
3674 // FIXME: Does sret work properly?
3675 if (!Info->isEntryFunction()) {
3676 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3677 const MCPhysReg *I =
3678 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3679 if (I) {
3680 for (; *I; ++I) {
3681 if (AMDGPU::SReg_64RegClass.contains(*I))
3682 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3683 else if (AMDGPU::SReg_32RegClass.contains(*I))
3684 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3685 else
3686 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3687 }
3688 }
3689 }
3690
3691 // Update chain and glue.
3692 RetOps[0] = Chain;
3693 if (Glue.getNode())
3694 RetOps.push_back(Glue);
3695
3696 unsigned Opc = AMDGPUISD::ENDPGM;
3697 if (!IsWaveEnd)
3698 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3699 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3701 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3702}
3703
3705 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3706 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3707 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3708 SDValue ThisVal) const {
3709 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3710
3711 // Assign locations to each value returned by this call.
3713 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3714 *DAG.getContext());
3715 CCInfo.AnalyzeCallResult(Ins, RetCC);
3716
3717 // Copy all of the result registers out of their specified physreg.
3718 for (CCValAssign VA : RVLocs) {
3719 SDValue Val;
3720
3721 if (VA.isRegLoc()) {
3722 Val =
3723 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3724 Chain = Val.getValue(1);
3725 InGlue = Val.getValue(2);
3726 } else if (VA.isMemLoc()) {
3727 report_fatal_error("TODO: return values in memory");
3728 } else
3729 llvm_unreachable("unknown argument location type");
3730
3731 switch (VA.getLocInfo()) {
3732 case CCValAssign::Full:
3733 break;
3734 case CCValAssign::BCvt:
3735 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3736 break;
3737 case CCValAssign::ZExt:
3738 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3739 DAG.getValueType(VA.getValVT()));
3740 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3741 break;
3742 case CCValAssign::SExt:
3743 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3744 DAG.getValueType(VA.getValVT()));
3745 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3746 break;
3747 case CCValAssign::AExt:
3748 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3749 break;
3750 default:
3751 llvm_unreachable("Unknown loc info!");
3752 }
3753
3754 InVals.push_back(Val);
3755 }
3756
3757 return Chain;
3758}
3759
3760 // Add code to pass the special inputs required by the features in use,
3761 // separate from the explicit user arguments present in the IR.
3763 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3764 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3765 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3766 // If we don't have a call site, this was a call inserted by
3767 // legalization. These can never use special inputs.
3768 if (!CLI.CB)
3769 return;
3770
3771 SelectionDAG &DAG = CLI.DAG;
3772 const SDLoc &DL = CLI.DL;
3773 const Function &F = DAG.getMachineFunction().getFunction();
3774
3775 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3776 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3777
3778 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3780 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3781 // DAG.getPass() returns nullptr when using the new pass manager.
3782 // TODO: Use DAG.getMFAM() to access the analysis result.
3783 if (DAG.getPass()) {
3784 auto &ArgUsageInfo =
3786 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3787 }
3788 }
3789
3790 // TODO: Unify with private memory register handling. This is complicated by
3791 // the fact that at least in kernels, the input argument is not necessarily
3792 // in the same location as the input.
3793 // clang-format off
3794 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3795 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3796 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3797 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3798 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3799 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3800 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3803 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3804 };
3805 // clang-format on
3806
3807 for (auto [InputID, Attrs] : ImplicitAttrs) {
3808 // If the callee does not use the attribute value, skip copying the value.
3809 if (all_of(Attrs, [&](StringRef Attr) {
3810 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3811 }))
3812 continue;
3813
3814 const auto [OutgoingArg, ArgRC, ArgTy] =
3815 CalleeArgInfo->getPreloadedValue(InputID);
3816 if (!OutgoingArg)
3817 continue;
3818
3819 const auto [IncomingArg, IncomingArgRC, Ty] =
3820 CallerArgInfo.getPreloadedValue(InputID);
3821 assert(IncomingArgRC == ArgRC);
3822
3823 // All special arguments are ints for now.
3824 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3825 SDValue InputReg;
3826
3827 if (IncomingArg) {
3828 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3829 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3830 // The implicit arg ptr is special because it doesn't have a corresponding
3831 // input for kernels, and is computed from the kernarg segment pointer.
3832 InputReg = getImplicitArgPtr(DAG, DL);
3833 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3834 std::optional<uint32_t> Id =
3836 if (Id.has_value()) {
3837 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3838 } else {
3839 InputReg = DAG.getPOISON(ArgVT);
3840 }
3841 } else {
3842 // We may have proven the input wasn't needed, although the ABI
3843 // requires it. We just need to allocate the register appropriately.
3844 InputReg = DAG.getPOISON(ArgVT);
3845 }
3846
3847 if (OutgoingArg->isRegister()) {
3848 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3849 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3850 report_fatal_error("failed to allocate implicit input argument");
3851 } else {
3852 unsigned SpecialArgOffset =
3853 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3854 SDValue ArgStore =
3855 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3856 MemOpChains.push_back(ArgStore);
3857 }
3858 }
3859
3860 // Pack workitem IDs into a single register, or pass them as-is if already
3861 // packed.
3862
3863 auto [OutgoingArg, ArgRC, Ty] =
3865 if (!OutgoingArg)
3866 std::tie(OutgoingArg, ArgRC, Ty) =
3868 if (!OutgoingArg)
3869 std::tie(OutgoingArg, ArgRC, Ty) =
3871 if (!OutgoingArg)
3872 return;
3873
3874 const ArgDescriptor *IncomingArgX = std::get<0>(
3876 const ArgDescriptor *IncomingArgY = std::get<0>(
3878 const ArgDescriptor *IncomingArgZ = std::get<0>(
3880
3881 SDValue InputReg;
3882 SDLoc SL;
3883
3884 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3885 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3886 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3887
3888 // If the incoming IDs are not packed, we need to pack them.
3889 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3890 NeedWorkItemIDX) {
3891 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3892 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3893 } else {
3894 InputReg = DAG.getConstant(0, DL, MVT::i32);
3895 }
3896 }
3897
3898 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3899 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3900 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3901 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3902 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3903 InputReg = InputReg.getNode()
3904 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3905 : Y;
3906 }
3907
3908 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3909 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3910 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3911 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3912 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3913 InputReg = InputReg.getNode()
3914 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3915 : Z;
3916 }
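// As an illustration of the packed layout produced above: the X id ends up
// in bits [9:0], the Y id in bits [19:10], and the Z id in bits [29:20] of
// the single i32 passed to the callee, matching the 10- and 20-bit shifts.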
3917
3918 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3919 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3920 // We're in a situation where the outgoing function requires the workitem
3921 // ID, but the calling function does not have it (e.g. a graphics function
3922 // calling a C calling convention function). This is illegal, but we need
3923 // to produce something.
3924 InputReg = DAG.getPOISON(MVT::i32);
3925 } else {
3926 // Workitem IDs are already packed; any present incoming argument
3927 // will carry all the required fields.
3928 ArgDescriptor IncomingArg =
3929 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3930 : IncomingArgY ? *IncomingArgY
3931 : *IncomingArgZ,
3932 ~0u);
3933 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3934 }
3935 }
3936
3937 if (OutgoingArg->isRegister()) {
3938 if (InputReg)
3939 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3940
3941 CCInfo.AllocateReg(OutgoingArg->getRegister());
3942 } else {
3943 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3944 if (InputReg) {
3945 SDValue ArgStore =
3946 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3947 MemOpChains.push_back(ArgStore);
3948 }
3949 }
3950}
3951
3953 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3955 const SmallVectorImpl<SDValue> &OutVals,
3956 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3957 if (AMDGPU::isChainCC(CalleeCC))
3958 return true;
3959
3960 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3961 return false;
3962
3963 // For a divergent call target, we need to do a waterfall loop over the
3964 // possible callees, which precludes us from using a simple jump.
3965 if (Callee->isDivergent())
3966 return false;
3967
3969 const Function &CallerF = MF.getFunction();
3970 CallingConv::ID CallerCC = CallerF.getCallingConv();
3972 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3973
3974 // Kernels aren't callable, and don't have a live-in return address, so it
3975 // doesn't make sense to do a tail call with entry functions.
3976 if (!CallerPreserved)
3977 return false;
3978
3979 bool CCMatch = CallerCC == CalleeCC;
3980
3982 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3983 return true;
3984 return false;
3985 }
3986
3987 // TODO: Can we handle var args?
3988 if (IsVarArg)
3989 return false;
3990
3991 for (const Argument &Arg : CallerF.args()) {
3992 if (Arg.hasByValAttr())
3993 return false;
3994 }
3995
3996 LLVMContext &Ctx = *DAG.getContext();
3997
3998 // Check that the call results are passed in the same way.
3999 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4000 CCAssignFnForCall(CalleeCC, IsVarArg),
4001 CCAssignFnForCall(CallerCC, IsVarArg)))
4002 return false;
4003
4004 // The callee has to preserve all registers the caller needs to preserve.
4005 if (!CCMatch) {
4006 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4007 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4008 return false;
4009 }
4010
4011 // Nothing more to check if the callee is taking no arguments.
4012 if (Outs.empty())
4013 return true;
4014
4016 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4017
4018 // FIXME: We are not allocating special input registers, so we will be
4019 // deciding based on incorrect register assignments.
4020 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4021
4022 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4023 // If the stack arguments for this call do not fit into our own save area,
4024 // then the call cannot be made a tail call.
4025 // TODO: Is this really necessary?
4026 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4027 return false;
4028
4029 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4030 // FIXME: What about inreg arguments that end up passed in memory?
4031 if (!CCVA.isRegLoc())
4032 continue;
4033
4034 // If we are passing an argument in an SGPR, and the value is divergent,
4035 // this call requires a waterfall loop.
4036 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4037 LLVM_DEBUG(
4038 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4039 << printReg(CCVA.getLocReg(), TRI) << '\n');
4040 return false;
4041 }
4042 }
4043
4044 const MachineRegisterInfo &MRI = MF.getRegInfo();
4045 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4046}
4047
4049 if (!CI->isTailCall())
4050 return false;
4051
4052 const Function *ParentFn = CI->getParent()->getParent();
4054 return false;
4055 return true;
4056}
4057
4058namespace {
4059// Chain calls have special arguments that we need to handle. These are
4060// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4061// arguments (index 0 and 1 respectively).
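// For example, with flags == 0 the list ends right after the EXEC value;
// when bit 0 of the flags is set (the dynamic-VGPR case handled below,
// wave32 only), three more arguments follow: the number of VGPRs, the
// fallback EXEC mask, and the fallback callee.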
4062enum ChainCallArgIdx {
4063 Exec = 2,
4064 Flags,
4065 NumVGPRs,
4066 FallbackExec,
4067 FallbackCallee
4068};
4069} // anonymous namespace
4070
4071// The wave scratch offset register is used as the global base pointer.
4073 SmallVectorImpl<SDValue> &InVals) const {
4074 CallingConv::ID CallConv = CLI.CallConv;
4075 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4076
4077 SelectionDAG &DAG = CLI.DAG;
4078
4079 const SDLoc &DL = CLI.DL;
4080 SDValue Chain = CLI.Chain;
4081 SDValue Callee = CLI.Callee;
4082
4083 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4084 bool UsesDynamicVGPRs = false;
4085 if (IsChainCallConv) {
4086 // The last arguments should be the value that we need to put in EXEC,
4087 // followed by the flags and any other arguments with special meanings.
4088 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4089 // we don't treat them like the "real" arguments.
4090 auto RequestedExecIt =
4091 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4092 return Arg.OrigArgIndex == 2;
4093 });
4094 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4095
4096 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4097 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4098 CLI.OutVals.end());
4099 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4100
4101 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4102 "Haven't popped all the special args");
4103
4104 TargetLowering::ArgListEntry RequestedExecArg =
4105 CLI.Args[ChainCallArgIdx::Exec];
4106 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4107 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4108
4109 // Convert constants into TargetConstants, so they become immediate operands
4110 // instead of being selected into S_MOV.
4111 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4112 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4113 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4114 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4115 } else
4116 ChainCallSpecialArgs.push_back(Arg.Node);
4117 };
4118
4119 PushNodeOrTargetConstant(RequestedExecArg);
4120
4121 // Process any other special arguments depending on the value of the flags.
4122 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4123
4124 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4125 if (FlagsValue.isZero()) {
4126 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4127 return lowerUnhandledCall(CLI, InVals,
4128 "no additional args allowed if flags == 0");
4129 } else if (FlagsValue.isOneBitSet(0)) {
4130 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4131 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4132 }
4133
4134 if (!Subtarget->isWave32()) {
4135 return lowerUnhandledCall(
4136 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4137 }
4138
4139 UsesDynamicVGPRs = true;
4140 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4141 CLI.Args.end(), PushNodeOrTargetConstant);
4142 }
4143 }
4144
4146 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4148 bool &IsTailCall = CLI.IsTailCall;
4149 bool IsVarArg = CLI.IsVarArg;
4150 bool IsSibCall = false;
4152
4153 if (Callee.isUndef() || isNullConstant(Callee)) {
4154 if (!CLI.IsTailCall) {
4155 for (ISD::InputArg &Arg : CLI.Ins)
4156 InVals.push_back(DAG.getPOISON(Arg.VT));
4157 }
4158
4159 return Chain;
4160 }
4161
4162 if (IsVarArg) {
4163 return lowerUnhandledCall(CLI, InVals,
4164 "unsupported call to variadic function ");
4165 }
4166
4167 if (!CLI.CB)
4168 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4169
4170 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4171 return lowerUnhandledCall(CLI, InVals,
4172 "unsupported required tail call to function ");
4173 }
4174
4175 if (IsTailCall) {
4176 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4177 Outs, OutVals, Ins, DAG);
4178 if (!IsTailCall &&
4179 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4180 report_fatal_error("failed to perform tail call elimination on a call "
4181 "site marked musttail or on llvm.amdgcn.cs.chain");
4182 }
4183
4184 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4185
4186 // A sibling call is one where we're under the usual C ABI and not planning
4187 // to change that but can still do a tail call:
4188 if (!TailCallOpt && IsTailCall)
4189 IsSibCall = true;
4190
4191 if (IsTailCall)
4192 ++NumTailCalls;
4193 }
4194
4197 SmallVector<SDValue, 8> MemOpChains;
4198
4199 // Analyze operands of the call, assigning locations to each operand.
4201 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4202 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4203
4204 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4206 // With a fixed ABI, allocate fixed registers before user arguments.
4207 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4208 }
4209
4210 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4211
4212 // Get a count of how many bytes are to be pushed on the stack.
4213 unsigned NumBytes = CCInfo.getStackSize();
4214
4215 if (IsSibCall) {
4216 // Since we're not changing the ABI to make this a tail call, the memory
4217 // operands are already available in the caller's incoming argument space.
4218 NumBytes = 0;
4219 }
4220
4221 // FPDiff is the byte offset of the call's argument area from the callee's.
4222 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4223 // by this amount for a tail call. In a sibling call it must be 0 because the
4224 // caller will deallocate the entire stack and the callee still expects its
4225 // arguments to begin at SP+0. Completely unused for non-tail calls.
4226 int32_t FPDiff = 0;
4227 MachineFrameInfo &MFI = MF.getFrameInfo();
4228 auto *TRI = Subtarget->getRegisterInfo();
4229
4230 // Adjust the stack pointer for the new arguments...
4231 // These operations are automatically eliminated by the prolog/epilog pass
4232 if (!IsSibCall)
4233 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4234
4235 if (!IsSibCall || IsChainCallConv) {
4236 if (!Subtarget->enableFlatScratch()) {
4237 SmallVector<SDValue, 4> CopyFromChains;
4238
4239 // In the HSA case, this should be an identity copy.
4240 SDValue ScratchRSrcReg =
4241 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4242 RegsToPass.emplace_back(IsChainCallConv
4243 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4244 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4245 ScratchRSrcReg);
4246 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4247 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4248 }
4249 }
4250
4251 const unsigned NumSpecialInputs = RegsToPass.size();
4252
4253 MVT PtrVT = MVT::i32;
4254
4255 // Walk the register/memloc assignments, inserting copies/loads.
4256 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4257 CCValAssign &VA = ArgLocs[i];
4258 SDValue Arg = OutVals[i];
4259
4260 // Promote the value if needed.
4261 switch (VA.getLocInfo()) {
4262 case CCValAssign::Full:
4263 break;
4264 case CCValAssign::BCvt:
4265 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4266 break;
4267 case CCValAssign::ZExt:
4268 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4269 break;
4270 case CCValAssign::SExt:
4271 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4272 break;
4273 case CCValAssign::AExt:
4274 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4275 break;
4276 case CCValAssign::FPExt:
4277 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4278 break;
4279 default:
4280 llvm_unreachable("Unknown loc info!");
4281 }
4282
4283 if (VA.isRegLoc()) {
4284 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4285 } else {
4286 assert(VA.isMemLoc());
4287
4288 SDValue DstAddr;
4289 MachinePointerInfo DstInfo;
4290
4291 unsigned LocMemOffset = VA.getLocMemOffset();
4292 int32_t Offset = LocMemOffset;
4293
4294 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4295 MaybeAlign Alignment;
4296
4297 if (IsTailCall) {
4298 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4299 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4300 : VA.getValVT().getStoreSize();
4301
4302 // FIXME: We can have better than the minimum byval required alignment.
4303 Alignment =
4304 Flags.isByVal()
4305 ? Flags.getNonZeroByValAlign()
4306 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4307
4308 Offset = Offset + FPDiff;
4309 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4310
4311 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4312 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4313
4314 // Make sure any stack arguments overlapping with where we're storing
4315 // are loaded before this eventual operation. Otherwise they'll be
4316 // clobbered.
4317
4318 // FIXME: Why is this really necessary? This seems to just result in a
4319 // lot of code to copy the stack and write them back to the same
4320 // locations, which are supposed to be immutable?
4321 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4322 } else {
4323 // Stores to the argument stack area are relative to the stack pointer.
4324 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4325 MVT::i32);
4326 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4327 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4328 Alignment =
4329 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4330 }
4331
4332 if (Outs[i].Flags.isByVal()) {
4333 SDValue SizeNode =
4334 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4335 SDValue Cpy =
4336 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4337 Outs[i].Flags.getNonZeroByValAlign(),
4338 /*isVol = */ false, /*AlwaysInline = */ true,
4339 /*CI=*/nullptr, std::nullopt, DstInfo,
4341
4342 MemOpChains.push_back(Cpy);
4343 } else {
4344 SDValue Store =
4345 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4346 MemOpChains.push_back(Store);
4347 }
4348 }
4349 }
4350
4351 if (!MemOpChains.empty())
4352 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4353
4354 SDValue ReadFirstLaneID =
4355 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4356
4357 SDValue TokenGlue;
4358 if (CLI.ConvergenceControlToken) {
4359 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4361 }
4362
4363 // Build a sequence of copy-to-reg nodes chained together with token chain
4364 // and flag operands which copy the outgoing args into the appropriate regs.
4365 SDValue InGlue;
4366
4367 unsigned ArgIdx = 0;
4368 for (auto [Reg, Val] : RegsToPass) {
4369 if (ArgIdx++ >= NumSpecialInputs &&
4370 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4371 // For chain calls, the inreg arguments are required to be
4372 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4373 // they are uniform.
4374 //
4375 // For other calls, if an inreg argument is known to be uniform,
4376 // speculatively insert a readfirstlane in case it is in a VGPR.
4377 //
4378 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4379 // value, so let that continue to produce invalid code.
4380
4381 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4382 if (TokenGlue)
4383 ReadfirstlaneArgs.push_back(TokenGlue);
4385 ReadfirstlaneArgs);
4386 }
4387
4388 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4389 InGlue = Chain.getValue(1);
4390 }
4391
4392 // We don't usually want to end the call-sequence here because we would tidy
4393 // the frame up *after* the call. However, in the ABI-changing tail-call case
4394 // we've carefully laid out the parameters so that when sp is reset they'll be
4395 // in the correct location.
4396 if (IsTailCall && !IsSibCall) {
4397 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4398 InGlue = Chain.getValue(1);
4399 }
4400
4401 std::vector<SDValue> Ops({Chain});
4402
4403 // Add a redundant copy of the callee global which will not be legalized, as
4404 // we need direct access to the callee later.
4406 const GlobalValue *GV = GSD->getGlobal();
4407 Ops.push_back(Callee);
4408 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4409 } else {
4410 if (IsTailCall) {
4411 // isEligibleForTailCallOptimization considered whether the call target is
4412 // divergent, but we may still end up with a uniform value in a VGPR.
4413 // Insert a readfirstlane just in case.
4414 SDValue ReadFirstLaneID =
4415 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4416
4417 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4418 if (TokenGlue)
4419 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4420 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4421 ReadfirstlaneArgs);
4422 }
4423
4424 Ops.push_back(Callee);
4425 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4426 }
4427
4428 if (IsTailCall) {
4429 // Each tail call may have to adjust the stack by a different amount, so
4430 // this information must travel along with the operation for eventual
4431 // consumption by emitEpilogue.
4432 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4433 }
4434
4435 if (IsChainCallConv)
4436 llvm::append_range(Ops, ChainCallSpecialArgs);
4437
4438 // Add argument registers to the end of the list so that they are known live
4439 // into the call.
4440 for (auto &[Reg, Val] : RegsToPass)
4441 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4442
4443 // Add a register mask operand representing the call-preserved registers.
4444 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4445 assert(Mask && "Missing call preserved mask for calling convention");
4446 Ops.push_back(DAG.getRegisterMask(Mask));
4447
4448 if (SDValue Token = CLI.ConvergenceControlToken) {
4450 GlueOps.push_back(Token);
4451 if (InGlue)
4452 GlueOps.push_back(InGlue);
4453
4454 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4455 MVT::Glue, GlueOps),
4456 0);
4457 }
4458
4459 if (InGlue)
4460 Ops.push_back(InGlue);
4461
4462 // If we're doing a tail call, use a TC_RETURN here rather than an
4463 // actual call instruction.
4464 if (IsTailCall) {
4465 MFI.setHasTailCall();
4466 unsigned OPC = AMDGPUISD::TC_RETURN;
4467 switch (CallConv) {
4470 break;
4473 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4475 break;
4476 }
4477
4478 // If the caller is a whole wave function, we need to use a special opcode
4479 // so we can patch up EXEC.
4480 if (Info->isWholeWaveFunction())
4482
4483 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4484 }
4485
4486 // Returns a chain and a flag for retval copy to use.
4487 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4488 Chain = Call.getValue(0);
4489 InGlue = Call.getValue(1);
4490
4491 uint64_t CalleePopBytes = NumBytes;
4492 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4493 if (!Ins.empty())
4494 InGlue = Chain.getValue(1);
4495
4496 // Handle result values, copying them out of physregs into vregs that we
4497 // return.
4498 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4499 InVals, /*IsThisReturn=*/false, SDValue());
4500}
4501
4502// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4503// except for:
4504 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4505 // 2. Scaled allocation size, where scaled size = wave-reduction(alloca-size) * wave-size
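// For example, assuming a wave64 target where each lane allocates at most 16
// bytes, the wave-wide maximum (16) is shifted left by log2(64) = 6, so the
// stack pointer advances by 1024 bytes for the whole wave.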
4507 SelectionDAG &DAG) const {
4508 const MachineFunction &MF = DAG.getMachineFunction();
4510
4511 SDLoc dl(Op);
4512 EVT VT = Op.getValueType();
4513 SDValue Chain = Op.getOperand(0);
4514 Register SPReg = Info->getStackPtrOffsetReg();
4515
4516 // Chain the dynamic stack allocation so that it doesn't modify the stack
4517 // pointer when other instructions are using the stack.
4518 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4519
4520 SDValue Size = Op.getOperand(1);
4521 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4522 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4523
4524 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4526 "Stack grows upwards for AMDGPU");
4527
4528 Chain = BaseAddr.getValue(1);
4529 Align StackAlign = TFL->getStackAlign();
4530 if (Alignment > StackAlign) {
4531 uint64_t ScaledAlignment = Alignment.value()
4532 << Subtarget->getWavefrontSizeLog2();
4533 uint64_t StackAlignMask = ScaledAlignment - 1;
4534 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4535 DAG.getConstant(StackAlignMask, dl, VT));
4536 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4537 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4538 }
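// For example, with a requested alignment of 8 on a wave64 target, the code
// above scales it to 8 << 6 = 512 and rounds BaseAddr up to a 512-byte
// boundary before the scaled allocation size is added below.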
4539
4540 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4541 SDValue NewSP;
4543 // For a constant-sized alloca, scale the alloca size by the wave size.
4544 SDValue ScaledSize = DAG.getNode(
4545 ISD::SHL, dl, VT, Size,
4546 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4547 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4548 } else {
4549 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4550 // max of the (divergent) alloca size, and then scale it by the wave size.
4551 SDValue WaveReduction =
4552 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4553 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4554 Size, DAG.getConstant(0, dl, MVT::i32));
4555 SDValue ScaledSize = DAG.getNode(
4556 ISD::SHL, dl, VT, Size,
4557 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4558 NewSP =
4559 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4560 SDValue ReadFirstLaneID =
4561 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4562 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4563 NewSP);
4564 }
4565
4566 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4567 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4568
4569 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4570}
4571
4573 if (Op.getValueType() != MVT::i32)
4574 return Op; // Defer to cannot select error.
4575
4577 SDLoc SL(Op);
4578
4579 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4580
4581 // Convert from wave uniform to swizzled vector address. This should protect
4582 // from any edge cases where the stacksave result isn't directly used with
4583 // stackrestore.
4584 SDValue VectorAddress =
4585 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4586 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4587}
4588
4590 SelectionDAG &DAG) const {
4591 SDLoc SL(Op);
4592 assert(Op.getValueType() == MVT::i32);
4593
4594 uint32_t BothRoundHwReg =
4596 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4597
4598 SDValue IntrinID =
4599 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4600 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4601 Op.getOperand(0), IntrinID, GetRoundBothImm);
4602
4603 // There are two rounding modes, one for f32 and one for f64/f16. We only
4604 // report in the standard value range if both are the same.
4605 //
4606 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4607 // ties away from zero is not supported, and the other values are rotated by
4608 // 1.
4609 //
4610 // If the two rounding modes are not the same, report a target defined value.
4611
4612 // Mode register rounding mode fields:
4613 //
4614 // [1:0] Single-precision round mode.
4615 // [3:2] Double/Half-precision round mode.
4616 //
4617 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4618 //
4619 //                 Hardware   Spec
4620 //  Toward-0           3        0
4621 //  Nearest Even       0        1
4622 //  +Inf               1        2
4623 //  -Inf               2        3
4624 //  NearestAway0      N/A       4
4625 //
4626 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4627 // table we can index by the raw hardware mode.
4628 //
4629 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
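// For example, when MODE.fp_round is 0 (both fields round-to-nearest-even),
// the shift amount is 0 and the low 4 bits of the table yield 1, the
// standard FLT_ROUNDS value for round-to-nearest (the Spec column above).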
4630
4631 SDValue BitTable =
4633
4634 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4635 SDValue RoundModeTimesNumBits =
4636 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4637
4638 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4639 // knew only one mode was demanded.
4640 SDValue TableValue =
4641 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4642 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4643
4644 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4645 SDValue TableEntry =
4646 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4647
4648 // There's a gap between the 4-bit encoded table values and the actual enum
4649 // values, so offset if it's an extended value.
4650 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4651 SDValue IsStandardValue =
4652 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4653 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4654 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4655 TableEntry, EnumOffset);
4656
4657 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4658}
4659
4661 SelectionDAG &DAG) const {
4662 SDLoc SL(Op);
4663
4664 SDValue NewMode = Op.getOperand(1);
4665 assert(NewMode.getValueType() == MVT::i32);
4666
4667 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4668 // hardware MODE.fp_round values.
4669 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4670 uint32_t ClampedVal = std::min(
4671 static_cast<uint32_t>(ConstMode->getZExtValue()),
4673 NewMode = DAG.getConstant(
4674 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4675 } else {
4676 // If we know the input can only be one of the supported standard modes in
4677 // the range 0-3, we can use a simplified mapping to hardware values.
4678 KnownBits KB = DAG.computeKnownBits(NewMode);
4679 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4680 // The supported standard values are 0-3. The extended values start at 8. We
4681 // need to offset by 4 if the value is in the extended range.
4682
4683 if (UseReducedTable) {
4684 // Truncate to the low 32-bits.
4685 SDValue BitTable = DAG.getConstant(
4686 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4687
4688 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4689 SDValue RoundModeTimesNumBits =
4690 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4691
4692 NewMode =
4693 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4694
4695 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4696 // the table extracted bits into inline immediates.
4697 } else {
4698 // table_index = umin(value, value - 4)
4699 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
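// For example, for the standard value 2, (2 - 4) wraps to a large unsigned
// number, so the umin keeps the index at 2; for the first extended value 8,
// the index becomes 4, where the extended entries of the table begin.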
4700 SDValue BitTable =
4702
4703 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4704 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4705 SDValue IndexVal =
4706 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4707
4708 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4709 SDValue RoundModeTimesNumBits =
4710 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4711
4712 SDValue TableValue =
4713 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4714 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4715
4716 // No need to mask out the high bits since the setreg will ignore them
4717 // anyway.
4718 NewMode = TruncTable;
4719 }
4720
4721 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4722 // earlier and keep more operations scalar, but that interferes with
4723 // combining the source.
4724 SDValue ReadFirstLaneID =
4725 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4726 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4727 ReadFirstLaneID, NewMode);
4728 }
4729
4730 // N.B. The setreg will later be folded into s_round_mode on supported
4731 // targets.
4732 SDValue IntrinID =
4733 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4734 uint32_t BothRoundHwReg =
4736 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4737
4738 SDValue SetReg =
4739 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4740 IntrinID, RoundBothImm, NewMode);
4741
4742 return SetReg;
4743}
4744
4746 if (Op->isDivergent() &&
4747 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4748 // Cannot do I$ prefetch with divergent pointer.
4749 return SDValue();
4750
4751 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4755 break;
4757 if (Subtarget->hasSafeSmemPrefetch())
4758 break;
4759 [[fallthrough]];
4760 default:
4761 return SDValue();
4762 }
4763
4764 // I$ prefetch
4765 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4766 return SDValue();
4767
4768 return Op;
4769}
4770
4771 // Work around DAG legality rules that are based only on the result type.
4773 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4774 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4775 EVT SrcVT = Src.getValueType();
4776
4777 if (SrcVT.getScalarType() != MVT::bf16)
4778 return Op;
4779
4780 SDLoc SL(Op);
4781 SDValue BitCast =
4782 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4783
4784 EVT DstVT = Op.getValueType();
4785 if (IsStrict)
4786 llvm_unreachable("Need STRICT_BF16_TO_FP");
4787
4788 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4789}
4790
4792 SDLoc SL(Op);
4793 if (Op.getValueType() != MVT::i64)
4794 return Op;
4795
4796 uint32_t ModeHwReg =
4798 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4799 uint32_t TrapHwReg =
4801 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4802
4803 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4804 SDValue IntrinID =
4805 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4806 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4807 Op.getOperand(0), IntrinID, ModeHwRegImm);
4808 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4809 Op.getOperand(0), IntrinID, TrapHwRegImm);
4810 SDValue TokenReg =
4811 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4812 GetTrapReg.getValue(1));
4813
4814 SDValue CvtPtr =
4815 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4816 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4817
4818 return DAG.getMergeValues({Result, TokenReg}, SL);
4819}
4820
4822 SDLoc SL(Op);
4823 if (Op.getOperand(1).getValueType() != MVT::i64)
4824 return Op;
4825
4826 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4827 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4828 DAG.getConstant(0, SL, MVT::i32));
4829 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4830 DAG.getConstant(1, SL, MVT::i32));
4831
4832 SDValue ReadFirstLaneID =
4833 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4834 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4835 ReadFirstLaneID, NewModeReg);
4836 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4837 ReadFirstLaneID, NewTrapReg);
4838
4839 unsigned ModeHwReg =
4841 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4842 unsigned TrapHwReg =
4844 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4845
4846 SDValue IntrinID =
4847 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4848 SDValue SetModeReg =
4849 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4850 IntrinID, ModeHwRegImm, NewModeReg);
4851 SDValue SetTrapReg =
4852 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4853 IntrinID, TrapHwRegImm, NewTrapReg);
4854 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4855}
4856
4858 const MachineFunction &MF) const {
4859 const Function &Fn = MF.getFunction();
4860
4862 .Case("m0", AMDGPU::M0)
4863 .Case("exec", AMDGPU::EXEC)
4864 .Case("exec_lo", AMDGPU::EXEC_LO)
4865 .Case("exec_hi", AMDGPU::EXEC_HI)
4866 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4867 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4868 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4869 .Default(Register());
4870 if (!Reg)
4871 return Reg;
4872
4873 if (!Subtarget->hasFlatScrRegister() &&
4874 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4875 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4876 "\" for subtarget."));
4877 }
4878
4879 switch (Reg) {
4880 case AMDGPU::M0:
4881 case AMDGPU::EXEC_LO:
4882 case AMDGPU::EXEC_HI:
4883 case AMDGPU::FLAT_SCR_LO:
4884 case AMDGPU::FLAT_SCR_HI:
4885 if (VT.getSizeInBits() == 32)
4886 return Reg;
4887 break;
4888 case AMDGPU::EXEC:
4889 case AMDGPU::FLAT_SCR:
4890 if (VT.getSizeInBits() == 64)
4891 return Reg;
4892 break;
4893 default:
4894 llvm_unreachable("missing register type checking");
4895 }
4896
4898 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4899}
4900
4901// If kill is not the last instruction, split the block so kill is always a
4902// proper terminator.
4905 MachineBasicBlock *BB) const {
4906 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4908 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4909 return SplitBB;
4910}
4911
4912 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4913// \p MI will be the only instruction in the loop body block. Otherwise, it will
4914// be the first instruction in the remainder block.
4915//
4916/// \returns { LoopBody, Remainder }
4917static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4919 MachineFunction *MF = MBB.getParent();
4921
4922 // To insert the loop we need to split the block. Move everything after this
4923 // point to a new block, and insert a new empty block between the two.
4925 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4927 ++MBBI;
4928
4929 MF->insert(MBBI, LoopBB);
4930 MF->insert(MBBI, RemainderBB);
4931
4932 LoopBB->addSuccessor(LoopBB);
4933 LoopBB->addSuccessor(RemainderBB);
4934
4935 // Move the rest of the block into a new block.
4936 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4937
4938 if (InstInLoop) {
4939 auto Next = std::next(I);
4940
4941 // Move instruction to loop body.
4942 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4943
4944 // Move the rest of the block.
4945 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4946 } else {
4947 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4948 }
4949
4950 MBB.addSuccessor(LoopBB);
4951
4952 return std::pair(LoopBB, RemainderBB);
4953}
4954
4955/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4957 MachineBasicBlock *MBB = MI.getParent();
4959 auto I = MI.getIterator();
4960 auto E = std::next(I);
4961
4962 // clang-format off
4963 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4964 .addImm(0);
4965 // clang-format on
4966
4967 MIBundleBuilder Bundler(*MBB, I, E);
4968 finalizeBundle(*MBB, Bundler.begin());
4969}
4970
4973 MachineBasicBlock *BB) const {
4974 const DebugLoc &DL = MI.getDebugLoc();
4975
4977
4979
4980 // Apparently kill flags are only valid if the def is in the same block?
4981 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4982 Src->setIsKill(false);
4983
4984 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4985
4986 MachineBasicBlock::iterator I = LoopBB->end();
4987
4988 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4990
4991 // Clear TRAP_STS.MEM_VIOL
4992 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4993 .addImm(0)
4994 .addImm(EncodedReg);
4995
4997
4998 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4999
5000 // Load and check TRAP_STS.MEM_VIOL
5001 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5002 .addImm(EncodedReg);
5003
5004 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5005 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5006 .addReg(Reg, RegState::Kill)
5007 .addImm(0);
5008 // clang-format off
5009 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5010 .addMBB(LoopBB);
5011 // clang-format on
5012
5013 return RemainderBB;
5014}
5015
5016// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5017// wavefront. If the value is uniform and just happens to be in a VGPR, this
5018// will only do one iteration. In the worst case, this will loop 64 times.
5019//
5020// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
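// This is the usual "waterfall" loop: V_READFIRSTLANE_B32 picks one lane's index,
// V_CMP_EQ selects every lane holding that same index, those lanes run with the
// index in M0 (or in the returned SGPR index register), and are then cleared from
// EXEC until no lanes remain.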
5021 static MachineBasicBlock::iterator
5022 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5023 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5024 const DebugLoc &DL, const MachineOperand &Idx,
5025 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5026 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5027 Register &SGPRIdxReg) {
5028
5029 MachineFunction *MF = OrigBB.getParent();
5030 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5031 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5034
5035 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5036 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5037 Register NewExec = MRI.createVirtualRegister(BoolRC);
5038 Register CurrentIdxReg =
5039 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5040 Register CondReg = MRI.createVirtualRegister(BoolRC);
5041
5042 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5043 .addReg(InitReg)
5044 .addMBB(&OrigBB)
5045 .addReg(ResultReg)
5046 .addMBB(&LoopBB);
5047
5048 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5049 .addReg(InitSaveExecReg)
5050 .addMBB(&OrigBB)
5051 .addReg(NewExec)
5052 .addMBB(&LoopBB);
5053
5054 // Read the next variant <- also loop target.
5055 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5056 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5057
5058 // Compare the just read M0 value to all possible Idx values.
5059 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5060 .addReg(CurrentIdxReg)
5061 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5062
5063 // Update EXEC, save the original EXEC value to VCC.
5064 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5065 .addReg(CondReg, RegState::Kill);
5066
5067 MRI.setSimpleHint(NewExec, CondReg);
5068
5069 if (UseGPRIdxMode) {
5070 if (Offset == 0) {
5071 SGPRIdxReg = CurrentIdxReg;
5072 } else {
5073 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5074 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5075 .addReg(CurrentIdxReg, RegState::Kill)
5076 .addImm(Offset);
5077 }
5078 } else {
5079 // Move index from VCC into M0
5080 if (Offset == 0) {
5081 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5082 .addReg(CurrentIdxReg, RegState::Kill);
5083 } else {
5084 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5085 .addReg(CurrentIdxReg, RegState::Kill)
5086 .addImm(Offset);
5087 }
5088 }
5089
5090 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5091 MachineInstr *InsertPt =
5092 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5093 .addReg(LMC.ExecReg)
5094 .addReg(NewExec);
5095
5096 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5097 // s_cbranch_scc0?
5098
5099 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5100 // clang-format off
5101 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5102 .addMBB(&LoopBB);
5103 // clang-format on
5104
5105 return InsertPt->getIterator();
5106}
5107
5108 // This has slightly sub-optimal register allocation when the source vector is
5109 // killed by the read. The register allocator does not understand that the kill
5110 // is per-workitem, so the source is kept live for the whole loop and we end up
5111 // not reusing a subregister from it, using one more VGPR than necessary. The
5112 // extra VGPR was avoided back when this was expanded after register allocation.
5113 static MachineBasicBlock::iterator
5114 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5115 unsigned InitResultReg, unsigned PhiReg, int Offset,
5116 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5117 MachineFunction *MF = MBB.getParent();
5118 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5119 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5120 MachineRegisterInfo &MRI = MF->getRegInfo();
5121 const DebugLoc &DL = MI.getDebugLoc();
5122 MachineBasicBlock::iterator I(&MI);
5123
5124 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5125 Register DstReg = MI.getOperand(0).getReg();
5126 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5127 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5129
5130 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5131
5132 // Save the EXEC mask
5133 // clang-format off
5134 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5135 .addReg(LMC.ExecReg);
5136 // clang-format on
5137
5138 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5139
5140 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5141
5142 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5143 InitResultReg, DstReg, PhiReg, TmpExec,
5144 Offset, UseGPRIdxMode, SGPRIdxReg);
5145
5146 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5148 ++MBBI;
5149 MF->insert(MBBI, LandingPad);
5150 LoopBB->removeSuccessor(RemainderBB);
5151 LandingPad->addSuccessor(RemainderBB);
5152 LoopBB->addSuccessor(LandingPad);
5153 MachineBasicBlock::iterator First = LandingPad->begin();
5154 // clang-format off
5155 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5156 .addReg(SaveExec);
5157 // clang-format on
5158
5159 return InsPt;
5160}
5161
5162// Returns subreg index, offset
5163static std::pair<unsigned, int>
5165 const TargetRegisterClass *SuperRC, unsigned VecReg,
5166 int Offset) {
5167 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5168
5169 // Skip out of bounds offsets, or else we would end up using an undefined
5170 // register.
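// In that case the constant offset is left in the returned offset (to be folded
// into the dynamic index) and sub0 is addressed instead.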
5171 if (Offset >= NumElts || Offset < 0)
5172 return std::pair(AMDGPU::sub0, Offset);
5173
5174 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5175}
5176
5179 int Offset) {
5180 MachineBasicBlock *MBB = MI.getParent();
5181 const DebugLoc &DL = MI.getDebugLoc();
5182 MachineBasicBlock::iterator I(&MI);
5183
5184 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5185
5186 assert(Idx->getReg() != AMDGPU::NoRegister);
5187
5188 if (Offset == 0) {
5189 // clang-format off
5190 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5191 .add(*Idx);
5192 // clang-format on
5193 } else {
5194 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5195 .add(*Idx)
5196 .addImm(Offset);
5197 }
5198}
5199
5202 int Offset) {
5203 MachineBasicBlock *MBB = MI.getParent();
5204 const DebugLoc &DL = MI.getDebugLoc();
5205 MachineBasicBlock::iterator I(&MI);
5206
5207 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5208
5209 if (Offset == 0)
5210 return Idx->getReg();
5211
5212 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5213 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5214 .add(*Idx)
5215 .addImm(Offset);
5216 return Tmp;
5217}
5218
5219 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5220 MachineBasicBlock &MBB,
5221 const GCNSubtarget &ST) {
5222 const SIInstrInfo *TII = ST.getInstrInfo();
5223 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5224 MachineFunction *MF = MBB.getParent();
5225 MachineRegisterInfo &MRI = MF->getRegInfo();
5226
5227 Register Dst = MI.getOperand(0).getReg();
5228 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5229 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5230 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5231
5232 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5233 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5234
5235 unsigned SubReg;
5236 std::tie(SubReg, Offset) =
5237 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5238
5239 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5240
5241 // Check for a SGPR index.
5242 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5243 MachineBasicBlock::iterator I(&MI);
5244 const DebugLoc &DL = MI.getDebugLoc();
5245
5246 if (UseGPRIdxMode) {
5247 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5248 // to avoid interfering with other uses, so probably requires a new
5249 // optimization pass.
5250 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5251
5252 const MCInstrDesc &GPRIDXDesc =
5253 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5254 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5255 .addReg(SrcReg)
5256 .addReg(Idx)
5257 .addImm(SubReg);
5258 } else {
5259 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5260
5261 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5262 .addReg(SrcReg, 0, SubReg)
5263 .addReg(SrcReg, RegState::Implicit);
5264 }
5265
5266 MI.eraseFromParent();
5267
5268 return &MBB;
5269 }
5270
5271 // Control flow needs to be inserted if indexing with a VGPR.
5272 const DebugLoc &DL = MI.getDebugLoc();
5273 MachineBasicBlock::iterator I(&MI);
5274
5275 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5276 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5277
5278 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5279
5280 Register SGPRIdxReg;
5281 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5282 UseGPRIdxMode, SGPRIdxReg);
5283
5284 MachineBasicBlock *LoopBB = InsPt->getParent();
5285
5286 if (UseGPRIdxMode) {
5287 const MCInstrDesc &GPRIDXDesc =
5288 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5289
5290 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5291 .addReg(SrcReg)
5292 .addReg(SGPRIdxReg)
5293 .addImm(SubReg);
5294 } else {
5295 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5296 .addReg(SrcReg, 0, SubReg)
5297 .addReg(SrcReg, RegState::Implicit);
5298 }
5299
5300 MI.eraseFromParent();
5301
5302 return LoopBB;
5303}
5304
5305 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5306 MachineBasicBlock &MBB,
5307 const GCNSubtarget &ST) {
5308 const SIInstrInfo *TII = ST.getInstrInfo();
5309 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5310 MachineFunction *MF = MBB.getParent();
5311 MachineRegisterInfo &MRI = MF->getRegInfo();
5312
5313 Register Dst = MI.getOperand(0).getReg();
5314 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5315 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5316 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5317 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5318 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5319 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5320
5321 // This can be an immediate, but will be folded later.
5322 assert(Val->getReg());
5323
5324 unsigned SubReg;
5325 std::tie(SubReg, Offset) =
5326 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5327 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5328
5329 if (Idx->getReg() == AMDGPU::NoRegister) {
5330 MachineBasicBlock::iterator I(&MI);
5331 const DebugLoc &DL = MI.getDebugLoc();
5332
5333 assert(Offset == 0);
5334
5335 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5336 .add(*SrcVec)
5337 .add(*Val)
5338 .addImm(SubReg);
5339
5340 MI.eraseFromParent();
5341 return &MBB;
5342 }
5343
5344 // Check for a SGPR index.
5345 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5346 MachineBasicBlock::iterator I(&MI);
5347 const DebugLoc &DL = MI.getDebugLoc();
5348
5349 if (UseGPRIdxMode) {
5350 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5351
5352 const MCInstrDesc &GPRIDXDesc =
5353 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5354 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5355 .addReg(SrcVec->getReg())
5356 .add(*Val)
5357 .addReg(Idx)
5358 .addImm(SubReg);
5359 } else {
5360 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5361
5362 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5363 TRI.getRegSizeInBits(*VecRC), 32, false);
5364 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5365 .addReg(SrcVec->getReg())
5366 .add(*Val)
5367 .addImm(SubReg);
5368 }
5369 MI.eraseFromParent();
5370 return &MBB;
5371 }
5372
5373 // Control flow needs to be inserted if indexing with a VGPR.
5374 if (Val->isReg())
5375 MRI.clearKillFlags(Val->getReg());
5376
5377 const DebugLoc &DL = MI.getDebugLoc();
5378
5379 Register PhiReg = MRI.createVirtualRegister(VecRC);
5380
5381 Register SGPRIdxReg;
5382 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5383 UseGPRIdxMode, SGPRIdxReg);
5384 MachineBasicBlock *LoopBB = InsPt->getParent();
5385
5386 if (UseGPRIdxMode) {
5387 const MCInstrDesc &GPRIDXDesc =
5388 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5389
5390 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5391 .addReg(PhiReg)
5392 .add(*Val)
5393 .addReg(SGPRIdxReg)
5394 .addImm(SubReg);
5395 } else {
5396 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5397 TRI.getRegSizeInBits(*VecRC), 32, false);
5398 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5399 .addReg(PhiReg)
5400 .add(*Val)
5401 .addImm(SubReg);
5402 }
5403
5404 MI.eraseFromParent();
5405 return LoopBB;
5406}
5407
5408 static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5409 MachineBasicBlock *BB) {
5410 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5411 // For GFX12, we emit s_add_u64 and s_sub_u64.
5412 MachineFunction *MF = BB->getParent();
5413 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5414 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5415 MachineRegisterInfo &MRI = MF->getRegInfo();
5416 const DebugLoc &DL = MI.getDebugLoc();
5417 MachineOperand &Dest = MI.getOperand(0);
5418 MachineOperand &Src0 = MI.getOperand(1);
5419 MachineOperand &Src1 = MI.getOperand(2);
5420 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5421 if (ST.hasScalarAddSub64()) {
5422 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5423 // clang-format off
5424 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5425 .add(Src0)
5426 .add(Src1);
5427 // clang-format on
5428 } else {
5429 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5430 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5431
5432 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5433 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434
5435 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5436 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5437 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5438 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5439
5440 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5441 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5442 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5443 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5444
5445 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5446 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
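// The high half consumes the carry/borrow left in SCC by the low-half
// S_ADD_U32 / S_SUB_U32.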
5447 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5448 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5449 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5450 .addReg(DestSub0)
5451 .addImm(AMDGPU::sub0)
5452 .addReg(DestSub1)
5453 .addImm(AMDGPU::sub1);
5454 }
5455 MI.eraseFromParent();
5456 return BB;
5457}
5458
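// Identity (neutral) element for each wave reduction, i.e. the value that leaves
// the reduction result unchanged: ~0 for unsigned min and AND, 0 for unsigned max,
// add, sub, or and xor, and the corresponding extreme values for signed min/max.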
5459 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5460 switch (Opc) {
5461 case AMDGPU::S_MIN_U32:
5462 return std::numeric_limits<uint32_t>::max();
5463 case AMDGPU::S_MIN_I32:
5464 return std::numeric_limits<int32_t>::max();
5465 case AMDGPU::S_MAX_U32:
5466 return std::numeric_limits<uint32_t>::min();
5467 case AMDGPU::S_MAX_I32:
5468 return std::numeric_limits<int32_t>::min();
5469 case AMDGPU::S_ADD_I32:
5470 case AMDGPU::S_SUB_I32:
5471 case AMDGPU::S_OR_B32:
5472 case AMDGPU::S_XOR_B32:
5473 return std::numeric_limits<uint32_t>::min();
5474 case AMDGPU::S_AND_B32:
5475 return std::numeric_limits<uint32_t>::max();
5476 default:
5477 llvm_unreachable(
5478 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5479 }
5480}
5481
5482 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5483 switch (Opc) {
5484 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5485 return std::numeric_limits<uint64_t>::max();
5486 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5487 return std::numeric_limits<int64_t>::max();
5488 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5489 return std::numeric_limits<uint64_t>::min();
5490 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5491 return std::numeric_limits<int64_t>::min();
5492 case AMDGPU::S_ADD_U64_PSEUDO:
5493 case AMDGPU::S_SUB_U64_PSEUDO:
5494 case AMDGPU::S_OR_B64:
5495 case AMDGPU::S_XOR_B64:
5496 return std::numeric_limits<uint64_t>::min();
5497 case AMDGPU::S_AND_B64:
5498 return std::numeric_limits<uint64_t>::max();
5499 default:
5500 llvm_unreachable(
5501 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5502 }
5503}
5504
5505static bool is32bitWaveReduceOperation(unsigned Opc) {
5506 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5507 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5508 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5509 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5510 Opc == AMDGPU::S_XOR_B32;
5511}
5512
5513 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5514 MachineBasicBlock &BB,
5515 const GCNSubtarget &ST,
5516 unsigned Opc) {
5517 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5518 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5519 const DebugLoc &DL = MI.getDebugLoc();
5520 const SIInstrInfo *TII = ST.getInstrInfo();
5521
5522 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5523 Register SrcReg = MI.getOperand(1).getReg();
5524 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5525 Register DstReg = MI.getOperand(0).getReg();
5526 MachineBasicBlock *RetBB = nullptr;
5527 if (isSGPR) {
5528 switch (Opc) {
5529 case AMDGPU::S_MIN_U32:
5530 case AMDGPU::S_MIN_I32:
5531 case AMDGPU::S_MAX_U32:
5532 case AMDGPU::S_MAX_I32:
5533 case AMDGPU::S_AND_B32:
5534 case AMDGPU::S_OR_B32: {
5535 // Idempotent operations.
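// With a uniform (SGPR) input these reduce to the value itself: min/max/and/or
// of x with x over any number of lanes is still x.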
5536 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5537 RetBB = &BB;
5538 break;
5539 }
5540 case AMDGPU::V_CMP_LT_U64_e64: // umin
5541 case AMDGPU::V_CMP_LT_I64_e64: // min
5542 case AMDGPU::V_CMP_GT_U64_e64: // umax
5543 case AMDGPU::V_CMP_GT_I64_e64: // max
5544 case AMDGPU::S_AND_B64:
5545 case AMDGPU::S_OR_B64: {
5546 // Idempotent operations.
5547 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5548 RetBB = &BB;
5549 break;
5550 }
5551 case AMDGPU::S_XOR_B32:
5552 case AMDGPU::S_XOR_B64:
5553 case AMDGPU::S_ADD_I32:
5554 case AMDGPU::S_ADD_U64_PSEUDO:
5555 case AMDGPU::S_SUB_I32:
5556 case AMDGPU::S_SUB_U64_PSEUDO: {
5557 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5558 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5559 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5560 Register NumActiveLanes =
5561 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562
5563 bool IsWave32 = ST.isWave32();
5564 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5565 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5566 unsigned BitCountOpc =
5567 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5568
5569 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5570
5571 auto NewAccumulator =
5572 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5573 .addReg(ExecMask);
5574
5575 switch (Opc) {
5576 case AMDGPU::S_XOR_B32:
5577 case AMDGPU::S_XOR_B64: {
5578 // Performing an XOR operation on a uniform value
5579 // depends on the parity of the number of active lanes.
5580 // For even parity, the result will be 0, for odd
5581 // parity the result will be the same as the input value.
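// For example, over five active lanes x^x^x^x^x == x, while over four lanes the
// pairs cancel and the result is 0; the S_AND_B32 with 1 below extracts exactly
// that parity bit.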
5582 Register ParityRegister =
5583 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5584
5585 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5586 .addReg(NewAccumulator->getOperand(0).getReg())
5587 .addImm(1)
5588 .setOperandDead(3); // Dead scc
5589 if (Opc == AMDGPU::S_XOR_B32) {
5590 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5591 .addReg(SrcReg)
5592 .addReg(ParityRegister);
5593 } else {
5594 Register DestSub0 =
5595 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5596 Register DestSub1 =
5597 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5598
5599 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5600 const TargetRegisterClass *SrcSubRC =
5601 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5602
5603 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5604 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5605 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5606 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5607
5608 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5609 .add(Op1L)
5610 .addReg(ParityRegister);
5611
5612 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5613 .add(Op1H)
5614 .addReg(ParityRegister);
5615
5616 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5617 .addReg(DestSub0)
5618 .addImm(AMDGPU::sub0)
5619 .addReg(DestSub1)
5620 .addImm(AMDGPU::sub1);
5621 }
5622 break;
5623 }
5624 case AMDGPU::S_SUB_I32: {
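// A uniform sub-reduction over N active lanes is -(N * x), so negate the input
// and scale it by the active-lane count.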
5625 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5626
5627 // Take the negation of the source operand.
5628 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5629 .addImm(0)
5630 .addReg(SrcReg);
5631 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5632 .addReg(NegatedVal)
5633 .addReg(NewAccumulator->getOperand(0).getReg());
5634 break;
5635 }
5636 case AMDGPU::S_ADD_I32: {
5637 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5638 .addReg(SrcReg)
5639 .addReg(NewAccumulator->getOperand(0).getReg());
5640 break;
5641 }
5642 case AMDGPU::S_ADD_U64_PSEUDO:
5643 case AMDGPU::S_SUB_U64_PSEUDO: {
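// A uniform 64-bit add/sub reduction multiplies the 64-bit input by the
// (possibly negated and sign-extended) active-lane count, built out of 32-bit
// S_MUL_I32 / S_MUL_HI_U32 pieces and recombined with REG_SEQUENCE.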
5644 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5645 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 Register Op1H_Op0L_Reg =
5647 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5648 Register Op1L_Op0H_Reg =
5649 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 Register NegatedValLo =
5653 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5654 Register NegatedValHi =
5655 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5656
5657 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5658 const TargetRegisterClass *Src1SubRC =
5659 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5660
5661 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5662 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5663 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5664 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5665
5666 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5667 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5668 .addImm(0)
5669 .addReg(NewAccumulator->getOperand(0).getReg())
5670 .setOperandDead(3); // Dead scc
5671 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5672 .addReg(NegatedValLo)
5673 .addImm(31)
5674 .setOperandDead(3); // Dead scc
5675 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5676 .add(Op1L)
5677 .addReg(NegatedValHi);
5678 }
5679 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5680 ? NegatedValLo
5681 : NewAccumulator->getOperand(0).getReg();
5682 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5683 .add(Op1L)
5684 .addReg(LowOpcode);
5685 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5686 .add(Op1L)
5687 .addReg(LowOpcode);
5688 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5689 .add(Op1H)
5690 .addReg(LowOpcode);
5691
5692 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5693 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5694 .addReg(CarryReg)
5695 .addReg(Op1H_Op0L_Reg)
5696 .setOperandDead(3); // Dead scc
5697
5698 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5699 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5700 .addReg(HiVal)
5701 .addReg(Op1L_Op0H_Reg)
5702 .setOperandDead(3); // Dead scc
5703 }
5704 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5705 .addReg(DestSub0)
5706 .addImm(AMDGPU::sub0)
5707 .addReg(DestSub1)
5708 .addImm(AMDGPU::sub1);
5709 break;
5710 }
5711 }
5712 RetBB = &BB;
5713 }
5714 }
5715 } else {
5716 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5717 // operand. For now, for all the cases (default, Iterative and DPP) we use
5718 // the iterative approach by default.
5719
5720 // To reduce the VGPR using the iterative approach, we need to iterate
5721 // over all the active lanes. Lowering consists of a ComputeLoop,
5722 // which iterates over only the active lanes. We use a copy of the EXEC
5723 // register as the induction variable; every active lane clears its bit
5724 // with bitset0 so that we get the next active lane on the next iteration.
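// Roughly, the emitted loop is:
//   ActiveBits = EXEC; Accum = Identity;
//   do {
//     Lane = ff1(ActiveBits);
//     Accum = Op(Accum, readlane(Src, Lane));
//     ActiveBits = bitset0(ActiveBits, Lane);
//   } while (ActiveBits != 0);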
5725 MachineBasicBlock::iterator I = BB.end();
5726 Register SrcReg = MI.getOperand(1).getReg();
5727 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5728
5729 // Create Control flow for loop
5730 // Split MI's Machine Basic block into For loop
5731 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5732
5733 // Create virtual registers required for lowering.
5734 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5735 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5736 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5737 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5738 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5739 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5740 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5741 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5742 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5743
5744 bool IsWave32 = ST.isWave32();
5745 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5746 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5747
5748 // Create initial values of induction variable from Exec, Accumulator and
5749 // insert branch instr to newly created ComputeBlock
5750 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5751 if (is32BitOpc) {
5752 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5753 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5754 .addImm(IdentityValue);
5755 } else {
5756 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5757 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5758 .addImm(IdentityValue);
5759 }
5760 // clang-format off
5761 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5762 .addMBB(ComputeLoop);
5763 // clang-format on
5764
5765 // Start constructing ComputeLoop
5766 I = ComputeLoop->begin();
5767 auto Accumulator =
5768 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5769 .addReg(IdentityValReg)
5770 .addMBB(&BB);
5771 auto ActiveBits =
5772 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5773 .addReg(LoopIterator)
5774 .addMBB(&BB);
5775
5776 I = ComputeLoop->end();
5777 MachineInstr *NewAccumulator;
5778 // Perform the computations
5779 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5780 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5781 .addReg(ActiveBitsReg);
5782 if (is32BitOpc) {
5783 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5784 LaneValueReg)
5785 .addReg(SrcReg)
5786 .addReg(FF1Reg);
5787 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5788 .addReg(Accumulator->getOperand(0).getReg())
5789 .addReg(LaneValueReg);
5790 } else {
5791 Register LaneValueLoReg =
5792 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5793 Register LaneValueHiReg =
5794 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5795 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5796 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5797 const TargetRegisterClass *SrcSubRC =
5798 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5799 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5800 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5801 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5802 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5803 // lane value input should be in an sgpr
5804 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5805 LaneValueLoReg)
5806 .add(Op1L)
5807 .addReg(FF1Reg);
5808 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5809 LaneValueHiReg)
5810 .add(Op1H)
5811 .addReg(FF1Reg);
5812 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5813 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5814 .addReg(LaneValueLoReg)
5815 .addImm(AMDGPU::sub0)
5816 .addReg(LaneValueHiReg)
5817 .addImm(AMDGPU::sub1);
5818 switch (Opc) {
5819 case AMDGPU::S_OR_B64:
5820 case AMDGPU::S_AND_B64:
5821 case AMDGPU::S_XOR_B64: {
5822 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5823 .addReg(Accumulator->getOperand(0).getReg())
5824 .addReg(LaneValue->getOperand(0).getReg())
5825 .setOperandDead(3); // Dead scc
5826 break;
5827 }
5828 case AMDGPU::V_CMP_GT_I64_e64:
5829 case AMDGPU::V_CMP_GT_U64_e64:
5830 case AMDGPU::V_CMP_LT_I64_e64:
5831 case AMDGPU::V_CMP_LT_U64_e64: {
5832 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5833 Register ComparisonResultReg =
5834 MRI.createVirtualRegister(WaveMaskRegClass);
5835 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5836 const TargetRegisterClass *VSubRegClass =
5837 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5838 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5839 MachineOperand SrcReg0Sub0 =
5840 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5841 VregClass, AMDGPU::sub0, VSubRegClass);
5842 MachineOperand SrcReg0Sub1 =
5843 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5844 VregClass, AMDGPU::sub1, VSubRegClass);
5845 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5846 AccumulatorVReg)
5847 .add(SrcReg0Sub0)
5848 .addImm(AMDGPU::sub0)
5849 .add(SrcReg0Sub1)
5850 .addImm(AMDGPU::sub1);
5851 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5852 .addReg(LaneValue->getOperand(0).getReg())
5853 .addReg(AccumulatorVReg);
5854
5855 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5856 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5857 .addReg(LaneMaskReg)
5858 .addReg(ActiveBitsReg);
5859
5860 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5861 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5862 .addReg(LaneValue->getOperand(0).getReg())
5863 .addReg(Accumulator->getOperand(0).getReg());
5864 break;
5865 }
5866 case AMDGPU::S_ADD_U64_PSEUDO:
5867 case AMDGPU::S_SUB_U64_PSEUDO: {
5868 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5869 .addReg(Accumulator->getOperand(0).getReg())
5870 .addReg(LaneValue->getOperand(0).getReg());
5871 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5872 break;
5873 }
5874 }
5875 }
5876 // Manipulate the iterator to get the next active lane
5877 unsigned BITSETOpc =
5878 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5879 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5880 .addReg(FF1Reg)
5881 .addReg(ActiveBitsReg);
5882
5883 // Add phi nodes
5884 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5885 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5886
5887 // Creating branching
5888 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5889 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5890 .addReg(NewActiveBitsReg)
5891 .addImm(0);
5892 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5893 .addMBB(ComputeLoop);
5894
5895 RetBB = ComputeEnd;
5896 }
5897 MI.eraseFromParent();
5898 return RetBB;
5899}
5900
5901 MachineBasicBlock *
5902 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5903 MachineBasicBlock *BB) const {
5904 MachineFunction *MF = BB->getParent();
5905 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5906 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5907 const SIInstrInfo *TII = Subtarget->getInstrInfo();
5908 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5909 MachineRegisterInfo &MRI = MF->getRegInfo();
5910 const DebugLoc &DL = MI.getDebugLoc();
5911
5912 switch (MI.getOpcode()) {
5913 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5914 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5915 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5916 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5917 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5918 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5919 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5920 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5921 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5923 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5925 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5927 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5929 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5931 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5933 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5935 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5937 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5939 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5941 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5943 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5945 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5946 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5947 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5948 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5949 case AMDGPU::S_UADDO_PSEUDO:
5950 case AMDGPU::S_USUBO_PSEUDO: {
5951 MachineOperand &Dest0 = MI.getOperand(0);
5952 MachineOperand &Dest1 = MI.getOperand(1);
5953 MachineOperand &Src0 = MI.getOperand(2);
5954 MachineOperand &Src1 = MI.getOperand(3);
5955
5956 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5957 ? AMDGPU::S_ADD_U32
5958 : AMDGPU::S_SUB_U32;
5959 // clang-format off
5960 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5961 .add(Src0)
5962 .add(Src1);
5963 // clang-format on
5964
5965 unsigned SelOpc =
5966 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5967 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
5968
5969 MI.eraseFromParent();
5970 return BB;
5971 }
5972 case AMDGPU::S_ADD_U64_PSEUDO:
5973 case AMDGPU::S_SUB_U64_PSEUDO: {
5974 return Expand64BitScalarArithmetic(MI, BB);
5975 }
5976 case AMDGPU::V_ADD_U64_PSEUDO:
5977 case AMDGPU::V_SUB_U64_PSEUDO: {
5978 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5979
5980 MachineOperand &Dest = MI.getOperand(0);
5981 MachineOperand &Src0 = MI.getOperand(1);
5982 MachineOperand &Src1 = MI.getOperand(2);
5983
5984 if (ST.hasAddSubU64Insts()) {
5985 auto I = BuildMI(*BB, MI, DL,
5986 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5987 : AMDGPU::V_SUB_U64_e64),
5988 Dest.getReg())
5989 .add(Src0)
5990 .add(Src1)
5991 .addImm(0); // clamp
5992 TII->legalizeOperands(*I);
5993 MI.eraseFromParent();
5994 return BB;
5995 }
5996
5997 if (IsAdd && ST.hasLshlAddU64Inst()) {
5998 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5999 Dest.getReg())
6000 .add(Src0)
6001 .addImm(0)
6002 .add(Src1);
6003 TII->legalizeOperands(*Add);
6004 MI.eraseFromParent();
6005 return BB;
6006 }
6007
6008 const auto *CarryRC = TRI->getWaveMaskRegClass();
6009
6010 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6011 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6012
6013 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6014 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6015
6016 const TargetRegisterClass *Src0RC = Src0.isReg()
6017 ? MRI.getRegClass(Src0.getReg())
6018 : &AMDGPU::VReg_64RegClass;
6019 const TargetRegisterClass *Src1RC = Src1.isReg()
6020 ? MRI.getRegClass(Src1.getReg())
6021 : &AMDGPU::VReg_64RegClass;
6022
6023 const TargetRegisterClass *Src0SubRC =
6024 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6025 const TargetRegisterClass *Src1SubRC =
6026 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6027
6028 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6029 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6030 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6031 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6032
6033 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6034 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6035 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6036 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6037
6038 unsigned LoOpc =
6039 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6040 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6041 .addReg(CarryReg, RegState::Define)
6042 .add(SrcReg0Sub0)
6043 .add(SrcReg1Sub0)
6044 .addImm(0); // clamp bit
6045
6046 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6047 MachineInstr *HiHalf =
6048 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6049 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6050 .add(SrcReg0Sub1)
6051 .add(SrcReg1Sub1)
6052 .addReg(CarryReg, RegState::Kill)
6053 .addImm(0); // clamp bit
6054
6055 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6056 .addReg(DestSub0)
6057 .addImm(AMDGPU::sub0)
6058 .addReg(DestSub1)
6059 .addImm(AMDGPU::sub1);
6060 TII->legalizeOperands(*LoHalf);
6061 TII->legalizeOperands(*HiHalf);
6062 MI.eraseFromParent();
6063 return BB;
6064 }
6065 case AMDGPU::S_ADD_CO_PSEUDO:
6066 case AMDGPU::S_SUB_CO_PSEUDO: {
6067 // This pseudo has a chance to be selected
6068 // only from a uniform add/subcarry node. All the VGPR operands are
6069 // therefore assumed to be splat vectors.
6070 MachineBasicBlock::iterator MII = MI;
6071 MachineOperand &Dest = MI.getOperand(0);
6072 MachineOperand &CarryDest = MI.getOperand(1);
6073 MachineOperand &Src0 = MI.getOperand(2);
6074 MachineOperand &Src1 = MI.getOperand(3);
6075 MachineOperand &Src2 = MI.getOperand(4);
6076 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6077 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6078 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6079 .addReg(Src0.getReg());
6080 Src0.setReg(RegOp0);
6081 }
6082 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6083 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6084 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6085 .addReg(Src1.getReg());
6086 Src1.setReg(RegOp1);
6087 }
6088 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6089 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6090 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6091 .addReg(Src2.getReg());
6092 Src2.setReg(RegOp2);
6093 }
6094
6095 if (ST.isWave64()) {
6096 if (ST.hasScalarCompareEq64()) {
6097 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6098 .addReg(Src2.getReg())
6099 .addImm(0);
6100 } else {
6101 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6102 const TargetRegisterClass *SubRC =
6103 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6104 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6105 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6106 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6107 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6108 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6109
6110 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6111 .add(Src2Sub0)
6112 .add(Src2Sub1);
6113
6114 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6115 .addReg(Src2_32, RegState::Kill)
6116 .addImm(0);
6117 }
6118 } else {
6119 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6120 .addReg(Src2.getReg())
6121 .addImm(0);
6122 }
6123
6124 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6125 ? AMDGPU::S_ADDC_U32
6126 : AMDGPU::S_SUBB_U32;
6127
6128 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6129
6130 unsigned SelOpc =
6131 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6132
6133 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6134 .addImm(-1)
6135 .addImm(0);
6136
6137 MI.eraseFromParent();
6138 return BB;
6139 }
6140 case AMDGPU::SI_INIT_M0: {
6141 MachineOperand &M0Init = MI.getOperand(0);
6142 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6143 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6144 AMDGPU::M0)
6145 .add(M0Init);
6146 MI.eraseFromParent();
6147 return BB;
6148 }
6149 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6150 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6151 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6152 TII->get(AMDGPU::S_CMP_EQ_U32))
6153 .addImm(0)
6154 .addImm(0);
6155 return BB;
6156 }
6157 case AMDGPU::GET_GROUPSTATICSIZE: {
6158 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6159 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6160 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6161 .add(MI.getOperand(0))
6162 .addImm(MFI->getLDSSize());
6163 MI.eraseFromParent();
6164 return BB;
6165 }
6166 case AMDGPU::GET_SHADERCYCLESHILO: {
6168 // The algorithm is:
6169 //
6170 // hi1 = getreg(SHADER_CYCLES_HI)
6171 // lo1 = getreg(SHADER_CYCLES_LO)
6172 // hi2 = getreg(SHADER_CYCLES_HI)
6173 //
6174 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6175 // Otherwise there was overflow and the result is hi2:0. In both cases the
6176 // result should represent the actual time at some point during the sequence
6177 // of three getregs.
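// For example, if SHADER_CYCLES_LO wrapped around between the two HI reads, hi1
// and hi2 differ and hi2:0 is returned, which still lies between the first and
// last read of the sequence.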
6178 using namespace AMDGPU::Hwreg;
6179 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6180 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6181 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6182 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6183 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6184 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6185 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6186 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6187 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6188 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6189 .addReg(RegHi1)
6190 .addReg(RegHi2);
6191 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6192 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6193 .addReg(RegLo1)
6194 .addImm(0);
6195 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6196 .add(MI.getOperand(0))
6197 .addReg(RegLo)
6198 .addImm(AMDGPU::sub0)
6199 .addReg(RegHi2)
6200 .addImm(AMDGPU::sub1);
6201 MI.eraseFromParent();
6202 return BB;
6203 }
6204 case AMDGPU::SI_INDIRECT_SRC_V1:
6205 case AMDGPU::SI_INDIRECT_SRC_V2:
6206 case AMDGPU::SI_INDIRECT_SRC_V4:
6207 case AMDGPU::SI_INDIRECT_SRC_V8:
6208 case AMDGPU::SI_INDIRECT_SRC_V9:
6209 case AMDGPU::SI_INDIRECT_SRC_V10:
6210 case AMDGPU::SI_INDIRECT_SRC_V11:
6211 case AMDGPU::SI_INDIRECT_SRC_V12:
6212 case AMDGPU::SI_INDIRECT_SRC_V16:
6213 case AMDGPU::SI_INDIRECT_SRC_V32:
6214 return emitIndirectSrc(MI, *BB, *getSubtarget());
6215 case AMDGPU::SI_INDIRECT_DST_V1:
6216 case AMDGPU::SI_INDIRECT_DST_V2:
6217 case AMDGPU::SI_INDIRECT_DST_V4:
6218 case AMDGPU::SI_INDIRECT_DST_V8:
6219 case AMDGPU::SI_INDIRECT_DST_V9:
6220 case AMDGPU::SI_INDIRECT_DST_V10:
6221 case AMDGPU::SI_INDIRECT_DST_V11:
6222 case AMDGPU::SI_INDIRECT_DST_V12:
6223 case AMDGPU::SI_INDIRECT_DST_V16:
6224 case AMDGPU::SI_INDIRECT_DST_V32:
6225 return emitIndirectDst(MI, *BB, *getSubtarget());
6226 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6227 case AMDGPU::SI_KILL_I1_PSEUDO:
6228 return splitKillBlock(MI, BB);
6229 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6230 Register Dst = MI.getOperand(0).getReg();
6231 const MachineOperand &Src0 = MI.getOperand(1);
6232 const MachineOperand &Src1 = MI.getOperand(2);
6233 Register SrcCond = MI.getOperand(3).getReg();
6234
6235 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6236 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6237 const auto *CondRC = TRI->getWaveMaskRegClass();
6238 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6239
6240 const TargetRegisterClass *Src0RC = Src0.isReg()
6241 ? MRI.getRegClass(Src0.getReg())
6242 : &AMDGPU::VReg_64RegClass;
6243 const TargetRegisterClass *Src1RC = Src1.isReg()
6244 ? MRI.getRegClass(Src1.getReg())
6245 : &AMDGPU::VReg_64RegClass;
6246
6247 const TargetRegisterClass *Src0SubRC =
6248 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6249 const TargetRegisterClass *Src1SubRC =
6250 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6251
6252 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6253 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6254 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6255 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6256
6257 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6258 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6259 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6260 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6261
6262 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6263 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6264 .addImm(0)
6265 .add(Src0Sub0)
6266 .addImm(0)
6267 .add(Src1Sub0)
6268 .addReg(SrcCondCopy);
6269 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6270 .addImm(0)
6271 .add(Src0Sub1)
6272 .addImm(0)
6273 .add(Src1Sub1)
6274 .addReg(SrcCondCopy);
6275
6276 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6277 .addReg(DstLo)
6278 .addImm(AMDGPU::sub0)
6279 .addReg(DstHi)
6280 .addImm(AMDGPU::sub1);
6281 MI.eraseFromParent();
6282 return BB;
6283 }
6284 case AMDGPU::SI_BR_UNDEF: {
6285 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6286 .add(MI.getOperand(0));
6287 Br->getOperand(1).setIsUndef(); // read undef SCC
6288 MI.eraseFromParent();
6289 return BB;
6290 }
6291 case AMDGPU::ADJCALLSTACKUP:
6292 case AMDGPU::ADJCALLSTACKDOWN: {
6293 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6294 MachineInstrBuilder MIB(*MF, &MI);
6295 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6296 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6297 return BB;
6298 }
6299 case AMDGPU::SI_CALL_ISEL: {
6300 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6301
6302 MachineInstrBuilder MIB;
6303 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6304
6305 for (const MachineOperand &MO : MI.operands())
6306 MIB.add(MO);
6307
6308 MIB.cloneMemRefs(MI);
6309 MI.eraseFromParent();
6310 return BB;
6311 }
6312 case AMDGPU::V_ADD_CO_U32_e32:
6313 case AMDGPU::V_SUB_CO_U32_e32:
6314 case AMDGPU::V_SUBREV_CO_U32_e32: {
6315 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6316 unsigned Opc = MI.getOpcode();
6317
6318 bool NeedClampOperand = false;
6319 if (TII->pseudoToMCOpcode(Opc) == -1) {
6320 Opc = AMDGPU::getVOPe64(Opc);
6321 NeedClampOperand = true;
6322 }
6323
6324 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6325 if (TII->isVOP3(*I)) {
6326 I.addReg(TRI->getVCC(), RegState::Define);
6327 }
6328 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6329 if (NeedClampOperand)
6330 I.addImm(0); // clamp bit for e64 encoding
6331
6332 TII->legalizeOperands(*I);
6333
6334 MI.eraseFromParent();
6335 return BB;
6336 }
6337 case AMDGPU::V_ADDC_U32_e32:
6338 case AMDGPU::V_SUBB_U32_e32:
6339 case AMDGPU::V_SUBBREV_U32_e32:
6340 // These instructions have an implicit use of vcc which counts towards the
6341 // constant bus limit.
6342 TII->legalizeOperands(MI);
6343 return BB;
6344 case AMDGPU::DS_GWS_INIT:
6345 case AMDGPU::DS_GWS_SEMA_BR:
6346 case AMDGPU::DS_GWS_BARRIER:
6347 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6348 [[fallthrough]];
6349 case AMDGPU::DS_GWS_SEMA_V:
6350 case AMDGPU::DS_GWS_SEMA_P:
6351 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6352 // A s_waitcnt 0 is required to be the instruction immediately following.
6353 if (getSubtarget()->hasGWSAutoReplay()) {
6354 bundleInstWithWaitcnt(MI);
6355 return BB;
6356 }
6357
6358 return emitGWSMemViolTestLoop(MI, BB);
6359 case AMDGPU::S_SETREG_B32: {
6360 // Try to optimize cases that only set the denormal mode or rounding mode.
6361 //
6362 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6363 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6364 // instead.
6365 //
6366 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6367 // allow you to have a no-side-effect instruction in the output of a
6368 // side-effecting pattern.
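// When both modes are set at once, the source immediate packs the round bits in
// [3:0] and the denorm bits in [7:4], which is why the value is masked with 0xf
// and then shifted right by 4 below.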
6369 auto [ID, Offset, Width] =
6370 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6371 if (ID != AMDGPU::Hwreg::ID_MODE)
6372 return BB;
6373
6374 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6375 const unsigned SetMask = WidthMask << Offset;
6376
6377 if (getSubtarget()->hasDenormModeInst()) {
6378 unsigned SetDenormOp = 0;
6379 unsigned SetRoundOp = 0;
6380
6381 // The dedicated instructions can only set the whole denorm or round mode
6382 // at once, not a subset of bits in either.
6383 if (SetMask ==
6384 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6385 // If this fully sets both the round and denorm mode, emit the two
6386 // dedicated instructions for these.
6387 SetRoundOp = AMDGPU::S_ROUND_MODE;
6388 SetDenormOp = AMDGPU::S_DENORM_MODE;
6389 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6390 SetRoundOp = AMDGPU::S_ROUND_MODE;
6391 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6392 SetDenormOp = AMDGPU::S_DENORM_MODE;
6393 }
6394
6395 if (SetRoundOp || SetDenormOp) {
6396 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6397 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6398 unsigned ImmVal = Def->getOperand(1).getImm();
6399 if (SetRoundOp) {
6400 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6401 .addImm(ImmVal & 0xf);
6402
6403 // If we also have the denorm mode, get just the denorm mode bits.
6404 ImmVal >>= 4;
6405 }
6406
6407 if (SetDenormOp) {
6408 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6409 .addImm(ImmVal & 0xf);
6410 }
6411
6412 MI.eraseFromParent();
6413 return BB;
6414 }
6415 }
6416 }
6417
6418 // If only FP bits are touched, use the no-side-effects pseudo.
6419 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6420 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6421 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6422
6423 return BB;
6424 }
6425 case AMDGPU::S_INVERSE_BALLOT_U32:
6426 case AMDGPU::S_INVERSE_BALLOT_U64:
6427 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6428 // necessary. After that they are equivalent to a COPY.
6429 MI.setDesc(TII->get(AMDGPU::COPY));
6430 return BB;
6431 case AMDGPU::ENDPGM_TRAP: {
6432 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6433 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6434 MI.addOperand(MachineOperand::CreateImm(0));
6435 return BB;
6436 }
6437
6438 // We need a block split to make the real endpgm a terminator. We also don't
6439 // want to break phis in successor blocks, so we can't just delete to the
6440 // end of the block.
6441
6442 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6443 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6444 MF->push_back(TrapBB);
6445 // clang-format off
6446 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6447 .addImm(0);
6448 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6449 .addMBB(TrapBB);
6450 // clang-format on
6451
6452 BB->addSuccessor(TrapBB);
6453 MI.eraseFromParent();
6454 return SplitBB;
6455 }
6456 case AMDGPU::SIMULATED_TRAP: {
6457 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6458 MachineBasicBlock *SplitBB =
6459 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6460 MI.eraseFromParent();
6461 return SplitBB;
6462 }
6463 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6464 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6466
6467 // During ISel, it's difficult to propagate the original EXEC mask to use as
6468 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6469 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6470 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6471 Register OriginalExec = Setup->getOperand(0).getReg();
6472 MF->getRegInfo().clearKillFlags(OriginalExec);
6473 MI.getOperand(0).setReg(OriginalExec);
6474 return BB;
6475 }
6476 default:
6477 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6478 if (!MI.mayStore())
6479 AddMemOpInit(MI);
6480 return BB;
6481 }
6482 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6483 }
6484}
6485
6486 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6487 // This currently forces unfolding various combinations of fsub into fma with
6488 // free fneg'd operands. As long as we have fast FMA (controlled by
6489 // isFMAFasterThanFMulAndFAdd), we should perform these.
6490
6491 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6492 // most of these combines appear to be cycle neutral but save on instruction
6493 // count / code size.
6494 return true;
6495}
6496
6498
6500 EVT VT) const {
6501 if (!VT.isVector()) {
6502 return MVT::i1;
6503 }
6504 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6505}
6506
6508 // TODO: Should i16 be used always if legal? For now it would force VALU
6509 // shifts.
6510 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6511}
6512
6514 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6515 ? Ty.changeElementSize(16)
6516 : Ty.changeElementSize(32);
6517}
6518
6519 // Answering this is somewhat tricky and depends on the specific device, since
6520 // devices have different rates for fma and for f64 operations in general.
6521//
6522// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6523// regardless of which device (although the number of cycles differs between
6524// devices), so it is always profitable for f64.
6525//
6526// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6527// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6528// which we can always do even without fused FP ops since it returns the same
6529// result as the separate operations and since it is always full
6530// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6531// however does not support denormals, so we do report fma as faster if we have
6532// a fast fma device and require denormals.
6533//
6534 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6535 EVT VT) const {
6536 VT = VT.getScalarType();
6537
6538 switch (VT.getSimpleVT().SimpleTy) {
6539 case MVT::f32: {
6540 // If mad is not available this depends only on if f32 fma is full rate.
6541 if (!Subtarget->hasMadMacF32Insts())
6542 return Subtarget->hasFastFMAF32();
6543
6544 // Otherwise f32 mad is always full rate and returns the same result as
6545 // the separate operations, so it should be preferred over fma.
6546 // However, it does not support denormals.
6547 if (!denormalModeIsFlushAllF32(MF))
6548 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6549
6550 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6551 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6552 }
6553 case MVT::f64:
6554 return true;
6555 case MVT::f16:
6556 case MVT::bf16:
6557 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6558 default:
6559 break;
6560 }
6561
6562 return false;
6563}
6564
6565 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6566 LLT Ty) const {
6567 switch (Ty.getScalarSizeInBits()) {
6568 case 16:
6569 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6570 case 32:
6571 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6572 case 64:
6573 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6574 default:
6575 break;
6576 }
6577
6578 return false;
6579}
6580
6582 if (!Ty.isScalar())
6583 return false;
6584
6585 if (Ty.getScalarSizeInBits() == 16)
6586 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6587 if (Ty.getScalarSizeInBits() == 32)
6588 return Subtarget->hasMadMacF32Insts() &&
6589 denormalModeIsFlushAllF32(*MI.getMF());
6590
6591 return false;
6592}
6593
6594 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6595 const SDNode *N) const {
6596 // TODO: Check future ftz flag
6597 // v_mad_f32/v_mac_f32 do not support denormals.
6598 EVT VT = N->getValueType(0);
6599 if (VT == MVT::f32)
6600 return Subtarget->hasMadMacF32Insts() &&
6601 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6602 if (VT == MVT::f16) {
6603 return Subtarget->hasMadF16() &&
6604 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6605 }
6606
6607 return false;
6608}
6609
6610//===----------------------------------------------------------------------===//
6611// Custom DAG Lowering Operations
6612//===----------------------------------------------------------------------===//
6613
6614// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6615// wider vector type is legal.
6616SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6617 SelectionDAG &DAG) const {
6618 unsigned Opc = Op.getOpcode();
6619 EVT VT = Op.getValueType();
6620 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6621 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6622 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6623 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6624 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6625 VT == MVT::v32bf16);
6626
6627 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6628
6629 SDLoc SL(Op);
6630 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6631 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6632
6633 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6634}
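// For illustration (not in the original source): a v8f16 unary op such as
//   (fneg v8f16:x)
// is emitted as
//   (concat_vectors (fneg v4f16:lo), (fneg v4f16:hi))
// where lo/hi are the halves returned by SplitVectorOperand.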
6635
6636// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6637// regression whereby extra unnecessary instructions were added to codegen
6638// for rotr operations, caused by legalising v2i32 or. This resulted in extra
6639// instructions to extract the result from the vector.
6640SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6641 [[maybe_unused]] EVT VT = Op.getValueType();
6642
6643 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6644 VT == MVT::v16i32) &&
6645 "Unexpected ValueType.");
6646
6647 return DAG.UnrollVectorOp(Op.getNode());
6648}
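// For illustration (not in the original source): (rotr v2i32:x, v2i32:y) is
// unrolled into two scalar (rotr i32 ...) nodes whose results are rebuilt with
// a build_vector, avoiding the extra extract/insert traffic described above.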
6649
6650// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6651// wider vector type is legal.
6652SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6653 SelectionDAG &DAG) const {
6654 unsigned Opc = Op.getOpcode();
6655 EVT VT = Op.getValueType();
6656 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6657 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6658 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6659 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6660 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6661 VT == MVT::v32bf16);
6662
6663 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6664 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6665
6666 SDLoc SL(Op);
6667
6668 SDValue OpLo =
6669 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6670 SDValue OpHi =
6671 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6672
6673 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6674}
6675
6676SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6677 SelectionDAG &DAG) const {
6678 unsigned Opc = Op.getOpcode();
6679 EVT VT = Op.getValueType();
6680 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6681 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6682 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6683 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6684 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6685 VT == MVT::v32bf16);
6686
6687 SDValue Op0 = Op.getOperand(0);
6688 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6689 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6690 : std::pair(Op0, Op0);
6691
6692 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6693 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6694
6695 SDLoc SL(Op);
6696 auto ResVT = DAG.GetSplitDestVTs(VT);
6697
6698 SDValue OpLo =
6699 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6700 SDValue OpHi =
6701 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6702
6703 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6704}
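// For illustration (not in the original source): (fma v8f16:a, v8f16:b, v8f16:c)
// is split into two v4f16 fma nodes and reassembled with concat_vectors.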
6705
6706SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6707 switch (Op.getOpcode()) {
6708 default:
6709 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6710 case ISD::BRCOND:
6711 return LowerBRCOND(Op, DAG);
6712 case ISD::RETURNADDR:
6713 return LowerRETURNADDR(Op, DAG);
6714 case ISD::LOAD: {
6715 SDValue Result = LowerLOAD(Op, DAG);
6716 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6717 "Load should return a value and a chain");
6718 return Result;
6719 }
6720 case ISD::FSQRT: {
6721 EVT VT = Op.getValueType();
6722 if (VT == MVT::f32)
6723 return lowerFSQRTF32(Op, DAG);
6724 if (VT == MVT::f64)
6725 return lowerFSQRTF64(Op, DAG);
6726 return SDValue();
6727 }
6728 case ISD::FSIN:
6729 case ISD::FCOS:
6730 return LowerTrig(Op, DAG);
6731 case ISD::SELECT:
6732 return LowerSELECT(Op, DAG);
6733 case ISD::FDIV:
6734 return LowerFDIV(Op, DAG);
6735 case ISD::FFREXP:
6736 return LowerFFREXP(Op, DAG);
6737 case ISD::ATOMIC_CMP_SWAP:
6738 return LowerATOMIC_CMP_SWAP(Op, DAG);
6739 case ISD::STORE:
6740 return LowerSTORE(Op, DAG);
6741 case ISD::GlobalAddress: {
6742 MachineFunction &MF = DAG.getMachineFunction();
6743 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6744 return LowerGlobalAddress(MFI, Op, DAG);
6745 }
6746 case ISD::INTRINSIC_WO_CHAIN:
6747 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6748 case ISD::INTRINSIC_W_CHAIN:
6749 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6750 case ISD::INTRINSIC_VOID:
6751 return LowerINTRINSIC_VOID(Op, DAG);
6752 case ISD::ADDRSPACECAST:
6753 return lowerADDRSPACECAST(Op, DAG);
6754 case ISD::INSERT_SUBVECTOR:
6755 return lowerINSERT_SUBVECTOR(Op, DAG);
6756 case ISD::INSERT_VECTOR_ELT:
6757 return lowerINSERT_VECTOR_ELT(Op, DAG);
6758 case ISD::EXTRACT_VECTOR_ELT:
6759 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6760 case ISD::VECTOR_SHUFFLE:
6761 return lowerVECTOR_SHUFFLE(Op, DAG);
6762 case ISD::SCALAR_TO_VECTOR:
6763 return lowerSCALAR_TO_VECTOR(Op, DAG);
6764 case ISD::BUILD_VECTOR:
6765 return lowerBUILD_VECTOR(Op, DAG);
6766 case ISD::FP_ROUND:
6767 case ISD::STRICT_FP_ROUND:
6768 return lowerFP_ROUND(Op, DAG);
6769 case ISD::TRAP:
6770 return lowerTRAP(Op, DAG);
6771 case ISD::DEBUGTRAP:
6772 return lowerDEBUGTRAP(Op, DAG);
6773 case ISD::ABS:
6774 case ISD::FABS:
6775 case ISD::FNEG:
6776 case ISD::FCANONICALIZE:
6777 case ISD::BSWAP:
6778 return splitUnaryVectorOp(Op, DAG);
6779 case ISD::FMINNUM:
6780 case ISD::FMAXNUM:
6781 return lowerFMINNUM_FMAXNUM(Op, DAG);
6782 case ISD::FMINIMUMNUM:
6783 case ISD::FMAXIMUMNUM:
6784 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6785 case ISD::FMINIMUM:
6786 case ISD::FMAXIMUM:
6787 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6788 case ISD::FLDEXP:
6789 case ISD::STRICT_FLDEXP:
6790 return lowerFLDEXP(Op, DAG);
6791 case ISD::FMA:
6792 return splitTernaryVectorOp(Op, DAG);
6793 case ISD::FP_TO_SINT:
6794 case ISD::FP_TO_UINT:
6795 return LowerFP_TO_INT(Op, DAG);
6796 case ISD::SHL:
6797 case ISD::SRA:
6798 case ISD::SRL:
6799 case ISD::ADD:
6800 case ISD::SUB:
6801 case ISD::SMIN:
6802 case ISD::SMAX:
6803 case ISD::UMIN:
6804 case ISD::UMAX:
6805 case ISD::FADD:
6806 case ISD::FMUL:
6807 case ISD::FMINNUM_IEEE:
6808 case ISD::FMAXNUM_IEEE:
6809 case ISD::UADDSAT:
6810 case ISD::USUBSAT:
6811 case ISD::SADDSAT:
6812 case ISD::SSUBSAT:
6813 return splitBinaryVectorOp(Op, DAG);
6814 case ISD::FCOPYSIGN:
6815 return lowerFCOPYSIGN(Op, DAG);
6816 case ISD::MUL:
6817 return lowerMUL(Op, DAG);
6818 case ISD::SMULO:
6819 case ISD::UMULO:
6820 return lowerXMULO(Op, DAG);
6821 case ISD::SMUL_LOHI:
6822 case ISD::UMUL_LOHI:
6823 return lowerXMUL_LOHI(Op, DAG);
6824 case ISD::DYNAMIC_STACKALLOC:
6825 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6826 case ISD::STACKSAVE:
6827 return LowerSTACKSAVE(Op, DAG);
6828 case ISD::GET_ROUNDING:
6829 return lowerGET_ROUNDING(Op, DAG);
6830 case ISD::SET_ROUNDING:
6831 return lowerSET_ROUNDING(Op, DAG);
6832 case ISD::PREFETCH:
6833 return lowerPREFETCH(Op, DAG);
6834 case ISD::FP_EXTEND:
6835 case ISD::STRICT_FP_EXTEND:
6836 return lowerFP_EXTEND(Op, DAG);
6837 case ISD::GET_FPENV:
6838 return lowerGET_FPENV(Op, DAG);
6839 case ISD::SET_FPENV:
6840 return lowerSET_FPENV(Op, DAG);
6841 case ISD::ROTR:
6842 return lowerROTR(Op, DAG);
6843 }
6844 return SDValue();
6845}
6846
6847// Used for D16: Casts the result of an instruction into the right vector,
6848// packs values if loads return unpacked values.
6849static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6850 const SDLoc &DL, SelectionDAG &DAG,
6851 bool Unpacked) {
6852 if (!LoadVT.isVector())
6853 return Result;
6854
6855 // Cast back to the original packed type or to a larger type that is a
6856 // multiple of 32 bit for D16. Widening the return type is required for
6857 // legalization.
6858 EVT FittingLoadVT = LoadVT;
6859 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6860 FittingLoadVT =
6861 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6862 LoadVT.getVectorNumElements() + 1);
6863 }
6864
6865 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6866 // Truncate to v2i16/v4i16.
6867 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6868
6869 // Workaround legalizer not scalarizing truncate after vector op
6870 // legalization but not creating intermediate vector trunc.
6871 SmallVector<SDValue, 4> Elts;
6872 DAG.ExtractVectorElements(Result, Elts);
6873 for (SDValue &Elt : Elts)
6874 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6875
6876 // Pad illegal v1i16/v3f16 to v4i16
6877 if ((LoadVT.getVectorNumElements() % 2) == 1)
6878 Elts.push_back(DAG.getPOISON(MVT::i16));
6879
6880 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6881
6882 // Bitcast to original type (v2f16/v4f16).
6883 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6884 }
6885
6886 // Cast back to the original packed type.
6887 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6888}
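// Worked example (illustrative, not in the original source) for unpacked D16
// memory: a v3f16 load result arrives as one 32-bit lane per element; each
// lane is truncated to i16, a poison element pads the odd count to v4i16, and
// the vector is bitcast to the widened v4f16 type expected by legalization.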
6889
6890SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6891 SelectionDAG &DAG,
6892 ArrayRef<SDValue> Ops,
6893 bool IsIntrinsic) const {
6894 SDLoc DL(M);
6895
6896 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6897 EVT LoadVT = M->getValueType(0);
6898
6899 EVT EquivLoadVT = LoadVT;
6900 if (LoadVT.isVector()) {
6901 if (Unpacked) {
6902 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6903 LoadVT.getVectorNumElements());
6904 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6905 // Widen v3f16 to legal type
6906 EquivLoadVT =
6907 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6908 LoadVT.getVectorNumElements() + 1);
6909 }
6910 }
6911
6912 // Change from v4f16/v2f16 to EquivLoadVT.
6913 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6914
6915 SDValue Load = DAG.getMemIntrinsicNode(
6916 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6917 M->getMemoryVT(), M->getMemOperand());
6918
6919 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6920
6921 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6922}
6923
6924SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6925 SelectionDAG &DAG,
6926 ArrayRef<SDValue> Ops) const {
6927 SDLoc DL(M);
6928 EVT LoadVT = M->getValueType(0);
6929 EVT EltType = LoadVT.getScalarType();
6930 EVT IntVT = LoadVT.changeTypeToInteger();
6931
6932 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6933
6934 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6935 bool IsTFE = M->getNumValues() == 3;
6936
6937 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6939 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6940 : AMDGPUISD::BUFFER_LOAD;
6941
6942 if (IsD16) {
6943 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6944 }
6945
6946 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6947 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6948 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6949 IsTFE);
6950
6951 if (isTypeLegal(LoadVT)) {
6952 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6953 M->getMemOperand(), DAG);
6954 }
6955
6956 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6957 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6958 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6959 M->getMemOperand(), DAG);
6960 return DAG.getMergeValues(
6961 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6962 DL);
6963}
6964
6965static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6966 SelectionDAG &DAG) {
6967 EVT VT = N->getValueType(0);
6968 unsigned CondCode = N->getConstantOperandVal(3);
6969 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6970 return DAG.getPOISON(VT);
6971
6972 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6973
6974 SDValue LHS = N->getOperand(1);
6975 SDValue RHS = N->getOperand(2);
6976
6977 SDLoc DL(N);
6978
6979 EVT CmpVT = LHS.getValueType();
6980 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6981 unsigned PromoteOp =
6982 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6983 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6984 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6985 }
6986
6987 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6988
6989 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6990 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6991
6992 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6993 DAG.getCondCode(CCOpcode));
6994 if (VT.bitsEq(CCVT))
6995 return SetCC;
6996 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6997}
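// Illustrative example (not in the original source), wave64, assuming the eq
// predicate encoding:
//   i64 llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32 /*eq*/)
// becomes (AMDGPUISD::SETCC %a, %b, seteq), yielding a 64-bit lane mask.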
6998
6999static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7000 SelectionDAG &DAG) {
7001 EVT VT = N->getValueType(0);
7002
7003 unsigned CondCode = N->getConstantOperandVal(3);
7004 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7005 return DAG.getPOISON(VT);
7006
7007 SDValue Src0 = N->getOperand(1);
7008 SDValue Src1 = N->getOperand(2);
7009 EVT CmpVT = Src0.getValueType();
7010 SDLoc SL(N);
7011
7012 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7013 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7014 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7015 }
7016
7017 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7018 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7019 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7020 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7021 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7022 DAG.getCondCode(CCOpcode));
7023 if (VT.bitsEq(CCVT))
7024 return SetCC;
7025 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7026}
7027
7028static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7029 SelectionDAG &DAG) {
7030 EVT VT = N->getValueType(0);
7031 SDValue Src = N->getOperand(1);
7032 SDLoc SL(N);
7033
7034 if (Src.getOpcode() == ISD::SETCC) {
7035 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7036 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7037 Src.getOperand(1), Src.getOperand(2));
7038 }
7039 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7040 // (ballot 0) -> 0
7041 if (Arg->isZero())
7042 return DAG.getConstant(0, SL, VT);
7043
7044 // (ballot 1) -> EXEC/EXEC_LO
7045 if (Arg->isOne()) {
7046 Register Exec;
7047 if (VT.getScalarSizeInBits() == 32)
7048 Exec = AMDGPU::EXEC_LO;
7049 else if (VT.getScalarSizeInBits() == 64)
7050 Exec = AMDGPU::EXEC;
7051 else
7052 return SDValue();
7053
7054 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7055 }
7056 }
7057
7058 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7059 // ISD::SETNE)
7060 return DAG.getNode(
7061 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7062 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7063}
7064
7065static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7066 SelectionDAG &DAG) {
7067 EVT VT = N->getValueType(0);
7068 unsigned ValSize = VT.getSizeInBits();
7069 unsigned IID = N->getConstantOperandVal(0);
7070 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7071 IID == Intrinsic::amdgcn_permlanex16;
7072 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7073 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7074 SDLoc SL(N);
7075 MVT IntVT = MVT::getIntegerVT(ValSize);
7076 const GCNSubtarget *ST = TLI.getSubtarget();
7077 unsigned SplitSize = 32;
7078 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7079 ST->hasDPALU_DPP() &&
7080 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7081 SplitSize = 64;
7082
7083 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7084 SDValue Src2, MVT ValT) -> SDValue {
7085 SmallVector<SDValue, 6> Operands;
7086 switch (IID) {
7087 case Intrinsic::amdgcn_permlane16:
7088 case Intrinsic::amdgcn_permlanex16:
7089 case Intrinsic::amdgcn_update_dpp:
7090 Operands.push_back(N->getOperand(6));
7091 Operands.push_back(N->getOperand(5));
7092 Operands.push_back(N->getOperand(4));
7093 [[fallthrough]];
7094 case Intrinsic::amdgcn_writelane:
7095 Operands.push_back(Src2);
7096 [[fallthrough]];
7097 case Intrinsic::amdgcn_readlane:
7098 case Intrinsic::amdgcn_set_inactive:
7099 case Intrinsic::amdgcn_set_inactive_chain_arg:
7100 case Intrinsic::amdgcn_mov_dpp8:
7101 Operands.push_back(Src1);
7102 [[fallthrough]];
7103 case Intrinsic::amdgcn_readfirstlane:
7104 case Intrinsic::amdgcn_permlane64:
7105 Operands.push_back(Src0);
7106 break;
7107 default:
7108 llvm_unreachable("unhandled lane op");
7109 }
7110
7111 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7112 std::reverse(Operands.begin(), Operands.end());
7113
7114 if (SDNode *GL = N->getGluedNode()) {
7115 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7116 GL = GL->getOperand(0).getNode();
7117 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7118 SDValue(GL, 0)));
7119 }
7120
7121 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7122 };
7123
7124 SDValue Src0 = N->getOperand(1);
7125 SDValue Src1, Src2;
7126 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7127 IID == Intrinsic::amdgcn_mov_dpp8 ||
7128 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7129 Src1 = N->getOperand(2);
7130 if (IID == Intrinsic::amdgcn_writelane ||
7131 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7132 Src2 = N->getOperand(3);
7133 }
7134
7135 if (ValSize == SplitSize) {
7136 // Already legal
7137 return SDValue();
7138 }
7139
7140 if (ValSize < 32) {
7141 bool IsFloat = VT.isFloatingPoint();
7142 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7143 SL, MVT::i32);
7144
7145 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7146 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7147 SL, MVT::i32);
7148 }
7149
7150 if (IID == Intrinsic::amdgcn_writelane) {
7151 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7152 SL, MVT::i32);
7153 }
7154
7155 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7156 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7157 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7158 }
7159
7160 if (ValSize % SplitSize != 0)
7161 return SDValue();
7162
7163 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7164 EVT VT = N->getValueType(0);
7165 unsigned NE = VT.getVectorNumElements();
7166 EVT EltVT = VT.getVectorElementType();
7167 SmallVector<SDValue, 8> Scalars;
7168 unsigned NumOperands = N->getNumOperands();
7169 SmallVector<SDValue, 4> Operands(NumOperands);
7170 SDNode *GL = N->getGluedNode();
7171
7172 // only handle convergencectrl_glue
7173 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7174
7175 for (unsigned i = 0; i != NE; ++i) {
7176 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7177 ++j) {
7178 SDValue Operand = N->getOperand(j);
7179 EVT OperandVT = Operand.getValueType();
7180 if (OperandVT.isVector()) {
7181 // A vector operand; extract a single element.
7182 EVT OperandEltVT = OperandVT.getVectorElementType();
7183 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7184 Operand, DAG.getVectorIdxConstant(i, SL));
7185 } else {
7186 // A scalar operand; just use it as is.
7187 Operands[j] = Operand;
7188 }
7189 }
7190
7191 if (GL)
7192 Operands[NumOperands - 1] =
7193 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7194 SDValue(GL->getOperand(0).getNode(), 0));
7195
7196 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7197 }
7198
7199 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7200 return DAG.getBuildVector(VecVT, SL, Scalars);
7201 };
7202
7203 if (VT.isVector()) {
7204 switch (MVT::SimpleValueType EltTy =
7205 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7206 case MVT::i32:
7207 case MVT::f32:
7208 if (SplitSize == 32) {
7209 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7210 return unrollLaneOp(LaneOp.getNode());
7211 }
7212 [[fallthrough]];
7213 case MVT::i16:
7214 case MVT::f16:
7215 case MVT::bf16: {
7216 unsigned SubVecNumElt =
7217 SplitSize / VT.getVectorElementType().getSizeInBits();
7218 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7219 SmallVector<SDValue, 4> Pieces;
7220 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7221 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7222 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7223 DAG.getConstant(EltIdx, SL, MVT::i32));
7224
7225 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7226 IsPermLane16)
7227 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7228 DAG.getConstant(EltIdx, SL, MVT::i32));
7229
7230 if (IID == Intrinsic::amdgcn_writelane)
7231 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7232 DAG.getConstant(EltIdx, SL, MVT::i32));
7233
7234 Pieces.push_back(
7235 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7236 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7237 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7238 EltIdx += SubVecNumElt;
7239 }
7240 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7241 }
7242 default:
7243 // Handle all other cases by bitcasting to i32 vectors
7244 break;
7245 }
7246 }
7247
7248 MVT VecVT =
7249 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7250 Src0 = DAG.getBitcast(VecVT, Src0);
7251
7252 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7253 Src1 = DAG.getBitcast(VecVT, Src1);
7254
7255 if (IID == Intrinsic::amdgcn_writelane)
7256 Src2 = DAG.getBitcast(VecVT, Src2);
7257
7258 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7259 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7260 return DAG.getBitcast(VT, UnrolledLaneOp);
7261}
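// Illustrative example (not in the original source): a 64-bit
//   llvm.amdgcn.readlane.i64(%x, %lane)
// is bitcast to v2i32, lowered as two 32-bit readlane operations (one per
// half), and the two results are recombined and bitcast back to i64.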
7262
7263void SITargetLowering::ReplaceNodeResults(SDNode *N,
7264 SmallVectorImpl<SDValue> &Results,
7265 SelectionDAG &DAG) const {
7266 switch (N->getOpcode()) {
7267 case ISD::INSERT_VECTOR_ELT: {
7268 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7269 Results.push_back(Res);
7270 return;
7271 }
7272 case ISD::EXTRACT_VECTOR_ELT: {
7273 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7274 Results.push_back(Res);
7275 return;
7276 }
7277 case ISD::INTRINSIC_WO_CHAIN: {
7278 unsigned IID = N->getConstantOperandVal(0);
7279 switch (IID) {
7280 case Intrinsic::amdgcn_make_buffer_rsrc:
7281 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7282 return;
7283 case Intrinsic::amdgcn_cvt_pkrtz: {
7284 SDValue Src0 = N->getOperand(1);
7285 SDValue Src1 = N->getOperand(2);
7286 SDLoc SL(N);
7287 SDValue Cvt =
7288 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7289 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7290 return;
7291 }
7292 case Intrinsic::amdgcn_cvt_pknorm_i16:
7293 case Intrinsic::amdgcn_cvt_pknorm_u16:
7294 case Intrinsic::amdgcn_cvt_pk_i16:
7295 case Intrinsic::amdgcn_cvt_pk_u16: {
7296 SDValue Src0 = N->getOperand(1);
7297 SDValue Src1 = N->getOperand(2);
7298 SDLoc SL(N);
7299 unsigned Opcode;
7300
7301 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7302 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7303 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7304 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7305 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7306 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7307 else
7308 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7309
7310 EVT VT = N->getValueType(0);
7311 if (isTypeLegal(VT))
7312 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7313 else {
7314 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7315 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7316 }
7317 return;
7318 }
7319 case Intrinsic::amdgcn_s_buffer_load: {
7320 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7321 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7322 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7323 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7324 // s_buffer_load_i8.
7325 if (!Subtarget->hasScalarSubwordLoads())
7326 return;
7327 SDValue Op = SDValue(N, 0);
7328 SDValue Rsrc = Op.getOperand(1);
7329 SDValue Offset = Op.getOperand(2);
7330 SDValue CachePolicy = Op.getOperand(3);
7331 EVT VT = Op.getValueType();
7332 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7333 SDLoc DL(Op);
7334 MachineFunction &MF = DAG.getMachineFunction();
7335 const DataLayout &DataLayout = DAG.getDataLayout();
7336 Align Alignment =
7337 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7338 MachineMemOperand *MMO = MF.getMachineMemOperand(
7339 MachinePointerInfo(),
7340 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7341 MachineMemOperand::MOInvariant,
7342 VT.getStoreSize(), Alignment);
7343 SDValue LoadVal;
7344 if (!Offset->isDivergent()) {
7345 SDValue Ops[] = {Rsrc, // source register
7346 Offset, CachePolicy};
7347 SDValue BufferLoad =
7348 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7349 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7350 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7351 } else {
7352 SDValue Ops[] = {
7353 DAG.getEntryNode(), // Chain
7354 Rsrc, // rsrc
7355 DAG.getConstant(0, DL, MVT::i32), // vindex
7356 {}, // voffset
7357 {}, // soffset
7358 {}, // offset
7359 CachePolicy, // cachepolicy
7360 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7361 };
7362 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7363 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7364 }
7365 Results.push_back(LoadVal);
7366 return;
7367 }
7368 case Intrinsic::amdgcn_dead: {
7369 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7370 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7371 return;
7372 }
7373 }
7374 break;
7375 }
7376 case ISD::INTRINSIC_W_CHAIN: {
7377 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7378 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7379 // FIXME: Hacky
7380 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7381 Results.push_back(Res.getOperand(I));
7382 }
7383 } else {
7384 Results.push_back(Res);
7385 Results.push_back(Res.getValue(1));
7386 }
7387 return;
7388 }
7389
7390 break;
7391 }
7392 case ISD::SELECT: {
7393 SDLoc SL(N);
7394 EVT VT = N->getValueType(0);
7395 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7396 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7397 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7398
7399 EVT SelectVT = NewVT;
7400 if (NewVT.bitsLT(MVT::i32)) {
7401 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7402 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7403 SelectVT = MVT::i32;
7404 }
7405
7406 SDValue NewSelect =
7407 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7408
7409 if (NewVT != SelectVT)
7410 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7411 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7412 return;
7413 }
7414 case ISD::FNEG: {
7415 if (N->getValueType(0) != MVT::v2f16)
7416 break;
7417
7418 SDLoc SL(N);
7419 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7420
7421 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7422 DAG.getConstant(0x80008000, SL, MVT::i32));
7423 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7424 return;
7425 }
7426 case ISD::FABS: {
7427 if (N->getValueType(0) != MVT::v2f16)
7428 break;
7429
7430 SDLoc SL(N);
7431 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7432
7433 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7434 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7435 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7436 return;
7437 }
7438 case ISD::FSQRT: {
7439 if (N->getValueType(0) != MVT::f16)
7440 break;
7441 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7442 break;
7443 }
7444 default:
7445 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7446 break;
7447 }
7448}
7449
7450/// Helper function for LowerBRCOND
7451static SDNode *findUser(SDValue Value, unsigned Opcode) {
7452
7453 for (SDUse &U : Value->uses()) {
7454 if (U.get() != Value)
7455 continue;
7456
7457 if (U.getUser()->getOpcode() == Opcode)
7458 return U.getUser();
7459 }
7460 return nullptr;
7461}
7462
7463unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7464 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7465 switch (Intr->getConstantOperandVal(1)) {
7466 case Intrinsic::amdgcn_if:
7467 return AMDGPUISD::IF;
7468 case Intrinsic::amdgcn_else:
7469 return AMDGPUISD::ELSE;
7470 case Intrinsic::amdgcn_loop:
7471 return AMDGPUISD::LOOP;
7472 case Intrinsic::amdgcn_end_cf:
7473 llvm_unreachable("should not occur");
7474 default:
7475 return 0;
7476 }
7477 }
7478
7479 // break, if_break, else_break are all only used as inputs to loop, not
7480 // directly as branch conditions.
7481 return 0;
7482}
7483
7490
7491bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7492 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7493 return false;
7494
7495 // FIXME: Either avoid relying on address space here or change the default
7496 // address space for functions to avoid the explicit check.
7497 return (GV->getValueType()->isFunctionTy() ||
7498 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
7499 !shouldEmitFixup(GV) && !shouldEmitPCReloc(GV);
7500}
7501
7502bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7503 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7504}
7505
7506bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7507 if (!GV->hasExternalLinkage())
7508 return true;
7509
7510 const auto OS = getTargetMachine().getTargetTriple().getOS();
7511 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7512}
7513
7514/// This transforms the control flow intrinsics to get the branch destination as
7515/// the last parameter, and switches the branch target with BR if the need arises.
7516SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7517 SDLoc DL(BRCOND);
7518
7519 SDNode *Intr = BRCOND.getOperand(1).getNode();
7520 SDValue Target = BRCOND.getOperand(2);
7521 SDNode *BR = nullptr;
7522 SDNode *SetCC = nullptr;
7523
7524 switch (Intr->getOpcode()) {
7525 case ISD::SETCC: {
7526 // As long as we negate the condition everything is fine
7527 SetCC = Intr;
7528 Intr = SetCC->getOperand(0).getNode();
7529 break;
7530 }
7531 case ISD::XOR: {
7532 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7533 SDValue LHS = Intr->getOperand(0);
7534 SDValue RHS = Intr->getOperand(1);
7535 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7536 Intr = LHS.getNode();
7537 break;
7538 }
7539 [[fallthrough]];
7540 }
7541 default: {
7542 // Get the target from BR if we don't negate the condition
7543 BR = findUser(BRCOND, ISD::BR);
7544 assert(BR && "brcond missing unconditional branch user");
7545 Target = BR->getOperand(1);
7546 }
7547 }
7548
7549 unsigned CFNode = isCFIntrinsic(Intr);
7550 if (CFNode == 0) {
7551 // This is a uniform branch so we don't need to legalize.
7552 return BRCOND;
7553 }
7554
7555 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7557
7558 assert(!SetCC ||
7559 (SetCC->getConstantOperandVal(1) == 1 &&
7560 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7561 ISD::SETNE));
7562
7563 // operands of the new intrinsic call
7564 SmallVector<SDValue, 4> Ops;
7565 if (HaveChain)
7566 Ops.push_back(BRCOND.getOperand(0));
7567
7568 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7569 Ops.push_back(Target);
7570
7571 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7572
7573 // build the new intrinsic call
7574 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7575
7576 if (!HaveChain) {
7577 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7578
7579 Result = DAG.getMergeValues(Ops, DL).getNode();
7580 }
7581
7582 if (BR) {
7583 // Give the branch instruction our target
7584 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7585 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7586 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7587 }
7588
7589 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7590
7591 // Copy the intrinsic results to registers
7592 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7593 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7594 if (!CopyToReg)
7595 continue;
7596
7597 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7598 SDValue(Result, i - 1), SDValue());
7599
7600 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7601 }
7602
7603 // Remove the old intrinsic from the chain
7604 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7605 Intr->getOperand(0));
7606
7607 return Chain;
7608}
7609
7610SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7611 MVT VT = Op.getSimpleValueType();
7612 SDLoc DL(Op);
7613 // Checking the depth
7614 if (Op.getConstantOperandVal(0) != 0)
7615 return DAG.getConstant(0, DL, VT);
7616
7617 MachineFunction &MF = DAG.getMachineFunction();
7618 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7619 // Check for kernel and shader functions
7620 if (Info->isEntryFunction())
7621 return DAG.getConstant(0, DL, VT);
7622
7623 MachineFrameInfo &MFI = MF.getFrameInfo();
7624 // There is a call to @llvm.returnaddress in this function
7625 MFI.setReturnAddressIsTaken(true);
7626
7627 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7628 // Get the return address reg and mark it as an implicit live-in
7629 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7630 getRegClassFor(VT, Op.getNode()->isDivergent()));
7631
7632 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7633}
7634
7635SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7636 const SDLoc &DL, EVT VT) const {
7637 return Op.getValueType().bitsLE(VT)
7638 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7639 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7640 DAG.getTargetConstant(0, DL, MVT::i32));
7641}
7642
7643SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7644 SelectionDAG &DAG) const {
7645 EVT DstVT = Op.getValueType();
7646 unsigned NumElts = DstVT.getVectorNumElements();
7647 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7648
7649 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7650
7651 SDLoc DL(Op);
7652 unsigned Opc = Op.getOpcode();
7653 SDValue Flags = Op.getOperand(1);
7654 EVT HalfDstVT =
7655 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7656 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7657 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7658
7659 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7660}
7661
7662SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7663 SDValue Src = Op.getOperand(0);
7664 EVT SrcVT = Src.getValueType();
7665 EVT DstVT = Op.getValueType();
7666
7667 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7668 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7669 if (SrcVT.getScalarType() != MVT::f32)
7670 return SDValue();
7671 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7672 }
7673
7674 if (SrcVT.getScalarType() != MVT::f64)
7675 return Op;
7676
7677 SDLoc DL(Op);
7678 if (DstVT == MVT::f16) {
7679 // TODO: Handle strictfp
7680 if (Op.getOpcode() != ISD::FP_ROUND)
7681 return Op;
7682
7683 if (!Subtarget->has16BitInsts()) {
7684 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7685 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7686 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7687 }
7688 if (Op->getFlags().hasApproximateFuncs()) {
7689 SDValue Flags = Op.getOperand(1);
7690 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7691 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7692 }
7693 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7694 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7695 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7696 }
7697
7698 assert(DstVT.getScalarType() == MVT::bf16 &&
7699 "custom lower FP_ROUND for f16 or bf16");
7700 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7701
7702 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7703 // hardware f32 -> bf16 instruction.
7704 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7705 MVT::f32;
7706 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7707 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7708 DAG.getTargetConstant(0, DL, MVT::i32));
7709}
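// Illustrative example (not in the original source): an f64 -> bf16 fp_round
// is emitted as a round-to-odd f64 -> f32 step followed by the hardware
// f32 -> bf16 conversion, so the two successive roundings cannot introduce a
// double-rounding error.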
7710
7711SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7712 SelectionDAG &DAG) const {
7713 EVT VT = Op.getValueType();
7714 const MachineFunction &MF = DAG.getMachineFunction();
7715 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7716 bool IsIEEEMode = Info->getMode().IEEE;
7717
7718 // FIXME: Assert during selection that this is only selected for
7719 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7720 // mode functions, but this happens to be OK since it's only done in cases
7721 // where there is known no sNaN.
7722 if (IsIEEEMode)
7723 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7724
7725 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7726 VT == MVT::v16bf16)
7727 return splitBinaryVectorOp(Op, DAG);
7728 return Op;
7729}
7730
7731SDValue
7732SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7733 SelectionDAG &DAG) const {
7734 EVT VT = Op.getValueType();
7735 const MachineFunction &MF = DAG.getMachineFunction();
7736 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7737 bool IsIEEEMode = Info->getMode().IEEE;
7738
7739 if (IsIEEEMode)
7740 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7741
7742 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7743 VT == MVT::v16bf16)
7744 return splitBinaryVectorOp(Op, DAG);
7745 return Op;
7746}
7747
7748SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7749 SelectionDAG &DAG) const {
7750 EVT VT = Op.getValueType();
7751 if (VT.isVector())
7752 return splitBinaryVectorOp(Op, DAG);
7753
7754 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7755 !Subtarget->hasMinimum3Maximum3F16() &&
7756 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7757 "should not need to widen f16 minimum/maximum to v2f16");
7758
7759 // Widen f16 operation to v2f16
7760
7761 // fminimum f16:x, f16:y ->
7762 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7763 // (v2f16 (scalar_to_vector y))), 0
7764 SDLoc SL(Op);
7765 SDValue WideSrc0 =
7766 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7767 SDValue WideSrc1 =
7768 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7769
7770 SDValue Widened =
7771 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7772
7773 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7774 DAG.getConstant(0, SL, MVT::i32));
7775}
7776
7777SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7778 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7779 EVT VT = Op.getValueType();
7780 assert(VT == MVT::f16);
7781
7782 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7783 EVT ExpVT = Exp.getValueType();
7784 if (ExpVT == MVT::i16)
7785 return Op;
7786
7787 SDLoc DL(Op);
7788
7789 // Correct the exponent type for f16 to i16.
7790 // Clamp the range of the exponent to the instruction's range.
7791
7792 // TODO: This should be a generic narrowing legalization, and can easily be
7793 // done for GlobalISel.
7794
7795 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7796 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7797
7798 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7799 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7800
7801 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7802
7803 if (IsStrict) {
7804 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7805 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7806 }
7807
7808 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7809}
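// Illustrative example (not in the original source):
//   (fldexp f16:x, i32:e)
// becomes
//   (fldexp f16:x, (trunc (smin (smax e, -32768), 32767) to i16))
// which is lossless because f16 ldexp saturates long before those bounds.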
7810
7811static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7812 switch (Op->getOpcode()) {
7813 case ISD::SRA:
7814 case ISD::SMIN:
7815 case ISD::SMAX:
7816 return ISD::SIGN_EXTEND;
7817 case ISD::SRL:
7818 case ISD::UMIN:
7819 case ISD::UMAX:
7820 return ISD::ZERO_EXTEND;
7821 case ISD::ADD:
7822 case ISD::SUB:
7823 case ISD::AND:
7824 case ISD::OR:
7825 case ISD::XOR:
7826 case ISD::SHL:
7827 case ISD::SELECT:
7828 case ISD::MUL:
7829 // operation result won't be influenced by garbage high bits.
7830 // TODO: are all of those cases correct, and are there more?
7831 return ISD::ANY_EXTEND;
7832 case ISD::SETCC: {
7833 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7834 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7835 }
7836 default:
7837 llvm_unreachable("unexpected opcode!");
7838 }
7839}
7840
7841SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7842 DAGCombinerInfo &DCI) const {
7843 const unsigned Opc = Op.getOpcode();
7844 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7845 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7846 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7847 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7848 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7849
7850 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7851 : Op->getOperand(0).getValueType();
7852 auto ExtTy = OpTy.changeElementType(MVT::i32);
7853
7854 if (DCI.isBeforeLegalizeOps() ||
7855 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7856 return SDValue();
7857
7858 auto &DAG = DCI.DAG;
7859
7860 SDLoc DL(Op);
7861 SDValue LHS;
7862 SDValue RHS;
7863 if (Opc == ISD::SELECT) {
7864 LHS = Op->getOperand(1);
7865 RHS = Op->getOperand(2);
7866 } else {
7867 LHS = Op->getOperand(0);
7868 RHS = Op->getOperand(1);
7869 }
7870
7871 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7872 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7873
7874 // Special case: for shifts, the RHS always needs a zext.
7875 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7876 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7877 else
7878 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7879
7880 // setcc always return i1/i1 vec so no need to truncate after.
7881 if (Opc == ISD::SETCC) {
7882 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7883 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7884 }
7885
7886 // For other ops, we extend the operation's return type as well so we need to
7887 // truncate back to the original type.
7888 SDValue NewVal;
7889 if (Opc == ISD::SELECT)
7890 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7891 else
7892 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7893
7894 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7895}
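// Illustrative example (not in the original source): a uniform 16-bit add
//   (add i16:a, i16:b)
// is rewritten as
//   (trunc (add (any_extend i32:a), (any_extend i32:b)) to i16)
// so the operation can be selected to the scalar unit, which lacks 16-bit ops.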
7896
7897SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7898 SDValue Mag = Op.getOperand(0);
7899 EVT MagVT = Mag.getValueType();
7900
7901 if (MagVT.getVectorNumElements() > 2)
7902 return splitBinaryVectorOp(Op, DAG);
7903
7904 SDValue Sign = Op.getOperand(1);
7905 EVT SignVT = Sign.getValueType();
7906
7907 if (MagVT == SignVT)
7908 return Op;
7909
7910 // fcopysign v2f16:mag, v2f32:sign ->
7911 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7912
7913 SDLoc SL(Op);
7914 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7915 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7916
7917 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7918
7919 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7920}
7921
7922// Custom lowering for vector multiplications and s_mul_u64.
7923SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7924 EVT VT = Op.getValueType();
7925
7926 // Split vector operands.
7927 if (VT.isVector())
7928 return splitBinaryVectorOp(Op, DAG);
7929
7930 assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
7931
7932 // There are four ways to lower s_mul_u64:
7933 //
7934 // 1. If all the operands are uniform, then we lower it as it is.
7935 //
7936 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7937 // multiplications because there is not a vector equivalent of s_mul_u64.
7938 //
7939 // 3. If the cost model decides that it is more efficient to use vector
7940 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7941 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7942 //
7943 // 4. If the cost model decides to use vector registers and both of the
7944 // operands are zero-extended/sign-extended from 32-bits, then we split the
7945 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7946 // possible to check if the operands are zero-extended or sign-extended in
7947 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7948 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7949 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7950 // If the cost model decides that we have to use vector registers, then
7951 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7952 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7953 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7954 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7955 // SIInstrInfo.cpp .
7956
7957 if (Op->isDivergent())
7958 return SDValue();
7959
7960 SDValue Op0 = Op.getOperand(0);
7961 SDValue Op1 = Op.getOperand(1);
7962 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7963 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7964 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7965 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7966 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7967 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7968 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7969 SDLoc SL(Op);
7970 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7971 return SDValue(
7972 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7973 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7974 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7975 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7976 return SDValue(
7977 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7978 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7979 return Op;
7980}
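// Illustrative example (not in the original source): for a uniform i64 multiply
// whose operands are known zero-extended from 32 bits,
//   (mul i64:x, i64:y) --> S_MUL_U64_U32_PSEUDO x, y
// which is later expanded to s_mul_u64 or to 32-bit multiplies depending on
// whether the result is assigned to scalar or vector registers.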
7981
7982SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7983 EVT VT = Op.getValueType();
7984 SDLoc SL(Op);
7985 SDValue LHS = Op.getOperand(0);
7986 SDValue RHS = Op.getOperand(1);
7987 bool isSigned = Op.getOpcode() == ISD::SMULO;
7988
7989 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7990 const APInt &C = RHSC->getAPIntValue();
7991 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7992 if (C.isPowerOf2()) {
7993 // smulo(x, signed_min) is same as umulo(x, signed_min).
7994 bool UseArithShift = isSigned && !C.isMinSignedValue();
7995 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7996 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7997 SDValue Overflow =
7998 DAG.getSetCC(SL, MVT::i1,
7999 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8000 Result, ShiftAmt),
8001 LHS, ISD::SETNE);
8002 return DAG.getMergeValues({Result, Overflow}, SL);
8003 }
8004 }
8005
8006 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8007 SDValue Top =
8008 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8009
8010 SDValue Sign = isSigned
8011 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8012 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8013 SL, MVT::i32))
8014 : DAG.getConstant(0, SL, VT);
8015 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8016
8017 return DAG.getMergeValues({Result, Overflow}, SL);
8018}
8019
8020SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8021 if (Op->isDivergent()) {
8022 // Select to V_MAD_[IU]64_[IU]32.
8023 return Op;
8024 }
8025 if (Subtarget->hasSMulHi()) {
8026 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8027 return SDValue();
8028 }
8029 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8030 // calculate the high part, so we might as well do the whole thing with
8031 // V_MAD_[IU]64_[IU]32.
8032 return Op;
8033}
8034
8035SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8036 if (!Subtarget->isTrapHandlerEnabled() ||
8037 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8038 return lowerTrapEndpgm(Op, DAG);
8039
8040 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8041 : lowerTrapHsaQueuePtr(Op, DAG);
8042}
8043
8044SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8045 SDLoc SL(Op);
8046 SDValue Chain = Op.getOperand(0);
8047 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8048}
8049
8050SDValue
8051SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8052 const SDLoc &DL, Align Alignment,
8053 ImplicitParameter Param) const {
8054 MachineFunction &MF = DAG.getMachineFunction();
8055 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8056 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8057 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8058 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8059 MachineMemOperand::MODereferenceable |
8060 MachineMemOperand::MOInvariant);
8061}
8062
8063SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8064 SelectionDAG &DAG) const {
8065 SDLoc SL(Op);
8066 SDValue Chain = Op.getOperand(0);
8067
8068 SDValue QueuePtr;
8069 // For code object version 5, QueuePtr is passed through implicit kernarg.
8070 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8071 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8072 QueuePtr =
8073 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8074 } else {
8075 MachineFunction &MF = DAG.getMachineFunction();
8076 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8077 Register UserSGPR = Info->getQueuePtrUserSGPR();
8078
8079 if (UserSGPR == AMDGPU::NoRegister) {
8080 // We probably are in a function incorrectly marked with
8081 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8082 // trap, so just use a null pointer.
8083 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8084 } else {
8085 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8086 MVT::i64);
8087 }
8088 }
8089
8090 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8091 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8092
8093 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8094 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8095 ToReg.getValue(1)};
8096 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8097}
8098
8099SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8100 SDLoc SL(Op);
8101 SDValue Chain = Op.getOperand(0);
8102
8103 // We need to simulate the 's_trap 2' instruction on targets that run in
8104 // PRIV=1 (where it is treated as a nop).
8105 if (Subtarget->hasPrivEnabledTrap2NopBug())
8106 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8107
8108 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8109 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8110 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8111}
8112
8113SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8114 SDLoc SL(Op);
8115 SDValue Chain = Op.getOperand(0);
8116 MachineFunction &MF = DAG.getMachineFunction();
8117
8118 if (!Subtarget->isTrapHandlerEnabled() ||
8119 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8120 LLVMContext &Ctx = MF.getFunction().getContext();
8121 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8122 "debugtrap handler not supported",
8123 Op.getDebugLoc(), DS_Warning));
8124 return Chain;
8125 }
8126
8127 uint64_t TrapID =
8128 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8129 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8130 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8131}
8132
8133SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8134 SelectionDAG &DAG) const {
8135 if (Subtarget->hasApertureRegs()) {
8136 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8137 ? AMDGPU::SRC_SHARED_BASE
8138 : AMDGPU::SRC_PRIVATE_BASE;
8139 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8140 !Subtarget->hasGloballyAddressableScratch()) &&
8141 "Cannot use src_private_base with globally addressable scratch!");
8142 // Note: this feature (register) is broken. When used as a 32-bit operand,
8143 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8144 // bits.
8145 //
8146 // To work around the issue, emit a 64 bit copy from this register
8147 // then extract the high bits. Note that this shouldn't even result in a
8148 // shift being emitted and simply become a pair of registers (e.g.):
8149 // s_mov_b64 s[6:7], src_shared_base
8150 // v_mov_b32_e32 v1, s7
8151 SDValue Copy =
8152 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8153 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8154 }
8155
8156 // For code object version 5, private_base and shared_base are passed through
8157 // implicit kernargs.
8158 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8159 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8160 ImplicitParameter Param =
8161 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8162 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8163 }
8164
8165 MachineFunction &MF = DAG.getMachineFunction();
8166 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8167 Register UserSGPR = Info->getQueuePtrUserSGPR();
8168 if (UserSGPR == AMDGPU::NoRegister) {
8169 // We probably are in a function incorrectly marked with
8170 // amdgpu-no-queue-ptr. This is undefined.
8171 return DAG.getPOISON(MVT::i32);
8172 }
8173
8174 SDValue QueuePtr =
8175 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8176
8177 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8178 // private_segment_aperture_base_hi.
8179 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8180
8181 SDValue Ptr =
8182 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8183
8184 // TODO: Use custom target PseudoSourceValue.
8185 // TODO: We should use the value from the IR intrinsic call, but it might not
8186 // be available and how do we get it?
8187 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8188 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8189 commonAlignment(Align(64), StructOffset),
8190 MachineMemOperand::MODereferenceable |
8191 MachineMemOperand::MOInvariant);
8192}
8193
8194/// Return true if the value is a known valid address, such that a null check is
8195/// not necessary.
8196static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8197 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8198 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8199 return true;
8200
8201 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8202 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8203
8204 // TODO: Search through arithmetic, handle arguments and loads
8205 // marked nonnull.
8206 return false;
8207}
8208
8209SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8210 SelectionDAG &DAG) const {
8211 SDLoc SL(Op);
8212
8213 const AMDGPUTargetMachine &TM =
8214 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8215
8216 unsigned DestAS, SrcAS;
8217 SDValue Src;
8218 bool IsNonNull = false;
8219 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8220 SrcAS = ASC->getSrcAddressSpace();
8221 Src = ASC->getOperand(0);
8222 DestAS = ASC->getDestAddressSpace();
8223 } else {
8224 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8225 Op.getConstantOperandVal(0) ==
8226 Intrinsic::amdgcn_addrspacecast_nonnull);
8227 Src = Op->getOperand(1);
8228 SrcAS = Op->getConstantOperandVal(2);
8229 DestAS = Op->getConstantOperandVal(3);
8230 IsNonNull = true;
8231 }
8232
8233 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8234
8235 // flat -> local/private
8236 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8237 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8238 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8239 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8240
8241 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8242 Subtarget->hasGloballyAddressableScratch()) {
8243 // flat -> private with globally addressable scratch: subtract
8244 // src_flat_scratch_base_lo.
8245 SDValue FlatScratchBaseLo(
8246 DAG.getMachineNode(
8247 AMDGPU::S_MOV_B32, SL, MVT::i32,
8248 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8249 0);
8250 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8251 }
8252
8253 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8254 return Ptr;
8255
8256 unsigned NullVal = TM.getNullPointerValue(DestAS);
8257 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8258 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8259
8260 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8261 SegmentNullPtr);
8262 }
8263 }
8264
8265 // local/private -> flat
8266 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8267 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8268 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8269 SDValue CvtPtr;
8270 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8271 Subtarget->hasGloballyAddressableScratch()) {
8272 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8273 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8274 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8275 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8276 ThreadID = DAG.getNode(
8277 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8278 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8279 AllOnes, ThreadID);
8280 if (Subtarget->isWave64())
8281 ThreadID = DAG.getNode(
8282 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8283 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8284 AllOnes, ThreadID);
8285 SDValue ShAmt = DAG.getShiftAmountConstant(
8286 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8287 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8288 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8289 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8290 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8291 // 64-bit hi:lo value.
8292 SDValue FlatScratchBase = {
8293 DAG.getMachineNode(
8294 AMDGPU::S_MOV_B64, SL, MVT::i64,
8295 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8296 0};
8297 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8298 } else {
8299 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8300 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8301 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8302 }
8303
8304 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8305 return CvtPtr;
8306
8307 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8308 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8309
8310 SDValue NonNull =
8311 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8312
8313 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8314 FlatNullPtr);
8315 }
8316 }
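As a worked example of the shift used above for globally addressable scratch (a sketch, not part of the backend): ShAmt = 57 - 32 - log2(wavefront size), so the thread id starts at overall bit 52 for wave32 and bit 51 for wave64, matching the comment.

#include <cassert>

// The high 32 bits of the flat address are built as a separate word, so the
// shift is applied within that high word; the "- 32" accounts for the bits
// already consumed by the low word.
constexpr unsigned tidShiftInHighWord(unsigned WavefrontSizeLog2) {
  return 57 - 32 - WavefrontSizeLog2;
}

static_assert(tidShiftInHighWord(5) == 20, "wave32: TID[4:0] starts at bit 52");
static_assert(tidShiftInHighWord(6) == 19, "wave64: TID[5:0] starts at bit 51");

int main() {
  assert(32 + tidShiftInHighWord(5) == 52); // wave32
  assert(32 + tidShiftInHighWord(6) == 51); // wave64
  return 0;
}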
8317
8318 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8319 Op.getValueType() == MVT::i64) {
8320 const SIMachineFunctionInfo *Info =
8321 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8322 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8323 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8324 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8325 }
8326
8327 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8328 Src.getValueType() == MVT::i64)
8329 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8330
8331 // global <-> flat are no-ops and never emitted.
8332
8333 // Invalid casts are poison.
8334 return DAG.getPOISON(Op->getValueType(0));
8335}
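A minimal scalar model of the non-globally-addressable-scratch flat <-> private/local casts built above, assuming the segment null value is the all-ones 32-bit pattern and the flat null value is 0; the function names are illustrative, not LLVM APIs.

#include <cstdint>

// flat -> local/private: truncate, but map the flat null pointer to the
// 32-bit segment null value.
uint32_t flatToSegment(uint64_t Flat, uint32_t SegmentNull = ~0u) {
  return Flat == 0 ? SegmentNull : static_cast<uint32_t>(Flat);
}

// local/private -> flat: build {Seg, ApertureHi} as one 64-bit value, but
// map the segment null value to the flat null pointer.
uint64_t segmentToFlat(uint32_t Seg, uint32_t ApertureHi,
                       uint32_t SegmentNull = ~0u) {
  if (Seg == SegmentNull)
    return 0;
  return (static_cast<uint64_t>(ApertureHi) << 32) | Seg;
}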
8336
8337// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8338// the small vector and inserting them into the big vector. That is better than
8339// the default expansion of doing it via a stack slot. Even though the use of
8340// the stack slot would be optimized away afterwards, the stack slot itself
8341// remains.
8342SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8343 SelectionDAG &DAG) const {
8344 SDValue Vec = Op.getOperand(0);
8345 SDValue Ins = Op.getOperand(1);
8346 SDValue Idx = Op.getOperand(2);
8347 EVT VecVT = Vec.getValueType();
8348 EVT InsVT = Ins.getValueType();
8349 EVT EltVT = VecVT.getVectorElementType();
8350 unsigned InsNumElts = InsVT.getVectorNumElements();
8351 unsigned IdxVal = Idx->getAsZExtVal();
8352 SDLoc SL(Op);
8353
8354 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8355 // Insert 32-bit registers at a time.
8356 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8357
8358 unsigned VecNumElts = VecVT.getVectorNumElements();
8359 EVT NewVecVT =
8360 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8361    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8362                                   : EVT::getVectorVT(*DAG.getContext(),
8363                                                      MVT::i32, InsNumElts / 2);
8364
8365 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8366 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8367
8368 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8369 SDValue Elt;
8370 if (InsNumElts == 2) {
8371 Elt = Ins;
8372 } else {
8373 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8374 DAG.getConstant(I, SL, MVT::i32));
8375 }
8376 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8377 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8378 }
8379
8380 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8381 }
8382
8383 for (unsigned I = 0; I != InsNumElts; ++I) {
8384 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8385 DAG.getConstant(I, SL, MVT::i32));
8386 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8387 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8388 }
8389 return Vec;
8390}
8391
8392SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8393 SelectionDAG &DAG) const {
8394 SDValue Vec = Op.getOperand(0);
8395 SDValue InsVal = Op.getOperand(1);
8396 SDValue Idx = Op.getOperand(2);
8397 EVT VecVT = Vec.getValueType();
8398 EVT EltVT = VecVT.getVectorElementType();
8399 unsigned VecSize = VecVT.getSizeInBits();
8400 unsigned EltSize = EltVT.getSizeInBits();
8401 SDLoc SL(Op);
8402
8403 // Specially handle the case of v4i16 with static indexing.
8404 unsigned NumElts = VecVT.getVectorNumElements();
8405 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8406 if (NumElts == 4 && EltSize == 16 && KIdx) {
8407 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8408
8409 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8410 DAG.getConstant(0, SL, MVT::i32));
8411 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8412 DAG.getConstant(1, SL, MVT::i32));
8413
8414 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8415 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8416
8417 unsigned Idx = KIdx->getZExtValue();
8418 bool InsertLo = Idx < 2;
8419 SDValue InsHalf = DAG.getNode(
8420 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8421 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8422 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8423
8424 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8425
8426 SDValue Concat =
8427 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8428 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8429
8430 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8431 }
8432
8433 // Static indexing does not lower to stack access, and hence there is no need
8434 // for special custom lowering to avoid stack access.
8435 if (isa<ConstantSDNode>(Idx))
8436 return SDValue();
8437
8438 // Avoid stack access for dynamic indexing by custom lowering to
8439 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8440
8441 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8442
8443 MVT IntVT = MVT::getIntegerVT(VecSize);
8444
8445 // Convert vector index to bit-index and get the required bit mask.
8446 assert(isPowerOf2_32(EltSize));
8447 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8448 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8449 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8450 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8451 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8452
8453 // 1. Create a congruent vector with the target value in each element.
8454 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8455 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8456
8457 // 2. Mask off all other indices except the required index within (1).
8458 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8459
8460 // 3. Mask off the required index within the target vector.
8461 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8462 SDValue RHS =
8463 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8464
8465 // 4. Get (2) and (3) ORed into the target vector.
8466 SDValue BFI =
8467 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8468
8469 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8470}
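A scalar sketch of the v_bfm/v_bfi style blend built in steps (1)-(4) above, treating the whole (at most 64-bit) vector as one integer; the helper name is illustrative.

#include <cstdint>

uint64_t insertElementViaMask(uint64_t VecBits, uint64_t SplatOfValue,
                              unsigned EltSizeInBits, unsigned Idx) {
  // Bitfield mask selecting the destination element (v_bfm_b32 analogue).
  const uint64_t EltMask =
      EltSizeInBits >= 64 ? ~0ull : ((1ull << EltSizeInBits) - 1);
  const uint64_t BFM = EltMask << (Idx * EltSizeInBits);
  // Keep the new value inside the mask and the old vector outside it
  // (v_bfi_b32 analogue).
  return (BFM & SplatOfValue) | (~BFM & VecBits);
}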
8471
8472SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8473 SelectionDAG &DAG) const {
8474 SDLoc SL(Op);
8475
8476 EVT ResultVT = Op.getValueType();
8477 SDValue Vec = Op.getOperand(0);
8478 SDValue Idx = Op.getOperand(1);
8479 EVT VecVT = Vec.getValueType();
8480 unsigned VecSize = VecVT.getSizeInBits();
8481 EVT EltVT = VecVT.getVectorElementType();
8482
8483 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8484
8485 // Make sure we do any optimizations that will make it easier to fold
8486 // source modifiers before obscuring it with bit operations.
8487
8488 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8489 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8490 return Combined;
8491
8492 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8493 SDValue Lo, Hi;
8494 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8495
8496 if (VecSize == 128) {
8497 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8498 Lo = DAG.getBitcast(LoVT,
8499 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8500 DAG.getConstant(0, SL, MVT::i32)));
8501 Hi = DAG.getBitcast(HiVT,
8502 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8503 DAG.getConstant(1, SL, MVT::i32)));
8504 } else if (VecSize == 256) {
8505 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8506 SDValue Parts[4];
8507 for (unsigned P = 0; P < 4; ++P) {
8508 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8509 DAG.getConstant(P, SL, MVT::i32));
8510 }
8511
8512 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8513 Parts[0], Parts[1]));
8514 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8515 Parts[2], Parts[3]));
8516 } else {
8517 assert(VecSize == 512);
8518
8519 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8520 SDValue Parts[8];
8521 for (unsigned P = 0; P < 8; ++P) {
8522 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8523 DAG.getConstant(P, SL, MVT::i32));
8524 }
8525
8526 Lo = DAG.getBitcast(LoVT,
8527 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8528 Parts[0], Parts[1], Parts[2], Parts[3]));
8529 Hi = DAG.getBitcast(HiVT,
8530 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8531 Parts[4], Parts[5], Parts[6], Parts[7]));
8532 }
8533
8534 EVT IdxVT = Idx.getValueType();
8535 unsigned NElem = VecVT.getVectorNumElements();
8536 assert(isPowerOf2_32(NElem));
8537 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8538 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8539 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8540 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8541 }
8542
8543 assert(VecSize <= 64);
8544
8545 MVT IntVT = MVT::getIntegerVT(VecSize);
8546
8547 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8548 SDValue VecBC = peekThroughBitcasts(Vec);
8549 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8550 SDValue Src = VecBC.getOperand(0);
8551 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8552 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8553 }
8554
8555 unsigned EltSize = EltVT.getSizeInBits();
8556 assert(isPowerOf2_32(EltSize));
8557
8558 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8559
8560 // Convert vector index to bit-index (* EltSize)
8561 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8562
8563 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8564 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8565
8566 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8567 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8568 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8569 }
8570
8571 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8572}
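The dynamic-index path above is, in scalar terms, a shift followed by a truncate; a small illustrative sketch (not part of the lowering):

#include <cstdint>

uint64_t extractElementViaShift(uint64_t VecBits, unsigned EltSizeInBits,
                                unsigned Idx) {
  const uint64_t EltMask =
      EltSizeInBits >= 64 ? ~0ull : ((1ull << EltSizeInBits) - 1);
  // Convert the vector index to a bit index, shift the element to bit 0,
  // then truncate to the element width.
  return (VecBits >> (Idx * EltSizeInBits)) & EltMask;
}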
8573
8574static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8575 assert(Elt % 2 == 0);
8576 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8577}
8578
8579static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8580 assert(Elt % 2 == 0);
8581 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8582 !(Mask[Elt + 1] & 1);
8583}
8584
8585SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8586 SelectionDAG &DAG) const {
8587 SDLoc SL(Op);
8588 EVT ResultVT = Op.getValueType();
8589 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8590 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8591 const int NewSrcNumElts = 2;
8592 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8593 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8594
8595 // Break up the shuffle into registers sized pieces.
8596 //
8597 // We're trying to form sub-shuffles that the register allocation pipeline
8598 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8599 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8600 // pair of copies into a consecutive register copy, so use the ordinary
8601 // extract_vector_elt lowering unless we can use the shuffle.
8602 //
8603 // TODO: This is a bit of hack, and we should probably always use
8604 // extract_subvector for the largest possible subvector we can (or at least
8605  // use it for PackVT aligned pieces). However, we have worse support for
8606  // combines on them and don't directly treat extract_subvector / insert_subvector
8607 // as legal. The DAG scheduler also ends up doing a worse job with the
8608 // extract_subvectors.
8609 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8610
8611 // vector_shuffle <0,1,6,7> lhs, rhs
8612 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8613 //
8614 // vector_shuffle <6,7,2,3> lhs, rhs
8615 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8616 //
8617 // vector_shuffle <6,7,0,1> lhs, rhs
8618 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8619
8620 // Avoid scalarizing when both halves are reading from consecutive elements.
8621
8622 // If we're treating 2 element shuffles as legal, also create odd-to-even
8623 // shuffles of neighboring pairs.
8624 //
8625 // vector_shuffle <3,2,7,6> lhs, rhs
8626 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8627 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8628
8629  SmallVector<SDValue, 16> Pieces;
8630  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8631    if (ShouldUseConsecutiveExtract &&
8632        elementPairIsContiguous(SVN->getMask(), I)) {
8633      const int Idx = SVN->getMaskElt(I);
8634 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8635 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8636 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8637 SVN->getOperand(VecIdx),
8638 DAG.getConstant(EltIdx, SL, MVT::i32));
8639 Pieces.push_back(SubVec);
8641    } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8642               isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8642 int Idx0 = SVN->getMaskElt(I);
8643 int Idx1 = SVN->getMaskElt(I + 1);
8644
8645 SDValue SrcOp0 = SVN->getOperand(0);
8646 SDValue SrcOp1 = SrcOp0;
8647 if (Idx0 >= SrcNumElts) {
8648 SrcOp0 = SVN->getOperand(1);
8649 Idx0 -= SrcNumElts;
8650 }
8651
8652 if (Idx1 >= SrcNumElts) {
8653 SrcOp1 = SVN->getOperand(1);
8654 Idx1 -= SrcNumElts;
8655 }
8656
8657 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8658 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8659
8660 // Extract nearest even aligned piece.
8661 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8662 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8663 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8664 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8665
8666 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8667 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8668
8669 SDValue Result0 = SubVec0;
8670 SDValue Result1 = SubVec0;
8671
8672 if (SubVec0 != SubVec1) {
8673 NewMaskIdx1 += NewSrcNumElts;
8674 Result1 = SubVec1;
8675 } else {
8676 Result1 = DAG.getPOISON(PackVT);
8677 }
8678
8679 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8680 {NewMaskIdx0, NewMaskIdx1});
8681 Pieces.push_back(Shuf);
8682 } else {
8683 const int Idx0 = SVN->getMaskElt(I);
8684 const int Idx1 = SVN->getMaskElt(I + 1);
8685 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8686 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8687 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8688 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8689
8690 SDValue Vec0 = SVN->getOperand(VecIdx0);
8691 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8692 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8693
8694 SDValue Vec1 = SVN->getOperand(VecIdx1);
8695 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8696 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8697 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8698 }
8699 }
8700
8701 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8702}
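The loop above classifies each pair of output elements; a compact sketch of that classification, mirroring elementPairIsContiguous / elementPairIsOddToEven with illustrative names:

#include <vector>

enum class PairKind { ConsecutiveExtract, OddToEvenShuffle, ScalarExtracts };

PairKind classifyPair(const std::vector<int> &Mask, int Elt) {
  if (Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0))
    return PairKind::ConsecutiveExtract; // even-aligned contiguous slice
  if (Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
      !(Mask[Elt + 1] & 1))
    return PairKind::OddToEvenShuffle; // tiny 2-element sub-shuffle
  return PairKind::ScalarExtracts;     // fall back to two scalar extracts
}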
8703
8704SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8705 SelectionDAG &DAG) const {
8706 SDValue SVal = Op.getOperand(0);
8707 EVT ResultVT = Op.getValueType();
8708 EVT SValVT = SVal.getValueType();
8709 SDValue UndefVal = DAG.getPOISON(SValVT);
8710 SDLoc SL(Op);
8711
8712  SmallVector<SDValue, 16> VElts;
8713  VElts.push_back(SVal);
8714 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8715 VElts.push_back(UndefVal);
8716
8717 return DAG.getBuildVector(ResultVT, SL, VElts);
8718}
8719
8720SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8721 SelectionDAG &DAG) const {
8722 SDLoc SL(Op);
8723 EVT VT = Op.getValueType();
8724
8725 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8726 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8727
8728 SDValue Lo = Op.getOperand(0);
8729 SDValue Hi = Op.getOperand(1);
8730
8731 // Avoid adding defined bits with the zero_extend.
8732 if (Hi.isUndef()) {
8733 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8734 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8735 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8736 }
8737
8738 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8739 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8740
8741 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8742 DAG.getConstant(16, SL, MVT::i32));
8743 if (Lo.isUndef())
8744 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8745
8746 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8747 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8748
8749 SDValue Or =
8750 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8751 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8752 }
8753
8754 // Split into 2-element chunks.
8755 const unsigned NumParts = VT.getVectorNumElements() / 2;
8756 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8757 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8758
8759  SmallVector<SDValue, 8> Casts;
8760  for (unsigned P = 0; P < NumParts; ++P) {
8761 SDValue Vec = DAG.getBuildVector(
8762 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8763 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8764 }
8765
8766 SDValue Blend =
8767 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8768 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8769}
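For the v2i16/v2f16 case above, the emitted zext/shl/or is the usual 16-bit packing; a trivial sketch:

#include <cstdint>

// Pack two 16-bit halves into one 32-bit register; the OR is disjoint
// because the two operands occupy different bit ranges.
uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}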
8770
8771bool SITargetLowering::isOffsetFoldingLegal(
8772    const GlobalAddressSDNode *GA) const {
8773 // OSes that use ELF REL relocations (instead of RELA) can only store a
8774 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8775 // which can create arbitrary 64-bit addends. (This is only a problem for
8776 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8777 // the high 32 bits of the addend.)
8778 //
8779 // This should be kept in sync with how HasRelocationAddend is initialized in
8780 // the constructor of ELFAMDGPUAsmBackend.
8781 if (!Subtarget->isAmdHsaOS())
8782 return false;
8783
8784 // We can fold offsets for anything that doesn't require a GOT relocation.
8785  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8786          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8787          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8788         !shouldEmitGOTReloc(GA->getGlobal());
8789}
8790
8791static SDValue
8792buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8793                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
8794 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8795 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8796 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8797 // lowered to the following code sequence:
8798 //
8799 // For constant address space:
8800 // s_getpc_b64 s[0:1]
8801 // s_add_u32 s0, s0, $symbol
8802 // s_addc_u32 s1, s1, 0
8803 //
8804 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8805 // a fixup or relocation is emitted to replace $symbol with a literal
8806 // constant, which is a pc-relative offset from the encoding of the $symbol
8807 // operand to the global variable.
8808 //
8809 // For global address space:
8810 // s_getpc_b64 s[0:1]
8811 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8812 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8813 //
8814 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8815 // fixups or relocations are emitted to replace $symbol@*@lo and
8816 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8817 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8818 // operand to the global variable.
8819 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8820 assert(GAFlags != SIInstrInfo::MO_NONE);
8821
8822 SDValue Ptr =
8823 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8824 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8825 }
8826
8827 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8828 SDValue PtrHi;
8829 if (GAFlags == SIInstrInfo::MO_NONE)
8830 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8831 else
8832 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8833 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8834}
8835
8836SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8837 SDValue Op,
8838 SelectionDAG &DAG) const {
8839 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8840 SDLoc DL(GSD);
8841 EVT PtrVT = Op.getValueType();
8842
8843 const GlobalValue *GV = GSD->getGlobal();
8844  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8845       shouldUseLDSConstAddress(GV)) ||
8846      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8847      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8848    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8849        GV->hasExternalLinkage()) {
8850 Type *Ty = GV->getValueType();
8851      // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
8852      // zero-sized type in other languages) to declare dynamic shared
8853      // memory whose size is not known at compile time. It will be
8854      // allocated by the runtime and placed directly after the statically
8855      // allocated objects. They all share the same offset.
8856 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8857 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8858 // Adjust alignment for that dynamic shared memory array.
8859        Function &F = DAG.getMachineFunction().getFunction();
8860        MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8861        MFI->setUsesDynamicLDS(true);
8862 return SDValue(
8863 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8864 }
8865 }
8866    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8867  }
8868
8869  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8870    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8871                                            SIInstrInfo::MO_ABS32_LO);
8872    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8873 }
8874
8875 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8876 if (Subtarget->has64BitLiterals()) {
8877      SDValue Addr = DAG.getTargetGlobalAddress(
8878          GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8879 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8880 0);
8881 }
8882
8883 SDValue AddrLo = DAG.getTargetGlobalAddress(
8884 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8885 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8886
8887 SDValue AddrHi = DAG.getTargetGlobalAddress(
8888 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8889 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8890
8891 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8892 }
8893
8894 if (shouldEmitFixup(GV))
8895 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8896
8897 if (shouldEmitPCReloc(GV))
8898    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8899                                   SIInstrInfo::MO_REL32);
8900
8901  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8902                                            SIInstrInfo::MO_GOTPCREL32);
8903  PointerType *PtrTy =
8904      PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8905  const DataLayout &DataLayout = DAG.getDataLayout();
8906  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8907  MachinePointerInfo PtrInfo =
8908      MachinePointerInfo::getGOT(DAG.getMachineFunction());
8909
8910  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8911                     MachineMemOperand::MODereferenceable |
8912                         MachineMemOperand::MOInvariant);
8913}
8914
8915SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8916                                   const SDLoc &DL, SDValue V) const {
8917 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8918 // the destination register.
8919 //
8920 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8921 // so we will end up with redundant moves to m0.
8922 //
8923 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8924
8925 // A Null SDValue creates a glue result.
8926 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8927 V, Chain);
8928 return SDValue(M0, 0);
8929}
8930
8931SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8932 MVT VT,
8933 unsigned Offset) const {
8934 SDLoc SL(Op);
8935 SDValue Param = lowerKernargMemParameter(
8936 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8937 // The local size values will have the hi 16-bits as zero.
8938 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8939 DAG.getValueType(VT));
8940}
8941
8942static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8943                                        EVT VT) {
8944  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8945      DAG.getMachineFunction().getFunction(),
8946      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8947 return DAG.getPOISON(VT);
8948}
8949
8950static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8951                                         EVT VT) {
8952  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8953      DAG.getMachineFunction().getFunction(),
8954      "intrinsic not supported on subtarget", DL.getDebugLoc()));
8955 return DAG.getPOISON(VT);
8956}
8957
8958static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8959                                    ArrayRef<SDValue> Elts) {
8960 assert(!Elts.empty());
8961 MVT Type;
8962 unsigned NumElts = Elts.size();
8963
8964 if (NumElts <= 12) {
8965 Type = MVT::getVectorVT(MVT::f32, NumElts);
8966 } else {
8967 assert(Elts.size() <= 16);
8968 Type = MVT::v16f32;
8969 NumElts = 16;
8970 }
8971
8972 SmallVector<SDValue, 16> VecElts(NumElts);
8973 for (unsigned i = 0; i < Elts.size(); ++i) {
8974 SDValue Elt = Elts[i];
8975 if (Elt.getValueType() != MVT::f32)
8976 Elt = DAG.getBitcast(MVT::f32, Elt);
8977 VecElts[i] = Elt;
8978 }
8979 for (unsigned i = Elts.size(); i < NumElts; ++i)
8980 VecElts[i] = DAG.getPOISON(MVT::f32);
8981
8982 if (NumElts == 1)
8983 return VecElts[0];
8984 return DAG.getBuildVector(Type, DL, VecElts);
8985}
8986
8987static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8988 SDValue Src, int ExtraElts) {
8989 EVT SrcVT = Src.getValueType();
8990
8991  SmallVector<SDValue, 8> Elts;
8992
8993 if (SrcVT.isVector())
8994 DAG.ExtractVectorElements(Src, Elts);
8995 else
8996 Elts.push_back(Src);
8997
8998 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8999 while (ExtraElts--)
9000 Elts.push_back(Undef);
9001
9002 return DAG.getBuildVector(CastVT, DL, Elts);
9003}
9004
9005// Re-construct the required return value for an image load intrinsic.
9006// This is more complicated due to the optional use of TexFailCtrl, which means
9007// the required return type is an aggregate.
9008static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9009                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9010 bool Unpacked, bool IsD16, int DMaskPop,
9011 int NumVDataDwords, bool IsAtomicPacked16Bit,
9012 const SDLoc &DL) {
9013 // Determine the required return type. This is the same regardless of
9014 // IsTexFail flag
9015 EVT ReqRetVT = ResultTypes[0];
9016 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9017 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9018 ? (ReqRetNumElts + 1) / 2
9019 : ReqRetNumElts;
9020
9021 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9022
9023 MVT DataDwordVT =
9024 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9025
9026 MVT MaskPopVT =
9027 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9028
9029 SDValue Data(Result, 0);
9030 SDValue TexFail;
9031
9032 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9033 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9034 if (MaskPopVT.isVector()) {
9035 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9036 SDValue(Result, 0), ZeroIdx);
9037 } else {
9038 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9039 SDValue(Result, 0), ZeroIdx);
9040 }
9041 }
9042
9043 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9044 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9045 NumDataDwords - MaskPopDwords);
9046
9047 if (IsD16)
9048 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9049
9050 EVT LegalReqRetVT = ReqRetVT;
9051 if (!ReqRetVT.isVector()) {
9052 if (!Data.getValueType().isInteger())
9053 Data = DAG.getNode(ISD::BITCAST, DL,
9054 Data.getValueType().changeTypeToInteger(), Data);
9055 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9056 } else {
9057 // We need to widen the return vector to a legal type
9058 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9059 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9060 LegalReqRetVT =
9061          EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9062                           ReqRetVT.getVectorNumElements() + 1);
9063 }
9064 }
9065 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9066
9067 if (IsTexFail) {
9068 TexFail =
9069 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9070 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9071
9072 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9073 }
9074
9075 if (Result->getNumValues() == 1)
9076 return Data;
9077
9078 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9079}
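A sketch of the dword accounting performed above and in lowerImage (an illustrative helper, not the backend's code): packed D16 results hold two 16-bit elements per dword, and an enabled TFE/LWE control appends one status dword.

unsigned requiredReturnDwords(unsigned ReqRetNumElts, bool IsD16,
                              bool Unpacked, bool HasTexFail) {
  unsigned DataDwords =
      (IsD16 && !Unpacked) ? (ReqRetNumElts + 1) / 2 : ReqRetNumElts;
  return DataDwords + (HasTexFail ? 1 : 0);
}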
9080
9081static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9082 SDValue *LWE, bool &IsTexFail) {
9083 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9084
9085 uint64_t Value = TexFailCtrlConst->getZExtValue();
9086 if (Value) {
9087 IsTexFail = true;
9088 }
9089
9090 SDLoc DL(TexFailCtrlConst);
9091 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9092 Value &= ~(uint64_t)0x1;
9093 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9094 Value &= ~(uint64_t)0x2;
9095
9096 return Value == 0;
9097}
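The texfailctrl decoding above boils down to two flag bits; a small self-contained sketch with illustrative names:

#include <cstdint>

struct TexFailBits {
  bool TFE;   // bit 0: texture fail enable
  bool LWE;   // bit 1: LOD warning enable
  bool Valid; // no other bits may be set
};

TexFailBits decodeTexFailCtrl(uint64_t Value) {
  return {(Value & 0x1) != 0, (Value & 0x2) != 0,
          (Value & ~uint64_t(0x3)) == 0};
}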
9098
9099static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9100                                      MVT PackVectorVT,
9101 SmallVectorImpl<SDValue> &PackedAddrs,
9102 unsigned DimIdx, unsigned EndIdx,
9103 unsigned NumGradients) {
9104 SDLoc DL(Op);
9105 for (unsigned I = DimIdx; I < EndIdx; I++) {
9106 SDValue Addr = Op.getOperand(I);
9107
9108 // Gradients are packed with undef for each coordinate.
9109 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9110 // 1D: undef,dx/dh; undef,dx/dv
9111 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9112 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9113 if (((I + 1) >= EndIdx) ||
9114 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9115 I == DimIdx + NumGradients - 1))) {
9116 if (Addr.getValueType() != MVT::i16)
9117 Addr = DAG.getBitcast(MVT::i16, Addr);
9118 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9119 } else {
9120 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9121 I++;
9122 }
9123 Addr = DAG.getBitcast(MVT::f32, Addr);
9124 PackedAddrs.push_back(Addr);
9125 }
9126}
9127
9128SDValue SITargetLowering::lowerImage(SDValue Op,
9129                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
9130                                     SelectionDAG &DAG, bool WithChain) const {
9131 SDLoc DL(Op);
9132 MachineFunction &MF = DAG.getMachineFunction();
9133 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9134  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9135      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9136 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9137 unsigned IntrOpcode = Intr->BaseOpcode;
9138 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9139 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9140 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9141
9142 SmallVector<EVT, 3> ResultTypes(Op->values());
9143 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9144 bool IsD16 = false;
9145 bool IsG16 = false;
9146 bool IsA16 = false;
9147 SDValue VData;
9148 int NumVDataDwords = 0;
9149 bool AdjustRetType = false;
9150 bool IsAtomicPacked16Bit = false;
9151
9152 // Offset of intrinsic arguments
9153 const unsigned ArgOffset = WithChain ? 2 : 1;
9154
9155 unsigned DMask;
9156 unsigned DMaskLanes = 0;
9157
9158 if (BaseOpcode->Atomic) {
9159 VData = Op.getOperand(2);
9160
9161 IsAtomicPacked16Bit =
9162 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9163 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9164
9165 bool Is64Bit = VData.getValueSizeInBits() == 64;
9166 if (BaseOpcode->AtomicX2) {
9167 SDValue VData2 = Op.getOperand(3);
9168 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9169 {VData, VData2});
9170 if (Is64Bit)
9171 VData = DAG.getBitcast(MVT::v4i32, VData);
9172
9173 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9174 DMask = Is64Bit ? 0xf : 0x3;
9175 NumVDataDwords = Is64Bit ? 4 : 2;
9176 } else {
9177 DMask = Is64Bit ? 0x3 : 0x1;
9178 NumVDataDwords = Is64Bit ? 2 : 1;
9179 }
9180 } else {
9181 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9182 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9183
9184 if (BaseOpcode->Store) {
9185 VData = Op.getOperand(2);
9186
9187 MVT StoreVT = VData.getSimpleValueType();
9188 if (StoreVT.getScalarType() == MVT::f16) {
9189 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9190 return Op; // D16 is unsupported for this instruction
9191
9192 IsD16 = true;
9193 VData = handleD16VData(VData, DAG, true);
9194 }
9195
9196 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9197 } else if (!BaseOpcode->NoReturn) {
9198 // Work out the num dwords based on the dmask popcount and underlying type
9199 // and whether packing is supported.
9200 MVT LoadVT = ResultTypes[0].getSimpleVT();
9201 if (LoadVT.getScalarType() == MVT::f16) {
9202 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9203 return Op; // D16 is unsupported for this instruction
9204
9205 IsD16 = true;
9206 }
9207
9208 // Confirm that the return type is large enough for the dmask specified
9209 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9210 (!LoadVT.isVector() && DMaskLanes > 1))
9211 return Op;
9212
9213 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9214 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9215 // instructions.
9216 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9217 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9218 NumVDataDwords = (DMaskLanes + 1) / 2;
9219 else
9220 NumVDataDwords = DMaskLanes;
9221
9222 AdjustRetType = true;
9223 }
9224 }
9225
9226 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9227  SmallVector<SDValue, 4> VAddrs;
9228
9229 // Check for 16 bit addresses or derivatives and pack if true.
9230 MVT VAddrVT =
9231 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9232 MVT VAddrScalarVT = VAddrVT.getScalarType();
9233 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9234 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9235
9236 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9237 VAddrScalarVT = VAddrVT.getScalarType();
9238 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9239 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9240
9241 // Push back extra arguments.
9242 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9243 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9244 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9245 // Special handling of bias when A16 is on. Bias is of type half but
9246 // occupies full 32-bit.
9247 SDValue Bias = DAG.getBuildVector(
9248 MVT::v2f16, DL,
9249 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9250 VAddrs.push_back(Bias);
9251 } else {
9252 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9253 "Bias needs to be converted to 16 bit in A16 mode");
9254 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9255 }
9256 }
9257
9258 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9259 // 16 bit gradients are supported, but are tied to the A16 control
9260 // so both gradients and addresses must be 16 bit
9261 LLVM_DEBUG(
9262 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9263 "require 16 bit args for both gradients and addresses");
9264 return Op;
9265 }
9266
9267 if (IsA16) {
9268 if (!ST->hasA16()) {
9269 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9270 "support 16 bit addresses\n");
9271 return Op;
9272 }
9273 }
9274
9275 // We've dealt with incorrect input so we know that if IsA16, IsG16
9276 // are set then we have to compress/pack operands (either address,
9277 // gradient or both)
9278 // In the case where a16 and gradients are tied (no G16 support) then we
9279 // have already verified that both IsA16 and IsG16 are true
9280 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9281 // Activate g16
9282    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9283        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9284 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9285 }
9286
9287 // Add gradients (packed or unpacked)
9288 if (IsG16) {
9289 // Pack the gradients
9290 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9291 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9292 ArgOffset + Intr->GradientStart,
9293 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9294 } else {
9295 for (unsigned I = ArgOffset + Intr->GradientStart;
9296 I < ArgOffset + Intr->CoordStart; I++)
9297 VAddrs.push_back(Op.getOperand(I));
9298 }
9299
9300 // Add addresses (packed or unpacked)
9301 if (IsA16) {
9302 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9303 ArgOffset + Intr->CoordStart, VAddrEnd,
9304 0 /* No gradients */);
9305 } else {
9306 // Add uncompressed address
9307 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9308 VAddrs.push_back(Op.getOperand(I));
9309 }
9310
9311 // If the register allocator cannot place the address registers contiguously
9312 // without introducing moves, then using the non-sequential address encoding
9313 // is always preferable, since it saves VALU instructions and is usually a
9314 // wash in terms of code size or even better.
9315 //
9316 // However, we currently have no way of hinting to the register allocator that
9317 // MIMG addresses should be placed contiguously when it is possible to do so,
9318 // so force non-NSA for the common 2-address case as a heuristic.
9319 //
9320 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9321 // allocation when possible.
9322 //
9323 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9324 // set of the remaining addresses.
9325 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9326 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9327 const bool UseNSA = ST->hasNSAEncoding() &&
9328 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9329 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9330 const bool UsePartialNSA =
9331 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9332
9333 SDValue VAddr;
9334 if (UsePartialNSA) {
9335 VAddr = getBuildDwordsVector(DAG, DL,
9336 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9337 } else if (!UseNSA) {
9338 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9339 }
9340
9341 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9342 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9343 SDValue Unorm;
9344 if (!BaseOpcode->Sampler) {
9345 Unorm = True;
9346 } else {
9347 uint64_t UnormConst =
9348 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9349
9350 Unorm = UnormConst ? True : False;
9351 }
9352
9353 SDValue TFE;
9354 SDValue LWE;
9355 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9356 bool IsTexFail = false;
9357 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9358 return Op;
9359
9360 if (IsTexFail) {
9361 if (!DMaskLanes) {
9362 // Expecting to get an error flag since TFC is on - and dmask is 0
9363 // Force dmask to be at least 1 otherwise the instruction will fail
9364 DMask = 0x1;
9365 DMaskLanes = 1;
9366 NumVDataDwords = 1;
9367 }
9368 NumVDataDwords += 1;
9369 AdjustRetType = true;
9370 }
9371
9372 // Has something earlier tagged that the return type needs adjusting
9373 // This happens if the instruction is a load or has set TexFailCtrl flags
9374 if (AdjustRetType) {
9375 // NumVDataDwords reflects the true number of dwords required in the return
9376 // type
9377 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9378 // This is a no-op load. This can be eliminated
9379 SDValue Undef = DAG.getPOISON(Op.getValueType());
9380 if (isa<MemSDNode>(Op))
9381 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9382 return Undef;
9383 }
9384
9385 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9386 MVT::i32, NumVDataDwords)
9387 : MVT::i32;
9388
9389 ResultTypes[0] = NewVT;
9390 if (ResultTypes.size() == 3) {
9391 // Original result was aggregate type used for TexFailCtrl results
9392 // The actual instruction returns as a vector type which has now been
9393 // created. Remove the aggregate result.
9394 ResultTypes.erase(&ResultTypes[1]);
9395 }
9396 }
9397
9398 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9399 if (BaseOpcode->Atomic)
9400 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9401  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9402               AMDGPU::CPol::VOLATILE))
9403    return Op;
9404
9405  SmallVector<SDValue, 26> Ops;
9406  if (BaseOpcode->Store || BaseOpcode->Atomic)
9407 Ops.push_back(VData); // vdata
9408 if (UsePartialNSA) {
9409 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9410 Ops.push_back(VAddr);
9411 } else if (UseNSA)
9412 append_range(Ops, VAddrs);
9413 else
9414 Ops.push_back(VAddr);
9415 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9416 EVT RsrcVT = Rsrc.getValueType();
9417 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9418 return Op;
9419 Ops.push_back(Rsrc);
9420 if (BaseOpcode->Sampler) {
9421 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9422 if (Samp.getValueType() != MVT::v4i32)
9423 return Op;
9424 Ops.push_back(Samp);
9425 }
9426 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9427 if (IsGFX10Plus)
9428 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9429 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9430 Ops.push_back(Unorm);
9431 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9432 Ops.push_back(IsA16 && // r128, a16 for gfx9
9433 ST->hasFeature(AMDGPU::FeatureR128A16)
9434 ? True
9435 : False);
9436 if (IsGFX10Plus)
9437 Ops.push_back(IsA16 ? True : False);
9438
9439 if (!Subtarget->hasGFX90AInsts())
9440 Ops.push_back(TFE); // tfe
9441 else if (TFE->getAsZExtVal()) {
9442    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9443        MF.getFunction(),
9444 "TFE is not supported on this GPU", DL.getDebugLoc()));
9445 }
9446
9447 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9448 Ops.push_back(LWE); // lwe
9449 if (!IsGFX10Plus)
9450 Ops.push_back(DimInfo->DA ? True : False);
9451 if (BaseOpcode->HasD16)
9452 Ops.push_back(IsD16 ? True : False);
9453 if (isa<MemSDNode>(Op))
9454 Ops.push_back(Op.getOperand(0)); // chain
9455
9456 int NumVAddrDwords =
9457 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9458 int Opcode = -1;
9459
9460 if (IsGFX12Plus) {
9461 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9462 NumVDataDwords, NumVAddrDwords);
9463 } else if (IsGFX11Plus) {
9464 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9465 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9466 : AMDGPU::MIMGEncGfx11Default,
9467 NumVDataDwords, NumVAddrDwords);
9468 } else if (IsGFX10Plus) {
9469 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9470 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9471 : AMDGPU::MIMGEncGfx10Default,
9472 NumVDataDwords, NumVAddrDwords);
9473 } else {
9474 if (Subtarget->hasGFX90AInsts()) {
9475 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9476 NumVDataDwords, NumVAddrDwords);
9477 if (Opcode == -1) {
9478        DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9479            MF.getFunction(),
9480 "requested image instruction is not supported on this GPU",
9481 DL.getDebugLoc()));
9482
9483 unsigned Idx = 0;
9484 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9485 for (EVT VT : OrigResultTypes) {
9486 if (VT == MVT::Other)
9487 RetValues[Idx++] = Op.getOperand(0); // Chain
9488 else
9489 RetValues[Idx++] = DAG.getPOISON(VT);
9490 }
9491
9492 return DAG.getMergeValues(RetValues, DL);
9493 }
9494 }
9495 if (Opcode == -1 &&
9496 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9497 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9498 NumVDataDwords, NumVAddrDwords);
9499 if (Opcode == -1)
9500 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9501 NumVDataDwords, NumVAddrDwords);
9502 }
9503 if (Opcode == -1)
9504 return Op;
9505
9506 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9507 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9508 MachineMemOperand *MemRef = MemOp->getMemOperand();
9509 DAG.setNodeMemRefs(NewNode, {MemRef});
9510 }
9511
9512 if (BaseOpcode->AtomicX2) {
9513    SmallVector<SDValue, 1> Elt;
9514    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9515 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9516 }
9517 if (BaseOpcode->NoReturn)
9518 return SDValue(NewNode, 0);
9519 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9520 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9521 NumVDataDwords, IsAtomicPacked16Bit, DL);
9522}
9523
9524SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9525 SDValue Offset, SDValue CachePolicy,
9526 SelectionDAG &DAG) const {
9527 MachineFunction &MF = DAG.getMachineFunction();
9528
9529 const DataLayout &DataLayout = DAG.getDataLayout();
9530 Align Alignment =
9531 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9532
9533 MachineMemOperand *MMO = MF.getMachineMemOperand(
9534 MachinePointerInfo(),
9535      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9536          MachineMemOperand::MOInvariant,
9537      VT.getStoreSize(), Alignment);
9538
9539 if (!Offset->isDivergent()) {
9540 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9541
9542 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9543 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9544 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9545 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9546 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9547 SDValue BufferLoad =
9548          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9549                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
9550 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9551 }
9552
9553 // Widen vec3 load to vec4.
9554 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9555 !Subtarget->hasScalarDwordx3Loads()) {
9556 EVT WidenedVT =
9557        EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9558    auto WidenedOp = DAG.getMemIntrinsicNode(
9559 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9560 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9561 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9562 DAG.getVectorIdxConstant(0, DL));
9563 return Subvector;
9564 }
9565
9566  return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9567                                 DAG.getVTList(VT), Ops, VT, MMO);
9568 }
9569
9570 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9571 // assume that the buffer is unswizzled.
9572 SDValue Ops[] = {
9573 DAG.getEntryNode(), // Chain
9574 Rsrc, // rsrc
9575 DAG.getConstant(0, DL, MVT::i32), // vindex
9576 {}, // voffset
9577 {}, // soffset
9578 {}, // offset
9579 CachePolicy, // cachepolicy
9580 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9581 };
9582 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9583 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9584 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9585 }
9586
9587  SmallVector<SDValue, 4> Loads;
9588  unsigned NumLoads = 1;
9589 MVT LoadVT = VT.getSimpleVT();
9590 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9591 assert((LoadVT.getScalarType() == MVT::i32 ||
9592 LoadVT.getScalarType() == MVT::f32));
9593
9594 if (NumElts == 8 || NumElts == 16) {
9595 NumLoads = NumElts / 4;
9596 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9597 }
9598
9599 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9600
9601 // Use the alignment to ensure that the required offsets will fit into the
9602 // immediate offsets.
9603 setBufferOffsets(Offset, DAG, &Ops[3],
9604 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9605
9606 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9607 for (unsigned i = 0; i < NumLoads; ++i) {
9608 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9609 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9610 LoadVT, MMO, DAG));
9611 }
9612
9613 if (NumElts == 8 || NumElts == 16)
9614 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9615
9616 return Loads[0];
9617}
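A sketch of the offset schedule used by the divergent-offset fallback above: wide results are split into 4-dword buffer loads whose immediate offsets step 16 bytes apart (illustrative helper only).

#include <cstdint>
#include <vector>

std::vector<uint64_t> pieceOffsets(uint64_t BaseOffset, unsigned NumElts) {
  const unsigned NumLoads = (NumElts == 8 || NumElts == 16) ? NumElts / 4 : 1;
  std::vector<uint64_t> Offsets;
  for (unsigned I = 0; I < NumLoads; ++I)
    Offsets.push_back(BaseOffset + 16 * I); // 4 dwords == 16 bytes per piece
  return Offsets;
}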
9618
9619SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9620 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9621 if (!Subtarget->hasArchitectedSGPRs())
9622 return {};
9623 SDLoc SL(Op);
9624 MVT VT = MVT::i32;
9625 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9626 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9627 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9628}
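The BFE_U32 above is a plain shift-and-mask; as a sketch:

#include <cstdint>

// With architected SGPRs the wave id within the workgroup is TTMP8[29:25],
// i.e. a 5-bit field starting at bit 25.
uint32_t waveIdInGroup(uint32_t Ttmp8) {
  return (Ttmp8 >> 25) & 0x1f;
}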
9629
9630SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9631 AMDGPU::Hwreg::Id HwReg,
9632 unsigned LowBit,
9633 unsigned Width) const {
9634 SDLoc SL(Op);
9635 using namespace AMDGPU::Hwreg;
9636 return {DAG.getMachineNode(
9637 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9638 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9639 SL, MVT::i32)),
9640 0};
9641}
9642
9643SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9644 unsigned Dim,
9645 const ArgDescriptor &Arg) const {
9646 SDLoc SL(Op);
9647 MachineFunction &MF = DAG.getMachineFunction();
9648 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9649 if (MaxID == 0)
9650 return DAG.getConstant(0, SL, MVT::i32);
9651
9652 // It's undefined behavior if a function marked with the amdgpu-no-*
9653 // attributes uses the corresponding intrinsic.
9654 if (!Arg)
9655 return DAG.getPOISON(Op->getValueType(0));
9656
9657 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9658 SDLoc(DAG.getEntryNode()), Arg);
9659
9660 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9661 // masking operations anyway.
9662 //
9663 // TODO: We could assert the top bit is 0 for the source copy.
9664 if (Arg.isMasked())
9665 return Val;
9666
9667 // Preserve the known bits after expansion to a copy.
9668 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9669 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9670 DAG.getValueType(SmallVT));
9671}
9672
9673SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9674 SelectionDAG &DAG) const {
9675 MachineFunction &MF = DAG.getMachineFunction();
9676 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9677
9678 EVT VT = Op.getValueType();
9679 SDLoc DL(Op);
9680 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9681
9682 // TODO: Should this propagate fast-math-flags?
9683
9684 switch (IntrinsicID) {
9685 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9686 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9687 return emitNonHSAIntrinsicError(DAG, DL, VT);
9688    return getPreloadedValue(DAG, *MFI, VT,
9689                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9690 }
9691 case Intrinsic::amdgcn_dispatch_ptr:
9692 case Intrinsic::amdgcn_queue_ptr: {
9693 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9694 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9695 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9696 DL.getDebugLoc()));
9697 return DAG.getPOISON(VT);
9698 }
9699
9700    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9701                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9702                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
9703 return getPreloadedValue(DAG, *MFI, VT, RegID);
9704 }
9705 case Intrinsic::amdgcn_implicitarg_ptr: {
9706 if (MFI->isEntryFunction())
9707 return getImplicitArgPtr(DAG, DL);
9708    return getPreloadedValue(DAG, *MFI, VT,
9709                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9710 }
9711 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9712    if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9713      // This only makes sense to call in a kernel, so just lower to null.
9714 return DAG.getConstant(0, DL, VT);
9715 }
9716
9717    return getPreloadedValue(DAG, *MFI, VT,
9718                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9719 }
9720 case Intrinsic::amdgcn_dispatch_id: {
9721 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9722 }
9723 case Intrinsic::amdgcn_rcp:
9724 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9725 case Intrinsic::amdgcn_rsq:
9726 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9727 case Intrinsic::amdgcn_rsq_legacy:
9728 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9729 return emitRemovedIntrinsicError(DAG, DL, VT);
9730 return SDValue();
9731 case Intrinsic::amdgcn_rcp_legacy:
9732 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9733 return emitRemovedIntrinsicError(DAG, DL, VT);
9734 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9735 case Intrinsic::amdgcn_rsq_clamp: {
9736 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9737 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9738
9739 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9740 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9741 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9742
9743 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9744 SDValue Tmp =
9745 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9746 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9747 DAG.getConstantFP(Min, DL, VT));
9748 }
9749 case Intrinsic::r600_read_ngroups_x:
9750 if (Subtarget->isAmdHsaOS())
9751 return emitNonHSAIntrinsicError(DAG, DL, VT);
9752
9753    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9754                                    SI::KernelInputOffsets::NGROUPS_X, Align(4),
9755                                    false);
9756 case Intrinsic::r600_read_ngroups_y:
9757 if (Subtarget->isAmdHsaOS())
9758 return emitNonHSAIntrinsicError(DAG, DL, VT);
9759
9760    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9761                                    SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9762                                    false);
9763 case Intrinsic::r600_read_ngroups_z:
9764 if (Subtarget->isAmdHsaOS())
9765 return emitNonHSAIntrinsicError(DAG, DL, VT);
9766
9767    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9768                                    SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9769                                    false);
9770 case Intrinsic::r600_read_local_size_x:
9771 if (Subtarget->isAmdHsaOS())
9772 return emitNonHSAIntrinsicError(DAG, DL, VT);
9773
9774    return lowerImplicitZextParam(DAG, Op, MVT::i16,
9775                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
9776 case Intrinsic::r600_read_local_size_y:
9777 if (Subtarget->isAmdHsaOS())
9778 return emitNonHSAIntrinsicError(DAG, DL, VT);
9779
9780    return lowerImplicitZextParam(DAG, Op, MVT::i16,
9781                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
9782 case Intrinsic::r600_read_local_size_z:
9783 if (Subtarget->isAmdHsaOS())
9784 return emitNonHSAIntrinsicError(DAG, DL, VT);
9785
9786    return lowerImplicitZextParam(DAG, Op, MVT::i16,
9787                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
9788 case Intrinsic::amdgcn_workgroup_id_x:
9789 return lowerWorkGroupId(DAG, *MFI, VT,
9793 case Intrinsic::amdgcn_workgroup_id_y:
9794 return lowerWorkGroupId(DAG, *MFI, VT,
9798 case Intrinsic::amdgcn_workgroup_id_z:
9799 return lowerWorkGroupId(DAG, *MFI, VT,
9803 case Intrinsic::amdgcn_cluster_id_x:
9804 return Subtarget->hasClusters()
9805 ? getPreloadedValue(DAG, *MFI, VT,
9807 : DAG.getPOISON(VT);
9808 case Intrinsic::amdgcn_cluster_id_y:
9809 return Subtarget->hasClusters()
9810 ? getPreloadedValue(DAG, *MFI, VT,
9812 : DAG.getPOISON(VT);
9813 case Intrinsic::amdgcn_cluster_id_z:
9814 return Subtarget->hasClusters()
9815 ? getPreloadedValue(DAG, *MFI, VT,
9817 : DAG.getPOISON(VT);
9818 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9819 return Subtarget->hasClusters()
9820 ? getPreloadedValue(
9821 DAG, *MFI, VT,
9823 : DAG.getPOISON(VT);
9824 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(
9827 DAG, *MFI, VT,
9829 : DAG.getPOISON(VT);
9830 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9831 return Subtarget->hasClusters()
9832 ? getPreloadedValue(
9833 DAG, *MFI, VT,
9834 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
9835 : DAG.getPOISON(VT);
9836 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9837 return Subtarget->hasClusters()
9838 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9839 : SDValue();
9840 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9841 return Subtarget->hasClusters()
9842 ? getPreloadedValue(
9843 DAG, *MFI, VT,
9844 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
9845 : DAG.getPOISON(VT);
9846 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(
9849 DAG, *MFI, VT,
9850 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
9851 : DAG.getPOISON(VT);
9852 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9853 return Subtarget->hasClusters()
9854 ? getPreloadedValue(
9855 DAG, *MFI, VT,
9856 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
9857 : DAG.getPOISON(VT);
9858 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9859 return Subtarget->hasClusters()
9860 ? getPreloadedValue(
9861 DAG, *MFI, VT,
9862 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
9863 : DAG.getPOISON(VT);
9864 case Intrinsic::amdgcn_wave_id:
9865 return lowerWaveID(DAG, Op);
9866 case Intrinsic::amdgcn_lds_kernel_id: {
9867 if (MFI->isEntryFunction())
9868 return getLDSKernelId(DAG, DL);
9869 return getPreloadedValue(DAG, *MFI, VT,
9870 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9871 }
9872 case Intrinsic::amdgcn_workitem_id_x:
9873 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9874 case Intrinsic::amdgcn_workitem_id_y:
9875 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9876 case Intrinsic::amdgcn_workitem_id_z:
9877 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9878 case Intrinsic::amdgcn_wavefrontsize:
9879 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9880 SDLoc(Op), MVT::i32);
9881 case Intrinsic::amdgcn_s_buffer_load: {
9882 unsigned CPol = Op.getConstantOperandVal(3);
9883 // s_buffer_load, because of how it's optimized, can't be volatile
9884 // so reject ones with the volatile bit set.
9885 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9886 ? AMDGPU::CPol::ALL
9887 : AMDGPU::CPol::ALL_pregfx12))
9888 return Op;
9889 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9890 Op.getOperand(3), DAG);
9891 }
9892 case Intrinsic::amdgcn_fdiv_fast:
9893 return lowerFDIV_FAST(Op, DAG);
9894 case Intrinsic::amdgcn_sin:
9895 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9896
9897 case Intrinsic::amdgcn_cos:
9898 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9899
9900 case Intrinsic::amdgcn_mul_u24:
9901 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9902 Op.getOperand(2));
9903 case Intrinsic::amdgcn_mul_i24:
9904 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9905 Op.getOperand(2));
9906
9907 case Intrinsic::amdgcn_log_clamp: {
9908 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9909 return SDValue();
9910
9911 return emitRemovedIntrinsicError(DAG, DL, VT);
9912 }
9913 case Intrinsic::amdgcn_fract:
9914 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9915
9916 case Intrinsic::amdgcn_class:
9917 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9918 Op.getOperand(2));
9919 case Intrinsic::amdgcn_div_fmas:
9920 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9921 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9922
9923 case Intrinsic::amdgcn_div_fixup:
9924 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9925 Op.getOperand(2), Op.getOperand(3));
9926
9927 case Intrinsic::amdgcn_div_scale: {
9928 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9929
9930 // Translate to the operands expected by the machine instruction. The
9931 // first source operand must match one of the other source operands.
9932 SDValue Numerator = Op.getOperand(1);
9933 SDValue Denominator = Op.getOperand(2);
9934
9935 // Note this order is opposite of the machine instruction's operations,
9936 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9937 // intrinsic has the numerator as the first operand to match a normal
9938 // division operation.
9939
9940 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9941
9942 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9943 Denominator, Numerator);
9944 }
9945 case Intrinsic::amdgcn_icmp: {
9946 // There is a Pat that handles this variant, so return it as-is.
9947 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9948 Op.getConstantOperandVal(2) == 0 &&
9949 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9950 return Op;
9951 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9952 }
9953 case Intrinsic::amdgcn_fcmp: {
9954 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9955 }
9956 case Intrinsic::amdgcn_ballot:
9957 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9958 case Intrinsic::amdgcn_fmed3:
9959 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9960 Op.getOperand(2), Op.getOperand(3));
9961 case Intrinsic::amdgcn_fdot2:
9962 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9963 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9964 case Intrinsic::amdgcn_fmul_legacy:
9965 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9966 Op.getOperand(2));
9967 case Intrinsic::amdgcn_sffbh:
9968 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9969 case Intrinsic::amdgcn_sbfe:
9970 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9971 Op.getOperand(2), Op.getOperand(3));
9972 case Intrinsic::amdgcn_ubfe:
9973 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9974 Op.getOperand(2), Op.getOperand(3));
9975 case Intrinsic::amdgcn_cvt_pkrtz:
9976 case Intrinsic::amdgcn_cvt_pknorm_i16:
9977 case Intrinsic::amdgcn_cvt_pknorm_u16:
9978 case Intrinsic::amdgcn_cvt_pk_i16:
9979 case Intrinsic::amdgcn_cvt_pk_u16: {
9980 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9981 EVT VT = Op.getValueType();
9982 unsigned Opcode;
9983
9984 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9985 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9986 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9987 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9988 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9989 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9990 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9991 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9992 else
9993 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9994
9995 if (isTypeLegal(VT))
9996 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9997
9998 SDValue Node =
9999 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10000 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10001 }
10002 case Intrinsic::amdgcn_fmad_ftz:
10003 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10004 Op.getOperand(2), Op.getOperand(3));
10005
10006 case Intrinsic::amdgcn_if_break:
10007 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10008 Op->getOperand(1), Op->getOperand(2)),
10009 0);
10010
10011 case Intrinsic::amdgcn_groupstaticsize: {
10012 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10013 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10014 return Op;
10015
10016 const Module *M = MF.getFunction().getParent();
10017 const GlobalValue *GV =
10018 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10019 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10020 SIInstrInfo::MO_ABS32_LO);
10021 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10022 }
10023 case Intrinsic::amdgcn_is_shared:
10024 case Intrinsic::amdgcn_is_private: {
10025 SDLoc SL(Op);
10026 SDValue SrcVec =
10027 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10028 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10029 DAG.getConstant(1, SL, MVT::i32));
10030
10031 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10032 ? AMDGPUAS::LOCAL_ADDRESS
10033 : AMDGPUAS::PRIVATE_ADDRESS;
10034 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10035 Subtarget->hasGloballyAddressableScratch()) {
10036 SDValue FlatScratchBaseHi(
10037 DAG.getMachineNode(
10038 AMDGPU::S_MOV_B32, DL, MVT::i32,
10039 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10040 0);
10041 // Test bits 63..58 against the aperture address.
10042 return DAG.getSetCC(
10043 SL, MVT::i1,
10044 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10045 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10046 }
10047
10048 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10049 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10050 }
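  // For example, amdgcn.is.shared on a flat pointer compares the high 32 bits
  // of the address (SrcHi) for equality with the LDS aperture base, while
  // amdgcn.is.private checks the scratch aperture; with globally addressable
  // scratch the private check instead tests bits 63..58 against
  // SRC_FLAT_SCRATCH_BASE_HI using the XOR/SETULT sequence above.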
10051 case Intrinsic::amdgcn_perm:
10052 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10053 Op.getOperand(2), Op.getOperand(3));
10054 case Intrinsic::amdgcn_reloc_constant: {
10055 Module *M = MF.getFunction().getParent();
10056 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10057 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10058 auto *RelocSymbol = cast<GlobalVariable>(
10059 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10060 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10061 SIInstrInfo::MO_ABS32_LO);
10062 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10063 }
10064 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10065 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10066 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10067 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10068 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10069 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10070 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10071 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10072 if (Op.getOperand(4).getValueType() == MVT::i32)
10073 return SDValue();
10074
10075 SDLoc SL(Op);
10076 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10077 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10078 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10079 Op.getOperand(3), IndexKeyi32);
10080 }
10081 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10082 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10083 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10085 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10086 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10087 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10088 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10089 if (Op.getOperand(4).getValueType() == MVT::i64)
10090 return SDValue();
10091
10092 SDLoc SL(Op);
10093 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10094 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10095 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10096 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10097 Op.getOperand(6)});
10098 }
10099 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10100 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10101 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10102 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10103 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10104 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10105 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10106 ? MVT::i64
10107 : MVT::i32;
10108 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10109 return SDValue();
10110
10111 SDLoc SL(Op);
10112 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10113 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10114 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10115 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10116 IndexKey, Op.getOperand(7),
10117 Op.getOperand(8)}); // No clamp operand
10118 }
10119 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10120 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10121 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10122 if (Op.getOperand(6).getValueType() == MVT::i32)
10123 return SDValue();
10124
10125 SDLoc SL(Op);
10126 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10127 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10128 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10129 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10130 IndexKeyi32, Op.getOperand(7)});
10131 }
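  // The swmmac cases above only canonicalize the index-key operand,
  // any-extending or truncating it to the expected i32 or i64 width and
  // re-emitting the same INTRINSIC_WO_CHAIN with the remaining operands
  // unchanged, so instruction selection only ever sees one index-key type.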
10132 case Intrinsic::amdgcn_addrspacecast_nonnull:
10133 return lowerADDRSPACECAST(Op, DAG);
10134 case Intrinsic::amdgcn_readlane:
10135 case Intrinsic::amdgcn_readfirstlane:
10136 case Intrinsic::amdgcn_writelane:
10137 case Intrinsic::amdgcn_permlane16:
10138 case Intrinsic::amdgcn_permlanex16:
10139 case Intrinsic::amdgcn_permlane64:
10140 case Intrinsic::amdgcn_set_inactive:
10141 case Intrinsic::amdgcn_set_inactive_chain_arg:
10142 case Intrinsic::amdgcn_mov_dpp8:
10143 case Intrinsic::amdgcn_update_dpp:
10144 return lowerLaneOp(*this, Op.getNode(), DAG);
10145 case Intrinsic::amdgcn_dead: {
10146 SmallVector<SDValue, 8> Poisons;
10147 for (const EVT ValTy : Op.getNode()->values())
10148 Poisons.push_back(DAG.getPOISON(ValTy));
10149 return DAG.getMergeValues(Poisons, SDLoc(Op));
10150 }
10151 default:
10152 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10153 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10154 return lowerImage(Op, ImageDimIntr, DAG, false);
10155
10156 return Op;
10157 }
10158}
10159
10160// On targets not supporting constant in soffset field, turn zero to
10161// SGPR_NULL to avoid generating an extra s_mov with zero.
10162 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10163 const GCNSubtarget *Subtarget) {
10164 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10165 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10166 return SOffset;
10167}
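// For example, a raw buffer intrinsic whose soffset argument is the constant
// zero gets SGPR_NULL as its soffset operand on subtargets with restricted
// soffset support, so no extra "s_mov_b32 sN, 0" has to be materialized.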
10168
10169SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10170 SelectionDAG &DAG,
10171 unsigned NewOpcode) const {
10172 SDLoc DL(Op);
10173
10174 SDValue VData = Op.getOperand(2);
10175 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10176 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10177 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10178 SDValue Ops[] = {
10179 Op.getOperand(0), // Chain
10180 VData, // vdata
10181 Rsrc, // rsrc
10182 DAG.getConstant(0, DL, MVT::i32), // vindex
10183 VOffset, // voffset
10184 SOffset, // soffset
10185 Offset, // offset
10186 Op.getOperand(6), // cachepolicy
10187 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10188 };
10189
10190 auto *M = cast<MemSDNode>(Op);
10191
10192 EVT MemVT = VData.getValueType();
10193 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10194 M->getMemOperand());
10195}
10196
10197SDValue
10198SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10199 unsigned NewOpcode) const {
10200 SDLoc DL(Op);
10201
10202 SDValue VData = Op.getOperand(2);
10203 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10204 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10205 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10206 SDValue Ops[] = {
10207 Op.getOperand(0), // Chain
10208 VData, // vdata
10209 Rsrc, // rsrc
10210 Op.getOperand(4), // vindex
10211 VOffset, // voffset
10212 SOffset, // soffset
10213 Offset, // offset
10214 Op.getOperand(7), // cachepolicy
10215 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10216 };
10217
10218 auto *M = cast<MemSDNode>(Op);
10219
10220 EVT MemVT = VData.getValueType();
10221 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10222 M->getMemOperand());
10223}
10224
10225SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10226 SelectionDAG &DAG) const {
10227 unsigned IntrID = Op.getConstantOperandVal(1);
10228 SDLoc DL(Op);
10229
10230 switch (IntrID) {
10231 case Intrinsic::amdgcn_ds_ordered_add:
10232 case Intrinsic::amdgcn_ds_ordered_swap: {
10233 MemSDNode *M = cast<MemSDNode>(Op);
10234 SDValue Chain = M->getOperand(0);
10235 SDValue M0 = M->getOperand(2);
10236 SDValue Value = M->getOperand(3);
10237 unsigned IndexOperand = M->getConstantOperandVal(7);
10238 unsigned WaveRelease = M->getConstantOperandVal(8);
10239 unsigned WaveDone = M->getConstantOperandVal(9);
10240
10241 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10242 IndexOperand &= ~0x3f;
10243 unsigned CountDw = 0;
10244
10245 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10246 CountDw = (IndexOperand >> 24) & 0xf;
10247 IndexOperand &= ~(0xf << 24);
10248
10249 if (CountDw < 1 || CountDw > 4) {
10250 const Function &Fn = DAG.getMachineFunction().getFunction();
10251 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10252 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10253 DL.getDebugLoc()));
10254 CountDw = 1;
10255 }
10256 }
10257
10258 if (IndexOperand) {
10259 const Function &Fn = DAG.getMachineFunction().getFunction();
10260 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10261 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10262 }
10263
10264 if (WaveDone && !WaveRelease) {
10265 // TODO: Move this to IR verifier
10266 const Function &Fn = DAG.getMachineFunction().getFunction();
10267 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10268 Fn, "ds_ordered_count: wave_done requires wave_release",
10269 DL.getDebugLoc()));
10270 }
10271
10272 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10273 unsigned ShaderType =
10274 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10275 unsigned Offset0 = OrderedCountIndex << 2;
10276 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10277
10278 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10279 Offset1 |= (CountDw - 1) << 6;
10280
10281 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10282 Offset1 |= ShaderType << 2;
10283
10284 unsigned Offset = Offset0 | (Offset1 << 8);
10285
10286 SDValue Ops[] = {
10287 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10288 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10289 };
10290 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10291 M->getVTList(), Ops, M->getMemoryVT(),
10292 M->getMemOperand());
10293 }
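  // The 16-bit immediate built above packs the ds_ordered_count fields as:
  //   Offset[7:0]   = OrderedCountIndex << 2
  //   Offset[8]     = WaveRelease
  //   Offset[9]     = WaveDone
  //   Offset[11:10] = ShaderType          (pre-GFX11 only)
  //   Offset[12]    = Instruction         (0 = add, 1 = swap)
  //   Offset[15:14] = CountDw - 1         (GFX10 and newer)
  // e.g. an ordered add with index 3, wave_release set and one dword on GFX11
  // yields Offset = (3 << 2) | (1 << 8) = 0x10c.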
10294 case Intrinsic::amdgcn_raw_buffer_load:
10295 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10296 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10297 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10298 case Intrinsic::amdgcn_raw_buffer_load_format:
10299 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10300 const bool IsFormat =
10301 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10302 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10303
10304 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10305 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10306 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10307 SDValue Ops[] = {
10308 Op.getOperand(0), // Chain
10309 Rsrc, // rsrc
10310 DAG.getConstant(0, DL, MVT::i32), // vindex
10311 VOffset, // voffset
10312 SOffset, // soffset
10313 Offset, // offset
10314 Op.getOperand(5), // cachepolicy, swizzled buffer
10315 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10316 };
10317
10318 auto *M = cast<MemSDNode>(Op);
10319 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10320 }
10321 case Intrinsic::amdgcn_struct_buffer_load:
10322 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10323 case Intrinsic::amdgcn_struct_buffer_load_format:
10324 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10325 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10326 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10327 const bool IsFormat =
10328 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10329 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10330
10331 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10332 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10333 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10334 SDValue Ops[] = {
10335 Op.getOperand(0), // Chain
10336 Rsrc, // rsrc
10337 Op.getOperand(3), // vindex
10338 VOffset, // voffset
10339 SOffset, // soffset
10340 Offset, // offset
10341 Op.getOperand(6), // cachepolicy, swizzled buffer
10342 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10343 };
10344
10345 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10346 }
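  // The raw and struct buffer-load cases above differ only in indexing: the
  // raw forms pass a zero vindex with idxen = 0, while the struct forms
  // forward the intrinsic's vindex operand and set idxen = 1; both are then
  // expanded through lowerIntrinsicLoad.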
10347 case Intrinsic::amdgcn_raw_tbuffer_load:
10348 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10349 MemSDNode *M = cast<MemSDNode>(Op);
10350 EVT LoadVT = Op.getValueType();
10351 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10352 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10353 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10354
10355 SDValue Ops[] = {
10356 Op.getOperand(0), // Chain
10357 Rsrc, // rsrc
10358 DAG.getConstant(0, DL, MVT::i32), // vindex
10359 VOffset, // voffset
10360 SOffset, // soffset
10361 Offset, // offset
10362 Op.getOperand(5), // format
10363 Op.getOperand(6), // cachepolicy, swizzled buffer
10364 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10365 };
10366
10367 if (LoadVT.getScalarType() == MVT::f16)
10368 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10369 Ops);
10370 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10371 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10372 DAG);
10373 }
10374 case Intrinsic::amdgcn_struct_tbuffer_load:
10375 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10376 MemSDNode *M = cast<MemSDNode>(Op);
10377 EVT LoadVT = Op.getValueType();
10378 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10379 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10380 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10381
10382 SDValue Ops[] = {
10383 Op.getOperand(0), // Chain
10384 Rsrc, // rsrc
10385 Op.getOperand(3), // vindex
10386 VOffset, // voffset
10387 SOffset, // soffset
10388 Offset, // offset
10389 Op.getOperand(6), // format
10390 Op.getOperand(7), // cachepolicy, swizzled buffer
10391 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10392 };
10393
10394 if (LoadVT.getScalarType() == MVT::f16)
10395 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10396 Ops);
10397 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10398 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10399 DAG);
10400 }
10401 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10402 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10403 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10404 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10405 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10406 return lowerStructBufferAtomicIntrin(Op, DAG,
10407 AMDGPUISD::BUFFER_ATOMIC_FADD);
10408 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10410 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10411 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10412 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10413 return lowerStructBufferAtomicIntrin(Op, DAG,
10414 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10415 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10416 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10417 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10418 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10419 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10420 return lowerStructBufferAtomicIntrin(Op, DAG,
10421 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10422 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10423 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10424 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10425 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10426 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10427 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10428 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10430 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10431 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10433 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10434 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10436 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10437 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10438 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10439 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10440 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10442 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10443 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10445 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10446 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10448 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10449 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10452 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10455 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10458 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10459 return lowerRawBufferAtomicIntrin(Op, DAG,
10460 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10461 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10462 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10463 return lowerStructBufferAtomicIntrin(Op, DAG,
10464 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10465 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10466 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10467 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10468 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10470 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10471 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10473 return lowerStructBufferAtomicIntrin(Op, DAG,
10474 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10475 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10476 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10477 return lowerStructBufferAtomicIntrin(Op, DAG,
10478 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10479 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10480 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10481 return lowerStructBufferAtomicIntrin(Op, DAG,
10482 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10483 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10485 return lowerStructBufferAtomicIntrin(Op, DAG,
10486 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10487 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10488 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10489 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10490 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10492 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10493 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10495 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10496 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10497 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10498 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10499 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10501 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10502 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10503 return lowerStructBufferAtomicIntrin(Op, DAG,
10504 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10505
10506 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10507 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10508 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10509 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10510 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10511 SDValue Ops[] = {
10512 Op.getOperand(0), // Chain
10513 Op.getOperand(2), // src
10514 Op.getOperand(3), // cmp
10515 Rsrc, // rsrc
10516 DAG.getConstant(0, DL, MVT::i32), // vindex
10517 VOffset, // voffset
10518 SOffset, // soffset
10519 Offset, // offset
10520 Op.getOperand(7), // cachepolicy
10521 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10522 };
10523 EVT VT = Op.getValueType();
10524 auto *M = cast<MemSDNode>(Op);
10525
10526 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10527 Op->getVTList(), Ops, VT,
10528 M->getMemOperand());
10529 }
10530 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10531 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10532 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10533 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10534 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10535 SDValue Ops[] = {
10536 Op.getOperand(0), // Chain
10537 Op.getOperand(2), // src
10538 Op.getOperand(3), // cmp
10539 Rsrc, // rsrc
10540 Op.getOperand(5), // vindex
10541 VOffset, // voffset
10542 SOffset, // soffset
10543 Offset, // offset
10544 Op.getOperand(8), // cachepolicy
10545 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10546 };
10547 EVT VT = Op.getValueType();
10548 auto *M = cast<MemSDNode>(Op);
10549
10550 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10551 Op->getVTList(), Ops, VT,
10552 M->getMemOperand());
10553 }
10554 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10555 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10556 MemSDNode *M = cast<MemSDNode>(Op);
10557 SDValue NodePtr = M->getOperand(2);
10558 SDValue RayExtent = M->getOperand(3);
10559 SDValue InstanceMask = M->getOperand(4);
10560 SDValue RayOrigin = M->getOperand(5);
10561 SDValue RayDir = M->getOperand(6);
10562 SDValue Offsets = M->getOperand(7);
10563 SDValue TDescr = M->getOperand(8);
10564
10565 assert(NodePtr.getValueType() == MVT::i64);
10566 assert(RayDir.getValueType() == MVT::v3f32);
10567
10568 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10569 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10570 return SDValue();
10571 }
10572
10573 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10574 const unsigned NumVDataDwords = 10;
10575 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10576 int Opcode = AMDGPU::getMIMGOpcode(
10577 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10578 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10579 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10580 assert(Opcode != -1);
10581
10582 SmallVector<SDValue, 16> Ops;
10583 Ops.push_back(NodePtr);
10584 Ops.push_back(DAG.getBuildVector(
10585 MVT::v2i32, DL,
10586 {DAG.getBitcast(MVT::i32, RayExtent),
10587 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10588 Ops.push_back(RayOrigin);
10589 Ops.push_back(RayDir);
10590 Ops.push_back(Offsets);
10591 Ops.push_back(TDescr);
10592 Ops.push_back(M->getChain());
10593
10594 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10595 MachineMemOperand *MemRef = M->getMemOperand();
10596 DAG.setNodeMemRefs(NewNode, {MemRef});
10597 return SDValue(NewNode, 0);
10598 }
10599 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10600 MemSDNode *M = cast<MemSDNode>(Op);
10601 SDValue NodePtr = M->getOperand(2);
10602 SDValue RayExtent = M->getOperand(3);
10603 SDValue RayOrigin = M->getOperand(4);
10604 SDValue RayDir = M->getOperand(5);
10605 SDValue RayInvDir = M->getOperand(6);
10606 SDValue TDescr = M->getOperand(7);
10607
10608 assert(NodePtr.getValueType() == MVT::i32 ||
10609 NodePtr.getValueType() == MVT::i64);
10610 assert(RayDir.getValueType() == MVT::v3f16 ||
10611 RayDir.getValueType() == MVT::v3f32);
10612
10613 if (!Subtarget->hasGFX10_AEncoding()) {
10614 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10615 return SDValue();
10616 }
10617
10618 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10619 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10620 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10621 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10622 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10623 const unsigned NumVDataDwords = 4;
10624 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10625 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10626 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10627 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10628 IsGFX12Plus;
10629 const unsigned BaseOpcodes[2][2] = {
10630 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10631 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10632 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10633 int Opcode;
10634 if (UseNSA) {
10635 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10636 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10637 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10638 : AMDGPU::MIMGEncGfx10NSA,
10639 NumVDataDwords, NumVAddrDwords);
10640 } else {
10641 assert(!IsGFX12Plus);
10642 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10643 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10644 : AMDGPU::MIMGEncGfx10Default,
10645 NumVDataDwords, NumVAddrDwords);
10646 }
10647 assert(Opcode != -1);
10648
10649 SmallVector<SDValue, 16> Ops;
10650
10651 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10652 SmallVector<SDValue, 3> Lanes;
10653 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10654 if (Lanes[0].getValueSizeInBits() == 32) {
10655 for (unsigned I = 0; I < 3; ++I)
10656 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10657 } else {
10658 if (IsAligned) {
10659 Ops.push_back(DAG.getBitcast(
10660 MVT::i32,
10661 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10662 Ops.push_back(Lanes[2]);
10663 } else {
10664 SDValue Elt0 = Ops.pop_back_val();
10665 Ops.push_back(DAG.getBitcast(
10666 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10667 Ops.push_back(DAG.getBitcast(
10668 MVT::i32,
10669 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10670 }
10671 }
10672 };
10673
10674 if (UseNSA && IsGFX11Plus) {
10675 Ops.push_back(NodePtr);
10676 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10677 Ops.push_back(RayOrigin);
10678 if (IsA16) {
10679 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10680 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10681 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10682 for (unsigned I = 0; I < 3; ++I) {
10683 MergedLanes.push_back(DAG.getBitcast(
10684 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10685 {DirLanes[I], InvDirLanes[I]})));
10686 }
10687 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10688 } else {
10689 Ops.push_back(RayDir);
10690 Ops.push_back(RayInvDir);
10691 }
10692 } else {
10693 if (Is64)
10694 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10695 2);
10696 else
10697 Ops.push_back(NodePtr);
10698
10699 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10700 packLanes(RayOrigin, true);
10701 packLanes(RayDir, true);
10702 packLanes(RayInvDir, false);
10703 }
10704
10705 if (!UseNSA) {
10706 // Build a single vector containing all the operands so far prepared.
10707 if (NumVAddrDwords > 12) {
10708 SDValue Undef = DAG.getPOISON(MVT::i32);
10709 Ops.append(16 - Ops.size(), Undef);
10710 }
10711 assert(Ops.size() >= 8 && Ops.size() <= 12);
10712 SDValue MergedOps =
10713 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10714 Ops.clear();
10715 Ops.push_back(MergedOps);
10716 }
10717
10718 Ops.push_back(TDescr);
10719 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10720 Ops.push_back(M->getChain());
10721
10722 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10723 MachineMemOperand *MemRef = M->getMemOperand();
10724 DAG.setNodeMemRefs(NewNode, {MemRef});
10725 return SDValue(NewNode, 0);
10726 }
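  // Sketch of the A16 packing performed by packLanes above: the three f16
  // lanes of an operand are packed two per dword. An "aligned" operand starts
  // a new dword ({l0,l1}, then l2 on its own), while an unaligned one first
  // completes the previously pushed half-dword ({prev,l0}, then {l1,l2}).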
10727 case Intrinsic::amdgcn_global_atomic_fmin_num:
10728 case Intrinsic::amdgcn_global_atomic_fmax_num:
10729 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10730 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10731 MemSDNode *M = cast<MemSDNode>(Op);
10732 SDValue Ops[] = {
10733 M->getOperand(0), // Chain
10734 M->getOperand(2), // Ptr
10735 M->getOperand(3) // Value
10736 };
10737 unsigned Opcode = 0;
10738 switch (IntrID) {
10739 case Intrinsic::amdgcn_global_atomic_fmin_num:
10740 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10741 Opcode = ISD::ATOMIC_LOAD_FMIN;
10742 break;
10743 }
10744 case Intrinsic::amdgcn_global_atomic_fmax_num:
10745 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10746 Opcode = ISD::ATOMIC_LOAD_FMAX;
10747 break;
10748 }
10749 default:
10750 llvm_unreachable("unhandled atomic opcode");
10751 }
10752 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10753 Ops, M->getMemOperand());
10754 }
10755 case Intrinsic::amdgcn_s_get_barrier_state:
10756 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10757 SDValue Chain = Op->getOperand(0);
10758 SmallVector<SDValue, 2> Ops;
10759 unsigned Opc;
10760
10761 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10762 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10763 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10764 BarID = (BarID >> 4) & 0x3F;
10765 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10766 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10767 Ops.push_back(K);
10768 Ops.push_back(Chain);
10769 } else {
10770 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10771 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10772 SDValue M0Val;
10773 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10774 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10775 M0Val = SDValue(
10776 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10777 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10778 0);
10779 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10780 } else
10781 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10782 }
10783
10784 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10785 return SDValue(NewMI, 0);
10786 }
10787 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10788 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10789 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10790 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10791 SDValue Chain = Op->getOperand(0);
10792 SDValue Ptr = Op->getOperand(2);
10793 EVT VT = Op->getValueType(0);
10794 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10795 Chain, Ptr, MII->getMemOperand());
10796 }
10797 default:
10798
10799 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10800 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10801 return lowerImage(Op, ImageDimIntr, DAG, true);
10802
10803 return SDValue();
10804 }
10805}
10806
10807// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10808// dwordx4 if on SI and handle TFE loads.
10809SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10810 SDVTList VTList,
10811 ArrayRef<SDValue> Ops, EVT MemVT,
10812 MachineMemOperand *MMO,
10813 SelectionDAG &DAG) const {
10814 LLVMContext &C = *DAG.getContext();
10815 MachineFunction &MF = DAG.getMachineFunction();
10816 EVT VT = VTList.VTs[0];
10817
10818 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10819 bool IsTFE = VTList.NumVTs == 3;
10820 if (IsTFE) {
10821 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10822 unsigned NumOpDWords = NumValueDWords + 1;
10823 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10824 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10825 MachineMemOperand *OpDWordsMMO =
10826 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10827 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10828 OpDWordsVT, OpDWordsMMO, DAG);
10829 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10830 DAG.getVectorIdxConstant(NumValueDWords, DL));
10831 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10832 SDValue ValueDWords =
10833 NumValueDWords == 1
10834 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10835 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10836 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10837 ZeroIdx);
10838 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10839 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10840 }
10841
10842 if (!Subtarget->hasDwordx3LoadStores() &&
10843 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10844 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10845 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10846 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10847 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10848 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10849 WidenedMemVT, WidenedMMO);
10850 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10851 DAG.getVectorIdxConstant(0, DL));
10852 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10853 }
10854
10855 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10856}
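// For example, a TFE dwordx3 buffer load is emitted above as a 4-dword
// operation (3 value dwords plus 1 status dword); the value is re-extracted as
// a v3i32 subvector, the status as element 3, and both are merged with the
// chain. Non-TFE v3 loads on subtargets without dwordx3 load/store are
// likewise widened to dwordx4 and narrowed back with EXTRACT_SUBVECTOR.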
10857
10858SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10859 bool ImageStore) const {
10860 EVT StoreVT = VData.getValueType();
10861
10862 // No change for f16 and legal vector D16 types.
10863 if (!StoreVT.isVector())
10864 return VData;
10865
10866 SDLoc DL(VData);
10867 unsigned NumElements = StoreVT.getVectorNumElements();
10868
10869 if (Subtarget->hasUnpackedD16VMem()) {
10870 // We need to unpack the packed data to store.
10871 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10872 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10873
10874 EVT EquivStoreVT =
10875 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10876 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10877 return DAG.UnrollVectorOp(ZExt.getNode());
10878 }
10879
10880 // The sq block of gfx8.1 does not estimate register use correctly for d16
10881 // image store instructions. The data operand is computed as if it were not a
10882 // d16 image instruction.
10883 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10884 // Bitcast to i16
10885 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10886 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10887
10888 // Decompose into scalars
10889 SmallVector<SDValue, 4> Elts;
10890 DAG.ExtractVectorElements(IntVData, Elts);
10891
10892 // Group pairs of i16 into v2i16 and bitcast to i32
10893 SmallVector<SDValue, 4> PackedElts;
10894 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10895 SDValue Pair =
10896 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10897 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10898 PackedElts.push_back(IntPair);
10899 }
10900 if ((NumElements % 2) == 1) {
10901 // Handle v3i16
10902 unsigned I = Elts.size() / 2;
10903 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10904 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10905 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10906 PackedElts.push_back(IntPair);
10907 }
10908
10909 // Pad with poison values.
10910 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10911
10912 // Build final vector
10913 EVT VecVT =
10914 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10915 return DAG.getBuildVector(VecVT, DL, PackedElts);
10916 }
10917
10918 if (NumElements == 3) {
10919 EVT IntStoreVT =
10920 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10921 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10922
10923 EVT WidenedStoreVT = EVT::getVectorVT(
10924 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10925 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10926 WidenedStoreVT.getStoreSizeInBits());
10927 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10928 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10929 }
10930
10931 assert(isTypeLegal(StoreVT));
10932 return VData;
10933}
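// For example, a v3f16 store value is handled above as follows: with unpacked
// D16 VMEM each element is zero-extended into its own dword; with the gfx8.1
// image-store bug the elements are paired into i32s (the odd element padded
// with poison); otherwise the vector is widened to four elements through a
// zero-extended integer bitcast and stored as packed D16 data.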
10934
10935SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10936 SelectionDAG &DAG) const {
10937 SDLoc DL(Op);
10938 SDValue Chain = Op.getOperand(0);
10939 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10940 MachineFunction &MF = DAG.getMachineFunction();
10941
10942 switch (IntrinsicID) {
10943 case Intrinsic::amdgcn_exp_compr: {
10944 if (!Subtarget->hasCompressedExport()) {
10945 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10946 DAG.getMachineFunction().getFunction(),
10947 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10948 }
10949 SDValue Src0 = Op.getOperand(4);
10950 SDValue Src1 = Op.getOperand(5);
10951 // Hack around illegal type on SI by directly selecting it.
10952 if (isTypeLegal(Src0.getValueType()))
10953 return SDValue();
10954
10955 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10956 SDValue Undef = DAG.getPOISON(MVT::f32);
10957 const SDValue Ops[] = {
10958 Op.getOperand(2), // tgt
10959 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10960 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10961 Undef, // src2
10962 Undef, // src3
10963 Op.getOperand(7), // vm
10964 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10965 Op.getOperand(3), // en
10966 Op.getOperand(0) // Chain
10967 };
10968
10969 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10970 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10971 }
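  // The compressed export built above places the two packed sources in
  // src0/src1, leaves src2/src3 as poison, sets the compr bit, and selects
  // EXP_DONE instead of EXP when the intrinsic's done flag is nonzero.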
10972
10973 case Intrinsic::amdgcn_struct_tbuffer_store:
10974 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10975 SDValue VData = Op.getOperand(2);
10976 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10977 if (IsD16)
10978 VData = handleD16VData(VData, DAG);
10979 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10980 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10981 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10982 SDValue Ops[] = {
10983 Chain,
10984 VData, // vdata
10985 Rsrc, // rsrc
10986 Op.getOperand(4), // vindex
10987 VOffset, // voffset
10988 SOffset, // soffset
10989 Offset, // offset
10990 Op.getOperand(7), // format
10991 Op.getOperand(8), // cachepolicy, swizzled buffer
10992 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10993 };
10994 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10995 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10996 MemSDNode *M = cast<MemSDNode>(Op);
10997 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10998 M->getMemoryVT(), M->getMemOperand());
10999 }
11000
11001 case Intrinsic::amdgcn_raw_tbuffer_store:
11002 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11003 SDValue VData = Op.getOperand(2);
11004 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11005 if (IsD16)
11006 VData = handleD16VData(VData, DAG);
11007 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11008 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11009 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11010 SDValue Ops[] = {
11011 Chain,
11012 VData, // vdata
11013 Rsrc, // rsrc
11014 DAG.getConstant(0, DL, MVT::i32), // vindex
11015 VOffset, // voffset
11016 SOffset, // soffset
11017 Offset, // offset
11018 Op.getOperand(6), // format
11019 Op.getOperand(7), // cachepolicy, swizzled buffer
11020 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11021 };
11022 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11023 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11024 MemSDNode *M = cast<MemSDNode>(Op);
11025 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11026 M->getMemoryVT(), M->getMemOperand());
11027 }
11028
11029 case Intrinsic::amdgcn_raw_buffer_store:
11030 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11031 case Intrinsic::amdgcn_raw_buffer_store_format:
11032 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11033 const bool IsFormat =
11034 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11035 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11036
11037 SDValue VData = Op.getOperand(2);
11038 EVT VDataVT = VData.getValueType();
11039 EVT EltType = VDataVT.getScalarType();
11040 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11041 if (IsD16) {
11042 VData = handleD16VData(VData, DAG);
11043 VDataVT = VData.getValueType();
11044 }
11045
11046 if (!isTypeLegal(VDataVT)) {
11047 VData =
11048 DAG.getNode(ISD::BITCAST, DL,
11049 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11050 }
11051
11052 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11053 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11054 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11055 SDValue Ops[] = {
11056 Chain,
11057 VData,
11058 Rsrc,
11059 DAG.getConstant(0, DL, MVT::i32), // vindex
11060 VOffset, // voffset
11061 SOffset, // soffset
11062 Offset, // offset
11063 Op.getOperand(6), // cachepolicy, swizzled buffer
11064 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11065 };
11066 unsigned Opc =
11067 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11068 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11069 MemSDNode *M = cast<MemSDNode>(Op);
11070
11071 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11072 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11073 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11074
11075 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11076 M->getMemoryVT(), M->getMemOperand());
11077 }
11078
11079 case Intrinsic::amdgcn_struct_buffer_store:
11080 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11081 case Intrinsic::amdgcn_struct_buffer_store_format:
11082 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11083 const bool IsFormat =
11084 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11085 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11086
11087 SDValue VData = Op.getOperand(2);
11088 EVT VDataVT = VData.getValueType();
11089 EVT EltType = VDataVT.getScalarType();
11090 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11091
11092 if (IsD16) {
11093 VData = handleD16VData(VData, DAG);
11094 VDataVT = VData.getValueType();
11095 }
11096
11097 if (!isTypeLegal(VDataVT)) {
11098 VData =
11099 DAG.getNode(ISD::BITCAST, DL,
11100 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11101 }
11102
11103 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11104 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11105 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11106 SDValue Ops[] = {
11107 Chain,
11108 VData,
11109 Rsrc,
11110 Op.getOperand(4), // vindex
11111 VOffset, // voffset
11112 SOffset, // soffset
11113 Offset, // offset
11114 Op.getOperand(7), // cachepolicy, swizzled buffer
11115 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11116 };
11117 unsigned Opc =
11118 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11119 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11120 MemSDNode *M = cast<MemSDNode>(Op);
11121
11122 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11123 EVT VDataType = VData.getValueType().getScalarType();
11124 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11125 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11126
11127 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11128 M->getMemoryVT(), M->getMemOperand());
11129 }
11130 case Intrinsic::amdgcn_raw_buffer_load_lds:
11131 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11132 case Intrinsic::amdgcn_struct_buffer_load_lds:
11133 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11134 if (!Subtarget->hasVMemToLDSLoad())
11135 return SDValue();
11136 unsigned Opc;
11137 bool HasVIndex =
11138 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11139 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11140 unsigned OpOffset = HasVIndex ? 1 : 0;
11141 SDValue VOffset = Op.getOperand(5 + OpOffset);
11142 bool HasVOffset = !isNullConstant(VOffset);
11143 unsigned Size = Op->getConstantOperandVal(4);
11144
11145 switch (Size) {
11146 default:
11147 return SDValue();
11148 case 1:
11149 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11150 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11151 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11152 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11153 break;
11154 case 2:
11155 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11156 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11157 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11158 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11159 break;
11160 case 4:
11161 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11162 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11163 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11164 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11165 break;
11166 case 12:
11167 if (!Subtarget->hasLDSLoadB96_B128())
11168 return SDValue();
11169 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11170 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11171 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11172 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11173 break;
11174 case 16:
11175 if (!Subtarget->hasLDSLoadB96_B128())
11176 return SDValue();
11177 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11178 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11179 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11180 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11181 break;
11182 }
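    // The opcode chosen above encodes both the transfer size (ubyte, ushort,
    // dword, dwordx3, dwordx4) and the addressing form: BOTHEN when both a
    // vindex and a voffset are present, IDXEN with only a vindex, OFFEN with
    // only a voffset, and OFFSET when neither is needed.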
11183
11184 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11185
11186 SmallVector<SDValue, 8> Ops;
11187
11188 if (HasVIndex && HasVOffset)
11189 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11190 {Op.getOperand(5), // VIndex
11191 VOffset}));
11192 else if (HasVIndex)
11193 Ops.push_back(Op.getOperand(5));
11194 else if (HasVOffset)
11195 Ops.push_back(VOffset);
11196
11197 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11198 Ops.push_back(Rsrc);
11199 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11200 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11201 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11202 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11203 Ops.push_back(DAG.getTargetConstant(
11204 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11205 DL, MVT::i8)); // cpol
11206 Ops.push_back(DAG.getTargetConstant(
11207 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11208 ? 1
11209 : 0,
11210 DL, MVT::i8)); // swz
11211 Ops.push_back(M0Val.getValue(0)); // Chain
11212 Ops.push_back(M0Val.getValue(1)); // Glue
11213
11214 auto *M = cast<MemSDNode>(Op);
11215 MachineMemOperand *LoadMMO = M->getMemOperand();
11216 // Don't set the offset value here because the pointer points to the base of
11217 // the buffer.
11218 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11219
11220 MachinePointerInfo StorePtrI = LoadPtrI;
11221 LoadPtrI.V = PoisonValue::get(
11222 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11223 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
11224 StorePtrI.V = nullptr;
11225 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11226 auto F = LoadMMO->getFlags() &
11227 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11228 LoadMMO =
11229 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11230 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11231
11232 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11233 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11234 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11235
11236 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11237 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11238
11239 return SDValue(Load, 0);
11240 }
11241 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11242 // for "trust me" that the remaining cases are global pointers until
11243 // such time as we can put two mem operands on an intrinsic.
11244 case Intrinsic::amdgcn_load_to_lds:
11245 case Intrinsic::amdgcn_global_load_lds: {
11246 if (!Subtarget->hasVMemToLDSLoad())
11247 return SDValue();
11248
11249 unsigned Opc;
11250 unsigned Size = Op->getConstantOperandVal(4);
11251 switch (Size) {
11252 default:
11253 return SDValue();
11254 case 1:
11255 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11256 break;
11257 case 2:
11258 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11259 break;
11260 case 4:
11261 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11262 break;
11263 case 12:
11264 if (!Subtarget->hasLDSLoadB96_B128())
11265 return SDValue();
11266 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11267 break;
11268 case 16:
11269 if (!Subtarget->hasLDSLoadB96_B128())
11270 return SDValue();
11271 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11272 break;
11273 }
11274
11275 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11276
11277 SmallVector<SDValue, 6> Ops;
11278
11279 SDValue Addr = Op.getOperand(2); // Global ptr
11280 SDValue VOffset;
11281 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11282 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11283 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11284 SDValue LHS = Addr.getOperand(0);
11285 SDValue RHS = Addr.getOperand(1);
11286
11287 if (LHS->isDivergent())
11288 std::swap(LHS, RHS);
11289
11290 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11291 RHS.getOperand(0).getValueType() == MVT::i32) {
11292 // add (i64 sgpr), (zero_extend (i32 vgpr))
11293 Addr = LHS;
11294 VOffset = RHS.getOperand(0);
11295 }
11296 }
11297
11298 Ops.push_back(Addr);
11299 if (!Addr->isDivergent()) {
11300 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11301 if (!VOffset)
11302 VOffset =
11303 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11304 DAG.getTargetConstant(0, DL, MVT::i32)),
11305 0);
11306 Ops.push_back(VOffset);
11307 }
11308
11309 Ops.push_back(Op.getOperand(5)); // Offset
11310 Ops.push_back(Op.getOperand(6)); // CPol
11311 Ops.push_back(M0Val.getValue(0)); // Chain
11312 Ops.push_back(M0Val.getValue(1)); // Glue
11313
11314 auto *M = cast<MemSDNode>(Op);
11315 MachineMemOperand *LoadMMO = M->getMemOperand();
11316 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11317 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11318 MachinePointerInfo StorePtrI = LoadPtrI;
11319 LoadPtrI.V = PoisonValue::get(
11320 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11321 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11322 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11323 auto F = LoadMMO->getFlags() &
11324 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11325 LoadMMO =
11326 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11327 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11328 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11329 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11330 LoadMMO->getAAInfo());
11331
11332 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11333 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11334
11335 return SDValue(Load, 0);
11336 }
11337 case Intrinsic::amdgcn_end_cf:
11338 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11339 Op->getOperand(2), Chain),
11340 0);
11341 case Intrinsic::amdgcn_s_barrier_init:
11342 case Intrinsic::amdgcn_s_barrier_signal_var: {
11343 // These two intrinsics have two operands: the barrier pointer and the member count.
11344 SDValue Chain = Op->getOperand(0);
11345 SmallVector<SDValue, 2> Ops;
11346 SDValue BarOp = Op->getOperand(2);
11347 SDValue CntOp = Op->getOperand(3);
11348 SDValue M0Val;
11349 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11350 ? AMDGPU::S_BARRIER_INIT_M0
11351 : AMDGPU::S_BARRIER_SIGNAL_M0;
11352 // extract the BarrierID from bits 4-9 of BarOp
11353 SDValue BarID;
11354 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11355 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11356 BarID =
11357 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11358 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11359 0);
11360 // Member count should be put into M0[ShAmt:+6]
11361 // Barrier ID should be put into M0[5:0]
11362 M0Val =
11363 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11364 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11365 0);
11366 constexpr unsigned ShAmt = 16;
11367 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11368 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11369
11370 M0Val = SDValue(
11371 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11372
11373 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11374
11375 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11376 return SDValue(NewMI, 0);
11377 }
11378 case Intrinsic::amdgcn_s_barrier_join: {
11379 // This intrinsic has one operand: the barrier pointer.
11380 SDValue Chain = Op->getOperand(0);
11381 SmallVector<SDValue, 2> Ops;
11382 SDValue BarOp = Op->getOperand(2);
11383 unsigned Opc;
11384
11385 if (isa<ConstantSDNode>(BarOp)) {
11386 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11387 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11388
11389 // extract the BarrierID from bits 4-9 of the immediate
11390 unsigned BarID = (BarVal >> 4) & 0x3F;
11391 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11392 Ops.push_back(K);
11393 Ops.push_back(Chain);
11394 } else {
11395 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11396
11397 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11398 SDValue M0Val;
11399 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11400 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11401 M0Val =
11402 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11403 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11404 0);
11405 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11406 }
11407
11408 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11409 return SDValue(NewMI, 0);
11410 }
11411 case Intrinsic::amdgcn_s_prefetch_data: {
11412 // For non-global address space preserve the chain and remove the call.
11413 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11414 return Op.getOperand(0);
11415 return Op;
11416 }
11417 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11418 SDValue Ops[] = {
11419 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11420 Op.getOperand(3), // offset
11421 Op.getOperand(4), // length
11422 };
11423
11424 MemSDNode *M = cast<MemSDNode>(Op);
11425 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11426 Op->getVTList(), Ops, M->getMemoryVT(),
11427 M->getMemOperand());
11428 }
11429 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11430 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11431 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11432 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11433 SDValue Chain = Op->getOperand(0);
11434 SDValue Ptr = Op->getOperand(2);
11435 SDValue Val = Op->getOperand(3);
11436 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11437 Ptr, MII->getMemOperand());
11438 }
11439 default: {
11440 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11441 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11442 return lowerImage(Op, ImageDimIntr, DAG, true);
11443
11444 return Op;
11445 }
11446 }
11447}
11448
11449// Return whether the operation has NoUnsignedWrap property.
11450static bool isNoUnsignedWrap(SDValue Addr) {
11451 return (Addr.getOpcode() == ISD::ADD &&
11452 Addr->getFlags().hasNoUnsignedWrap()) ||
11453 Addr->getOpcode() == ISD::OR;
11454}
11455
11457 EVT PtrVT) const {
11458 return PtrVT == MVT::i64;
11459}
11460
11462 EVT PtrVT) const {
11463 return true;
11464}
11465
11466// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11467// offset (the offset that is included in bounds checking and swizzling, to be
11468// split between the instruction's voffset and immoffset fields) and soffset
11469// (the offset that is excluded from bounds checking and swizzling, to go in
11470// the instruction's soffset field). This function takes the first kind of
11471// offset and figures out how to split it between voffset and immoffset.
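// Illustrative example (not from the original source, assuming the usual
// 12-bit MUBUF immediate, i.e. MaxImm == 4095): a combined offset of 4100
// would be split into immoffset = 4 and a voffset add of 4096, since the
// large power-of-two remainder is more likely to be CSEd across accesses.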
11472std::pair<SDValue, SDValue>
11473SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11474 SDLoc DL(Offset);
11475 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11476 SDValue N0 = Offset;
11477 ConstantSDNode *C1 = nullptr;
11478
11479 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11480 N0 = SDValue();
11481 else if (DAG.isBaseWithConstantOffset(N0)) {
11482 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11483 // being added, so we can only safely match a 32-bit addition with no
11484 // unsigned overflow.
11485 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11486 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11487 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11488 N0 = N0.getOperand(0);
11489 }
11490 }
11491
11492 if (C1) {
11493 unsigned ImmOffset = C1->getZExtValue();
11494 // If the immediate value is too big for the immoffset field, put only bits
11495 // that would normally fit in the immoffset field. The remaining value that
11496 // is copied/added for the voffset field is a large power of 2, and it
11497 // stands more chance of being CSEd with the copy/add for another similar
11498 // load/store.
11499 // However, do not do that rounding down if that is a negative
11500 // number, as it appears to be illegal to have a negative offset in the
11501 // vgpr, even if adding the immediate offset makes it positive.
11502 unsigned Overflow = ImmOffset & ~MaxImm;
11503 ImmOffset -= Overflow;
11504 if ((int32_t)Overflow < 0) {
11505 Overflow += ImmOffset;
11506 ImmOffset = 0;
11507 }
11508 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11509 if (Overflow) {
11510 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11511 if (!N0)
11512 N0 = OverflowVal;
11513 else {
11514 SDValue Ops[] = {N0, OverflowVal};
11515 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11516 }
11517 }
11518 }
11519 if (!N0)
11520 N0 = DAG.getConstant(0, DL, MVT::i32);
11521 if (!C1)
11522 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11523 return {N0, SDValue(C1, 0)};
11524}
11525
11526// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11527// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11528// pointed to by Offsets.
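// Sketch of the resulting splits (an illustrative reading of the code below):
// a constant combined offset that splitMUBUFOffset accepts becomes
// {voffset = 0, soffset, instoffset}; a base-plus-constant pattern keeps the
// base in voffset; anything else falls through to
// {voffset = CombinedOffset, soffset = 0 (or SGPR_NULL), instoffset = 0}.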
11529void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11530 SelectionDAG &DAG, SDValue *Offsets,
11531 Align Alignment) const {
11532 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11533 SDLoc DL(CombinedOffset);
11534 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11535 uint32_t Imm = C->getZExtValue();
11536 uint32_t SOffset, ImmOffset;
11537 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11538 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11539 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11540 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11541 return;
11542 }
11543 }
11544 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11545 SDValue N0 = CombinedOffset.getOperand(0);
11546 SDValue N1 = CombinedOffset.getOperand(1);
11547 uint32_t SOffset, ImmOffset;
11548 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11549 if (Offset >= 0 &&
11550 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11551 Offsets[0] = N0;
11552 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11553 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11554 return;
11555 }
11556 }
11557
11558 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11559 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11560 : DAG.getConstant(0, DL, MVT::i32);
11561
11562 Offsets[0] = CombinedOffset;
11563 Offsets[1] = SOffsetZero;
11564 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11565}
11566
11567SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11568 SelectionDAG &DAG) const {
11569 if (!MaybePointer.getValueType().isScalarInteger())
11570 return MaybePointer;
11571
11572 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11573 return Rsrc;
11574}
11575
11576// Wrap a global or flat pointer into a buffer intrinsic using the flags
11577// specified in the intrinsic.
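// On targets without 45-bit num_records (the else path below), the v4i32
// descriptor is packed roughly as: dword 0 = pointer lo, dword 1 = pointer hi
// with the 16-bit stride in its upper half, dword 2 = num_records, and
// dword 3 = flags. This is an illustrative reading of the code below, not a
// normative description of the hardware resource format.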
11578SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11579 SelectionDAG &DAG) const {
11580 SDLoc Loc(Op);
11581
11582 SDValue Pointer = Op->getOperand(1);
11583 SDValue Stride = Op->getOperand(2);
11584 SDValue NumRecords = Op->getOperand(3);
11585 SDValue Flags = Op->getOperand(4);
11586
11587 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11588 SDValue Rsrc;
11589
11590 if (Subtarget->has45BitNumRecordsBufferResource()) {
11591 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11592 // Build the lower 64-bit value, which holds the 57-bit base address and the
11593 // low 7 bits of num_records.
11594 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11595 SDValue NumRecordsLHS =
11596 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11597 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11598 SDValue LowHalf =
11599 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11600
11601 // Build the upper 64-bit value, which holds the upper 38 bits of num_records,
11602 // 6 zero bits (omitted), the 16-bit stride/scale field, and the 4-bit flags.
11603 SDValue NumRecordsRHS =
11604 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11605 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11606 SDValue ShiftedStride =
11607 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11608 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11609 SDValue ExtShiftedStrideVec =
11610 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11611 SDValue ExtShiftedStride =
11612 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11613 SDValue ShiftedFlags =
11614 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11615 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11616 SDValue ExtShiftedFlagsVec =
11617 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11618 SDValue ExtShiftedFlags =
11619 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11620 SDValue CombinedFields =
11621 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11622 SDValue HighHalf =
11623 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11624
11625 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11626 } else {
11627 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11628 auto [LowHalf, HighHalf] =
11629 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11630 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11631 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11632 SDValue ShiftedStride =
11633 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11634 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11635 SDValue NewHighHalf =
11636 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11637
11638 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11639 NumRecords, Flags);
11640 }
11641
11642 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11643 return RsrcPtr;
11644}
11645
11646// Handle 8 bit and 16 bit buffer loads
11647SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11648 EVT LoadVT, SDLoc DL,
11649 ArrayRef<SDValue> Ops,
11650 MachineMemOperand *MMO,
11651 bool IsTFE) const {
11652 EVT IntVT = LoadVT.changeTypeToInteger();
11653
11654 if (IsTFE) {
11655 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11656 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11657 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11658 MachineFunction &MF = DAG.getMachineFunction();
11659 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11660 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11661 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11662 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11663 DAG.getConstant(1, DL, MVT::i32));
11664 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11665 DAG.getConstant(0, DL, MVT::i32));
11666 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11667 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11668 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11669 }
11670
11671 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11672 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11673 : AMDGPUISD::BUFFER_LOAD_USHORT;
11674
11675 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11676 SDValue BufferLoad =
11677 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11678 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11679 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11680
11681 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11682}
11683
11684// Handle 8 bit and 16 bit buffer stores
11685SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11686 EVT VDataType, SDLoc DL,
11687 SDValue Ops[],
11688 MemSDNode *M) const {
11689 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11690 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11691
11692 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11693 Ops[1] = BufferStoreExt;
11694 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11695 : AMDGPUISD::BUFFER_STORE_SHORT;
11696 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11697 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11698 M->getMemOperand());
11699}
11700
11701 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11702 SDValue Op, const SDLoc &SL, EVT VT) {
11703 if (VT.bitsLT(Op.getValueType()))
11704 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11705
11706 switch (ExtType) {
11707 case ISD::SEXTLOAD:
11708 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11709 case ISD::ZEXTLOAD:
11710 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11711 case ISD::EXTLOAD:
11712 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11713 case ISD::NON_EXTLOAD:
11714 return Op;
11715 }
11716
11717 llvm_unreachable("invalid ext type");
11718}
11719
11720// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11721// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
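// Illustrative effect (not from the original source): a uniform,
// 4-byte-aligned i8 or i16 load from constant address space is rewritten as a
// 32-bit load followed by the matching sign/zero-extend-in-reg and, if needed,
// a final extension or truncation back to the original value type.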
11722SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11723 DAGCombinerInfo &DCI) const {
11724 SelectionDAG &DAG = DCI.DAG;
11725 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11726 return SDValue();
11727
11728 // FIXME: Constant loads should all be marked invariant.
11729 unsigned AS = Ld->getAddressSpace();
11730 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11731 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11732 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11733 return SDValue();
11734
11735 // Don't do this early, since it may interfere with adjacent load merging for
11736 // illegal types. We can avoid losing alignment information for exotic types
11737 // pre-legalize.
11738 EVT MemVT = Ld->getMemoryVT();
11739 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11740 MemVT.getSizeInBits() >= 32)
11741 return SDValue();
11742
11743 SDLoc SL(Ld);
11744
11745 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11746 "unexpected vector extload");
11747
11748 // TODO: Drop only high part of range.
11749 SDValue Ptr = Ld->getBasePtr();
11750 SDValue NewLoad = DAG.getLoad(
11751 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11752 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11753 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11754 nullptr); // Drop ranges
11755
11756 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11757 if (MemVT.isFloatingPoint()) {
11758 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11759 "unexpected fp extload");
11760 TruncVT = MemVT.changeTypeToInteger();
11761 }
11762
11763 SDValue Cvt = NewLoad;
11764 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11765 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11766 DAG.getValueType(TruncVT));
11767 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11768 Ld->getExtensionType() == ISD::EXTLOAD) {
11769 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11770 } else {
11771 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
11772 }
11773
11774 EVT VT = Ld->getValueType(0);
11775 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11776
11777 DCI.AddToWorklist(Cvt.getNode());
11778
11779 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11780 // the appropriate extension from the 32-bit load.
11781 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11782 DCI.AddToWorklist(Cvt.getNode());
11783
11784 // Handle conversion back to floating point if necessary.
11785 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11786
11787 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11788}
11789
11790 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11791 const SIMachineFunctionInfo &Info) {
11792 // TODO: Should check if the address can definitely not access stack.
11793 if (Info.isEntryFunction())
11794 return Info.getUserSGPRInfo().hasFlatScratchInit();
11795 return true;
11796}
11797
11798SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11799 SDLoc DL(Op);
11800 LoadSDNode *Load = cast<LoadSDNode>(Op);
11801 ISD::LoadExtType ExtType = Load->getExtensionType();
11802 EVT MemVT = Load->getMemoryVT();
11803 MachineMemOperand *MMO = Load->getMemOperand();
11804
11805 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11806 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11807 return SDValue();
11808
11809 // FIXME: Copied from PPC
11810 // First, load into 32 bits, then truncate to 1 bit.
11811
11812 SDValue Chain = Load->getChain();
11813 SDValue BasePtr = Load->getBasePtr();
11814
11815 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11816
11817 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11818 RealMemVT, MMO);
11819
11820 if (!MemVT.isVector()) {
11821 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11822 NewLD.getValue(1)};
11823
11824 return DAG.getMergeValues(Ops, DL);
11825 }
11826
11827 SmallVector<SDValue, 3> Elts;
11828 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11829 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11830 DAG.getConstant(I, DL, MVT::i32));
11831
11832 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11833 }
11834
11835 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11836
11837 return DAG.getMergeValues(Ops, DL);
11838 }
11839
11840 if (!MemVT.isVector())
11841 return SDValue();
11842
11843 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11844 "Custom lowering for non-i32 vectors hasn't been implemented.");
11845
11846 Align Alignment = Load->getAlign();
11847 unsigned AS = Load->getAddressSpace();
11848 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11849 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11850 return SplitVectorLoad(Op, DAG);
11851 }
11852
11853 MachineFunction &MF = DAG.getMachineFunction();
11854 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11855 // If there is a possibility that flat instruction access scratch memory
11856 // then we need to use the same legalization rules we use for private.
11857 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11858 !Subtarget->hasMultiDwordFlatScratchAddressing())
11859 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11860 ? AMDGPUAS::PRIVATE_ADDRESS
11861 : AMDGPUAS::GLOBAL_ADDRESS;
11862
11863 unsigned NumElements = MemVT.getVectorNumElements();
11864
11865 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11866 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11867 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11868 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11869 isMemOpHasNoClobberedMemOperand(Load))) {
11870 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11871 Alignment >= Align(4) && NumElements < 32) {
11872 if (MemVT.isPow2VectorType() ||
11873 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11874 return SDValue();
11875 return WidenOrSplitVectorLoad(Op, DAG);
11876 }
11877 // Non-uniform loads will be selected to MUBUF instructions, so they
11878 // have the same legalization requirements as global and private
11879 // loads.
11880 //
11881 }
11882 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11883 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11884 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11885 if (NumElements > 4)
11886 return SplitVectorLoad(Op, DAG);
11887 // v3 loads not supported on SI.
11888 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11889 return WidenOrSplitVectorLoad(Op, DAG);
11890
11891 // v3 and v4 loads are supported for private and global memory.
11892 return SDValue();
11893 }
11894 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11895 // Depending on the setting of the private_element_size field in the
11896 // resource descriptor, we can only make private accesses up to a certain
11897 // size.
11898 switch (Subtarget->getMaxPrivateElementSize()) {
11899 case 4: {
11900 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11901 return DAG.getMergeValues({Op0, Op1}, DL);
11902 }
11903 case 8:
11904 if (NumElements > 2)
11905 return SplitVectorLoad(Op, DAG);
11906 return SDValue();
11907 case 16:
11908 // Same as global/flat
11909 if (NumElements > 4)
11910 return SplitVectorLoad(Op, DAG);
11911 // v3 loads not supported on SI.
11912 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11913 return WidenOrSplitVectorLoad(Op, DAG);
11914
11915 return SDValue();
11916 default:
11917 llvm_unreachable("unsupported private_element_size");
11918 }
11919 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11920 unsigned Fast = 0;
11921 auto Flags = Load->getMemOperand()->getFlags();
11922 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11923 Load->getAlign(), Flags, &Fast) &&
11924 Fast > 1)
11925 return SDValue();
11926
11927 if (MemVT.isVector())
11928 return SplitVectorLoad(Op, DAG);
11929 }
11930
11931 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11932 MemVT, *Load->getMemOperand())) {
11933 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11934 return DAG.getMergeValues({Op0, Op1}, DL);
11935 }
11936
11937 return SDValue();
11938}
11939
11940SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11941 EVT VT = Op.getValueType();
11942 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11943 VT.getSizeInBits() == 512)
11944 return splitTernaryVectorOp(Op, DAG);
11945
11946 assert(VT.getSizeInBits() == 64);
11947
11948 SDLoc DL(Op);
11949 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11950
11951 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11952 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11953
11954 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11955 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11956
11957 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11958 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11959
11960 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11961
11962 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11963 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11964
11965 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11966
11967 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11968 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11969}
11970
11971// Catch division cases where we can use shortcuts with rcp and rsq
11972// instructions.
11973SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11974 SelectionDAG &DAG) const {
11975 SDLoc SL(Op);
11976 SDValue LHS = Op.getOperand(0);
11977 SDValue RHS = Op.getOperand(1);
11978 EVT VT = Op.getValueType();
11979 const SDNodeFlags Flags = Op->getFlags();
11980
11981 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11982
11983 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11984 // Without !fpmath accuracy information, we can't do more because we don't
11985 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11986 // f16 is always accurate enough
11987 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11988 return SDValue();
11989
11990 if (CLHS->isExactlyValue(1.0)) {
11991 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11992 // the CI documentation have a worst case error of 1 ulp.
11993 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11994 // use it as long as we aren't trying to use denormals.
11995 //
11996 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11997
11998 // 1.0 / sqrt(x) -> rsq(x)
11999
12000 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12001 // error seems really high at 2^29 ULP.
12002 // 1.0 / x -> rcp(x)
12003 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12004 }
12005
12006 // Same as for 1.0, but expand the sign out of the constant.
12007 if (CLHS->isExactlyValue(-1.0)) {
12008 // -1.0 / x -> rcp (fneg x)
12009 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12010 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12011 }
12012 }
12013
12014 // For f16 and bf16 require afn or arcp.
12015 // For f32 require afn.
12016 if (!AllowInaccurateRcp &&
12017 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12018 return SDValue();
12019
12020 // Turn into multiply by the reciprocal.
12021 // x / y -> x * (1.0 / y)
12022 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12023 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12024}
12025
12026SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12027 SelectionDAG &DAG) const {
12028 SDLoc SL(Op);
12029 SDValue X = Op.getOperand(0);
12030 SDValue Y = Op.getOperand(1);
12031 EVT VT = Op.getValueType();
12032 const SDNodeFlags Flags = Op->getFlags();
12033
12034 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12035 if (!AllowInaccurateDiv)
12036 return SDValue();
12037
12038 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12039 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12040
12041 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12042 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12043
12044 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12045 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12046 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12047 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12048 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12049 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12050}
12051
12052static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12053 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12054 SDNodeFlags Flags) {
12055 if (GlueChain->getNumValues() <= 1) {
12056 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12057 }
12058
12059 assert(GlueChain->getNumValues() == 3);
12060
12061 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12062 switch (Opcode) {
12063 default:
12064 llvm_unreachable("no chain equivalent for opcode");
12065 case ISD::FMUL:
12066 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12067 break;
12068 }
12069
12070 return DAG.getNode(Opcode, SL, VTList,
12071 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12072 Flags);
12073}
12074
12075static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12076 EVT VT, SDValue A, SDValue B, SDValue C,
12077 SDValue GlueChain, SDNodeFlags Flags) {
12078 if (GlueChain->getNumValues() <= 1) {
12079 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12080 }
12081
12082 assert(GlueChain->getNumValues() == 3);
12083
12084 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12085 switch (Opcode) {
12086 default:
12087 llvm_unreachable("no chain equivalent for opcode");
12088 case ISD::FMA:
12089 Opcode = AMDGPUISD::FMA_W_CHAIN;
12090 break;
12091 }
12092
12093 return DAG.getNode(Opcode, SL, VTList,
12094 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12095 Flags);
12096}
12097
12098SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12099 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12100 return FastLowered;
12101
12102 SDLoc SL(Op);
12103 EVT VT = Op.getValueType();
12104 SDValue LHS = Op.getOperand(0);
12105 SDValue RHS = Op.getOperand(1);
12106
12107 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12108 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12109
12110 if (VT == MVT::bf16) {
12111 SDValue ExtDiv =
12112 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12113 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12114 DAG.getTargetConstant(0, SL, MVT::i32));
12115 }
12116
12117 assert(VT == MVT::f16);
12118
12119 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12120 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12121 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12122 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12123 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12124 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12125 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12126 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12127 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12128 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12129 // q16.u = opx(V_CVT_F16_F32, q32.u);
12130 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12131
12132 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12133 unsigned FMADOpCode =
12134 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12135 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12136 SDValue Rcp =
12137 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12138 SDValue Quot =
12139 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12140 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12141 Op->getFlags());
12142 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12143 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12144 Op->getFlags());
12145 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12146 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12147 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12148 DAG.getConstant(0xff800000, SL, MVT::i32));
12149 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12150 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12151 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12152 DAG.getTargetConstant(0, SL, MVT::i32));
12153 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12154 Op->getFlags());
12155}
12156
12157// Faster 2.5 ULP division that does not support denormals.
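// Reading of the constants below (illustrative): when |RHS| exceeds 2^+96,
// RHS is pre-scaled by 2^-32 so its reciprocal stays comfortably inside the
// normal f32 range (rcp flushes denormal results), and the quotient is
// multiplied by the same 2^-32 factor afterwards to compensate; otherwise the
// scale factor is simply 1.0.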
12158SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12159 SDNodeFlags Flags = Op->getFlags();
12160 SDLoc SL(Op);
12161 SDValue LHS = Op.getOperand(1);
12162 SDValue RHS = Op.getOperand(2);
12163
12164 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12165
12166 const APFloat K0Val(0x1p+96f);
12167 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12168
12169 const APFloat K1Val(0x1p-32f);
12170 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12171
12172 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12173
12174 EVT SetCCVT =
12175 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12176
12177 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12178
12179 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12180
12181 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12182
12183 // rcp does not support denormals.
12184 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12185
12186 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12187
12188 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12189}
12190
12191// Returns immediate value for setting the F32 denorm mode when using the
12192// S_DENORM_MODE instruction.
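// The returned immediate packs the new FP32 (SP) mode into bits [1:0] and the
// currently configured FP64/FP16 (DP) mode into bits [3:2], matching the
// SPDenormMode | (DPDenormModeDefault << 2) computation below.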
12193 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12194 const SIMachineFunctionInfo *Info,
12195 const GCNSubtarget *ST) {
12196 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12197 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12198 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12199 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12200}
12201
12202SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12203 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12204 return FastLowered;
12205
12206 // The selection matcher assumes anything with a chain selecting to a
12207 // mayRaiseFPException machine instruction. Since we're introducing a chain
12208 // here, we need to explicitly report nofpexcept for the regular fdiv
12209 // lowering.
12210 SDNodeFlags Flags = Op->getFlags();
12211 Flags.setNoFPExcept(true);
12212
12213 SDLoc SL(Op);
12214 SDValue LHS = Op.getOperand(0);
12215 SDValue RHS = Op.getOperand(1);
12216
12217 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12218
12219 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12220
12221 SDValue DenominatorScaled =
12222 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12223 SDValue NumeratorScaled =
12224 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12225
12226 // Denominator is scaled to not be denormal, so using rcp is ok.
12227 SDValue ApproxRcp =
12228 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12229 SDValue NegDivScale0 =
12230 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12231
12232 using namespace AMDGPU::Hwreg;
12233 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12234 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12235
12236 const MachineFunction &MF = DAG.getMachineFunction();
12237 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12238 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12239
12240 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12241 const bool HasDynamicDenormals =
12242 (DenormMode.Input == DenormalMode::Dynamic) ||
12243 (DenormMode.Output == DenormalMode::Dynamic);
12244
12245 SDValue SavedDenormMode;
12246
12247 if (!PreservesDenormals) {
12248 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12249 // lowering. The chain dependence is insufficient, and we need glue. We do
12250 // not need the glue variants in a strictfp function.
12251
12252 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12253
12254 SDValue Glue = DAG.getEntryNode();
12255 if (HasDynamicDenormals) {
12256 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12257 DAG.getVTList(MVT::i32, MVT::Glue),
12258 {BitField, Glue});
12259 SavedDenormMode = SDValue(GetReg, 0);
12260
12261 Glue = DAG.getMergeValues(
12262 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12263 }
12264
12265 SDNode *EnableDenorm;
12266 if (Subtarget->hasDenormModeInst()) {
12267 const SDValue EnableDenormValue =
12268 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12269
12270 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12271 EnableDenormValue)
12272 .getNode();
12273 } else {
12274 const SDValue EnableDenormValue =
12275 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12276 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12277 {EnableDenormValue, BitField, Glue});
12278 }
12279
12280 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12281 SDValue(EnableDenorm, 1)};
12282
12283 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12284 }
12285
12286 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12287 ApproxRcp, One, NegDivScale0, Flags);
12288
12289 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12290 ApproxRcp, Fma0, Flags);
12291
12292 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12293 Fma1, Flags);
12294
12295 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12296 NumeratorScaled, Mul, Flags);
12297
12298 SDValue Fma3 =
12299 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12300
12301 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12302 NumeratorScaled, Fma3, Flags);
12303
12304 if (!PreservesDenormals) {
12305 SDNode *DisableDenorm;
12306 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12307 const SDValue DisableDenormValue = getSPDenormModeValue(
12308 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12309
12310 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12311 DisableDenorm =
12312 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12313 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12314 .getNode();
12315 } else {
12316 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12317 const SDValue DisableDenormValue =
12318 HasDynamicDenormals
12319 ? SavedDenormMode
12320 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12321
12322 DisableDenorm = DAG.getMachineNode(
12323 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12324 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12325 }
12326
12327 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12328 SDValue(DisableDenorm, 0), DAG.getRoot());
12329 DAG.setRoot(OutputChain);
12330 }
12331
12332 SDValue Scale = NumeratorScaled.getValue(1);
12333 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12334 {Fma4, Fma1, Fma3, Scale}, Flags);
12335
12336 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12337}
12338
12339SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12340 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12341 return FastLowered;
12342
12343 SDLoc SL(Op);
12344 SDValue X = Op.getOperand(0);
12345 SDValue Y = Op.getOperand(1);
12346
12347 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12348
12349 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12350
12351 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12352
12353 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12354
12355 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12356
12357 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12358
12359 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12360
12361 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12362
12363 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12364
12365 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12366 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12367
12368 SDValue Fma4 =
12369 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12370
12371 SDValue Scale;
12372
12373 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12374 // Work around a hardware bug on SI where the condition output from div_scale
12375 // is not usable.
12376
12377 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12378
12379 // Figure out which scale to use for div_fmas.
12380 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12381 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12382 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12383 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12384
12385 SDValue NumHi =
12386 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12387 SDValue DenHi =
12388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12389
12390 SDValue Scale0Hi =
12391 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12392 SDValue Scale1Hi =
12393 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12394
12395 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12396 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12397 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12398 } else {
12399 Scale = DivScale1.getValue(1);
12400 }
12401
12402 SDValue Fmas =
12403 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12404
12405 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12406}
12407
12408SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12409 EVT VT = Op.getValueType();
12410
12411 if (VT == MVT::f32)
12412 return LowerFDIV32(Op, DAG);
12413
12414 if (VT == MVT::f64)
12415 return LowerFDIV64(Op, DAG);
12416
12417 if (VT == MVT::f16 || VT == MVT::bf16)
12418 return LowerFDIV16(Op, DAG);
12419
12420 llvm_unreachable("Unexpected type for fdiv");
12421}
12422
12423SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12424 SDLoc dl(Op);
12425 SDValue Val = Op.getOperand(0);
12426 EVT VT = Val.getValueType();
12427 EVT ResultExpVT = Op->getValueType(1);
12428 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12429
12430 SDValue Mant = DAG.getNode(
12431 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12432 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12433
12434 SDValue Exp = DAG.getNode(
12435 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12436 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12437
12438 if (Subtarget->hasFractBug()) {
12439 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12440 SDValue Inf =
12441 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12442
12443 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12444 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12445 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12446 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12447 }
12448
12449 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12450 return DAG.getMergeValues({Mant, CastExp}, dl);
12451}
12452
12453SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12454 SDLoc DL(Op);
12455 StoreSDNode *Store = cast<StoreSDNode>(Op);
12456 EVT VT = Store->getMemoryVT();
12457
12458 if (VT == MVT::i1) {
12459 return DAG.getTruncStore(
12460 Store->getChain(), DL,
12461 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12462 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12463 }
12464
12465 assert(VT.isVector() &&
12466 Store->getValue().getValueType().getScalarType() == MVT::i32);
12467
12468 unsigned AS = Store->getAddressSpace();
12469 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12470 Store->getAlign().value() < VT.getStoreSize() &&
12471 VT.getSizeInBits() > 32) {
12472 return SplitVectorStore(Op, DAG);
12473 }
12474
12475 MachineFunction &MF = DAG.getMachineFunction();
12476 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12477 // If there is a possibility that flat instruction access scratch memory
12478 // then we need to use the same legalization rules we use for private.
12479 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12480 !Subtarget->hasMultiDwordFlatScratchAddressing())
12481 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12482 ? AMDGPUAS::PRIVATE_ADDRESS
12483 : AMDGPUAS::GLOBAL_ADDRESS;
12484
12485 unsigned NumElements = VT.getVectorNumElements();
12486 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12487 if (NumElements > 4)
12488 return SplitVectorStore(Op, DAG);
12489 // v3 stores not supported on SI.
12490 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12491 return SplitVectorStore(Op, DAG);
12492
12493 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12494 VT, *Store->getMemOperand()))
12495 return expandUnalignedStore(Store, DAG);
12496
12497 return SDValue();
12498 }
12499 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12500 switch (Subtarget->getMaxPrivateElementSize()) {
12501 case 4:
12502 return scalarizeVectorStore(Store, DAG);
12503 case 8:
12504 if (NumElements > 2)
12505 return SplitVectorStore(Op, DAG);
12506 return SDValue();
12507 case 16:
12508 if (NumElements > 4 ||
12509 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12510 return SplitVectorStore(Op, DAG);
12511 return SDValue();
12512 default:
12513 llvm_unreachable("unsupported private_element_size");
12514 }
12515 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12516 unsigned Fast = 0;
12517 auto Flags = Store->getMemOperand()->getFlags();
12518 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12519 Store->getAlign(), Flags, &Fast) &&
12520 Fast > 1)
12521 return SDValue();
12522
12523 if (VT.isVector())
12524 return SplitVectorStore(Op, DAG);
12525
12526 return expandUnalignedStore(Store, DAG);
12527 }
12528
12529 // Probably an invalid store. If so we'll end up emitting a selection error.
12530 return SDValue();
12531}
12532
12533// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12534SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12535 SDLoc SL(Op);
12536 assert(!Subtarget->has16BitInsts());
12537 SDNodeFlags Flags = Op->getFlags();
12538 SDValue Ext =
12539 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12540
12541 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12542 SDValue Sqrt =
12543 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12544
12545 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12546 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12547}
12548
12549SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12550 SDLoc DL(Op);
12551 SDNodeFlags Flags = Op->getFlags();
12552 MVT VT = Op.getValueType().getSimpleVT();
12553 const SDValue X = Op.getOperand(0);
12554
12555 if (allowApproxFunc(DAG, Flags)) {
12556 // Instruction is 1ulp but ignores denormals.
12557 return DAG.getNode(
12558 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12559 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12560 }
12561
12562 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12563 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12564
12565 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12566
12567 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12568
12569 SDValue SqrtX =
12570 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12571
12572 SDValue SqrtS;
12573 if (needsDenormHandlingF32(DAG, X, Flags)) {
12574 SDValue SqrtID =
12575 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12576 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12577
12578 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12579 SDValue SqrtSNextDownInt =
12580 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12581 DAG.getAllOnesConstant(DL, MVT::i32));
12582 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12583
12584 SDValue NegSqrtSNextDown =
12585 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12586
12587 SDValue SqrtVP =
12588 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12589
12590 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12591 DAG.getConstant(1, DL, MVT::i32));
12592 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12593
12594 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12595 SDValue SqrtVS =
12596 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12597
12598 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12599 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12600
12601 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12602 Flags);
12603
12604 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12605 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12606 Flags);
12607 } else {
12608 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12609
12610 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12611
12612 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12613 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12614 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12615
12616 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12617 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12618 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12619
12620 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12621 SDValue SqrtD =
12622 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12623 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12624 }
12625
12626 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12627
12628 SDValue ScaledDown =
12629 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12630
12631 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12632 SDValue IsZeroOrInf =
12633 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12634 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12635
12636 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12637}
12638
12639SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12640 // For the f64 type, the SQRT and RSQ instructions don't have the required
12641 // precision, so we apply Goldschmidt's algorithm to improve the result:
12642 //
12643 // y0 = rsq(x)
12644 // g0 = x * y0
12645 // h0 = 0.5 * y0
12646 //
12647 // r0 = 0.5 - h0 * g0
12648 // g1 = g0 * r0 + g0
12649 // h1 = h0 * r0 + h0
12650 //
12651 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12652 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12653 // h2 = h1 * r1 + h1
12654 //
12655 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12656 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12657 //
12658 // sqrt(x) = g3
12659
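// In addition (an illustrative reading of the constants used below), inputs
// smaller than roughly 2^-767 are scaled up by 2^+256 before the iteration and
// the result is scaled back down by 2^-128, since sqrt halves the exponent.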
12660 SDNodeFlags Flags = Op->getFlags();
12661
12662 SDLoc DL(Op);
12663
12664 SDValue X = Op.getOperand(0);
12665 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12666
12667 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12668
12669 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12670
12671 // Scale up input if it is too small.
12672 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12673 SDValue ScaleUp =
12674 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12675 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12676
12677 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12678
12679 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12680
12681 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12682 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12683
12684 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12685 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12686
12687 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12688
12689 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12690
12691 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12692 SDValue SqrtD0 =
12693 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12694
12695 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12696
12697 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12698 SDValue SqrtD1 =
12699 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12700
12701 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12702
12703 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12704 SDValue ScaleDown =
12705 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12706 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12707
12708 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12709 // with finite only or nsz because rsq(+/-0) = +/-inf
12710
12711 // TODO: Check for DAZ and expand to subnormals
12712 SDValue IsZeroOrInf =
12713 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12714 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12715
12716 // If x is +INF, +0, or -0, use its original value
12717 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12718 Flags);
12719}
12720
12721SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12722 SDLoc DL(Op);
12723 EVT VT = Op.getValueType();
12724 SDValue Arg = Op.getOperand(0);
12725 SDValue TrigVal;
12726
12727 // Propagate fast-math flags so that the multiply we introduce can be folded
12728 // if Arg is already the result of a multiply by constant.
12729 auto Flags = Op->getFlags();
12730
12731 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12732
12733 if (Subtarget->hasTrigReducedRange()) {
12734 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12735 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12736 } else {
12737 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12738 }
12739
12740 switch (Op.getOpcode()) {
12741 case ISD::FCOS:
12742 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12743 case ISD::FSIN:
12744 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12745 default:
12746 llvm_unreachable("Wrong trig opcode");
12747 }
12748}
12749
12750SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12751 SelectionDAG &DAG) const {
12752 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12753 assert(AtomicNode->isCompareAndSwap());
12754 unsigned AS = AtomicNode->getAddressSpace();
12755
12756 // No custom lowering required for local address space
12757 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12758 return Op;
12759
12760 // Non-local address space requires custom lowering for atomic compare
12761 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12762 SDLoc DL(Op);
12763 SDValue ChainIn = Op.getOperand(0);
12764 SDValue Addr = Op.getOperand(1);
12765 SDValue Old = Op.getOperand(2);
12766 SDValue New = Op.getOperand(3);
12767 EVT VT = Op.getValueType();
12768 MVT SimpleVT = VT.getSimpleVT();
12769 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12770
12771 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12772 SDValue Ops[] = {ChainIn, Addr, NewOld};
12773
12774 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12775 Op->getVTList(), Ops, VT,
12776 AtomicNode->getMemOperand());
12777}
12778
12779//===----------------------------------------------------------------------===//
12780// Custom DAG optimizations
12781//===----------------------------------------------------------------------===//
12782
12783SDValue
12784SITargetLowering::performUCharToFloatCombine(SDNode *N,
12785 DAGCombinerInfo &DCI) const {
12786 EVT VT = N->getValueType(0);
12787 EVT ScalarVT = VT.getScalarType();
12788 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12789 return SDValue();
12790
12791 SelectionDAG &DAG = DCI.DAG;
12792 SDLoc DL(N);
12793
12794 SDValue Src = N->getOperand(0);
12795 EVT SrcVT = Src.getValueType();
12796
12797 // TODO: We could try to match extracting the higher bytes, which would be
12798 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12799 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12800 // about in practice.
12801 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12802 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12803 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12804 DCI.AddToWorklist(Cvt.getNode());
12805
12806 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12807 if (ScalarVT != MVT::f32) {
12808 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12809 DAG.getTargetConstant(0, DL, MVT::i32));
12810 }
12811 return Cvt;
12812 }
12813 }
12814
12815 return SDValue();
12816}
12817
12818SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12819 DAGCombinerInfo &DCI) const {
12820 SDValue MagnitudeOp = N->getOperand(0);
12821 SDValue SignOp = N->getOperand(1);
12822
12823 // The generic combine for fcopysign + fp cast is too conservative with
12824 // vectors, and also gets confused by the splitting we will perform here, so
12825 // peek through FP casts.
12826 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12827 SignOp.getOpcode() == ISD::FP_ROUND)
12828 SignOp = SignOp.getOperand(0);
12829
12830 SelectionDAG &DAG = DCI.DAG;
12831 SDLoc DL(N);
12832 EVT SignVT = SignOp.getValueType();
12833
12834 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12835 // lower half with a copy.
12836 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12837 EVT MagVT = MagnitudeOp.getValueType();
12838
12839 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12840
12841 if (MagVT.getScalarType() == MVT::f64) {
12842 EVT F32VT = MagVT.isVector()
12843 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12844 : MVT::v2f32;
12845
12846 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12847
12848 SmallVector<SDValue, 8> NewElts;
12849 for (unsigned I = 0; I != NumElts; ++I) {
12850 SDValue MagLo =
12851 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12852 DAG.getConstant(2 * I, DL, MVT::i32));
12853 SDValue MagHi =
12854 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12855 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12856
12857 SDValue SignOpElt =
12858 MagVT.isVector()
12859 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12860 SignOp, DAG.getConstant(I, DL, MVT::i32))
12861 : SignOp;
12862
12863 SDValue HiOp =
12864 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12865
12866 SDValue Vector =
12867 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12868
12869 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12870 NewElts.push_back(NewElt);
12871 }
12872
12873 if (NewElts.size() == 1)
12874 return NewElts[0];
12875
12876 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12877 }
12878
12879 if (SignVT.getScalarType() != MVT::f64)
12880 return SDValue();
12881
12882 // Reduce width of sign operand, we only need the highest bit.
12883 //
12884 // fcopysign f64:x, f64:y ->
12885 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12886 // TODO: In some cases it might make sense to go all the way to f16.
12887
12888 EVT F32VT = MagVT.isVector()
12889 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12890 : MVT::v2f32;
12891
12892 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12893
12894 SmallVector<SDValue, 8> F32Signs;
12895 for (unsigned I = 0; I != NumElts; ++I) {
12896 // Take sign from odd elements of cast vector
12897 SDValue SignAsF32 =
12898 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12899 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12900 F32Signs.push_back(SignAsF32);
12901 }
12902
12903 SDValue NewSign =
12904 NumElts == 1
12905 ? F32Signs.back()
12906 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12907 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12908 F32Signs);
12909
12910 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12911 NewSign);
12912}
12913
12914// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12915// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12916// bits
12917
12918// This is a variant of
12919// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12920//
12921 // The normal DAG combiner will do this, but only if the add has one use,
12922 // since otherwise the transform would increase the number of instructions.
12923//
12924// This prevents us from seeing a constant offset that can be folded into a
12925// memory instruction's addressing mode. If we know the resulting add offset of
12926// a pointer can be folded into an addressing offset, we can replace the pointer
12927// operand with the add of new constant offset. This eliminates one of the uses,
12928// and may allow the remaining use to also be simplified.
12929//
12930SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12931 EVT MemVT,
12932 DAGCombinerInfo &DCI) const {
12933 SDValue N0 = N->getOperand(0);
12934 SDValue N1 = N->getOperand(1);
12935
12936 // We only do this to handle cases where it's profitable when there are
12937 // multiple uses of the add, so defer to the standard combine.
12938 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
12939 return SDValue();
12940
12941 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12942 if (!CN1)
12943 return SDValue();
12944
12945 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12946 if (!CAdd)
12947 return SDValue();
12948
12949 SelectionDAG &DAG = DCI.DAG;
12950
12951 if (N0->getOpcode() == ISD::OR &&
12952 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12953 return SDValue();
12954
12955 // If the resulting offset is too large, we can't fold it into the
12956 // addressing mode offset.
12957 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12958 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12959
12960 AddrMode AM;
12961 AM.HasBaseReg = true;
12962 AM.BaseOffs = Offset.getSExtValue();
12963 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12964 return SDValue();
12965
12966 SDLoc SL(N);
12967 EVT VT = N->getValueType(0);
12968
12969 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12970 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12971
12972 SDNodeFlags Flags;
12973 Flags.setNoUnsignedWrap(
12974 N->getFlags().hasNoUnsignedWrap() &&
12975 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12976
12977 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
12978 // be sure that the new left operand is a proper base pointer.
12979 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12980}
12981
12982 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12983 /// is offset by the chain and intrinsic ID. Theoretically we would also need to check the
12984/// specific intrinsic, but they all place the pointer operand first.
12985static unsigned getBasePtrIndex(const MemSDNode *N) {
12986 switch (N->getOpcode()) {
12987 case ISD::STORE:
12988 case ISD::INTRINSIC_W_CHAIN:
12989 case ISD::INTRINSIC_VOID:
12990 return 2;
12991 default:
12992 return 1;
12993 }
12994}
12995
12996SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12997 DAGCombinerInfo &DCI) const {
12998 SelectionDAG &DAG = DCI.DAG;
12999
13000 unsigned PtrIdx = getBasePtrIndex(N);
13001 SDValue Ptr = N->getOperand(PtrIdx);
13002
13003 // TODO: We could also do this for multiplies.
13004 if (Ptr.getOpcode() == ISD::SHL) {
13005 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13006 N->getMemoryVT(), DCI);
13007 if (NewPtr) {
13008 SmallVector<SDValue, 8> NewOps(N->ops());
13009
13010 NewOps[PtrIdx] = NewPtr;
13011 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13012 }
13013 }
13014
13015 return SDValue();
13016}
13017
13018static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13019 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13020 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13021 (Opc == ISD::XOR && Val == 0);
13022}
13023
13024 // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor ops. This
13025 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13026// integer combine opportunities since most 64-bit operations are decomposed
13027// this way. TODO: We won't want this for SALU especially if it is an inline
13028// immediate.
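// Illustrative example (not part of the original source): for a 64-bit
// (and x, 0x00000000ffffffff), ValLo = 0xffffffff and ValHi = 0, so both
// 32-bit halves are reducible: the low half folds to lo_32(x) and the high
// half folds to 0, leaving no 64-bit AND to materialize.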
13029SDValue SITargetLowering::splitBinaryBitConstantOp(
13030 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13031 const ConstantSDNode *CRHS) const {
13032 uint64_t Val = CRHS->getZExtValue();
13033 uint32_t ValLo = Lo_32(Val);
13034 uint32_t ValHi = Hi_32(Val);
13035 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13036
13037 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13038 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13039 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13040 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13041 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13042 !CRHS->user_begin()->isDivergent())
13043 return SDValue();
13044
13045 // If we need to materialize a 64-bit immediate, it will be split up later
13046 // anyway. Avoid creating the harder to understand 64-bit immediate
13047 // materialization.
13048 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13049 }
13050
13051 return SDValue();
13052}
13053
13055 if (V.getValueType() != MVT::i1)
13056 return false;
13057 switch (V.getOpcode()) {
13058 default:
13059 break;
13060 case ISD::SETCC:
13061 case ISD::IS_FPCLASS:
13062 case AMDGPUISD::FP_CLASS:
13063 return true;
13064 case ISD::AND:
13065 case ISD::OR:
13066 case ISD::XOR:
13067 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13068 case ISD::SADDO:
13069 case ISD::UADDO:
13070 case ISD::SSUBO:
13071 case ISD::USUBO:
13072 case ISD::SMULO:
13073 case ISD::UMULO:
13074 return V.getResNo() == 1;
13075 case ISD::INTRINSIC_WO_CHAIN: {
13076 unsigned IntrinsicID = V.getConstantOperandVal(0);
13077 switch (IntrinsicID) {
13078 case Intrinsic::amdgcn_is_shared:
13079 case Intrinsic::amdgcn_is_private:
13080 return true;
13081 default:
13082 return false;
13083 }
13084
13085 return false;
13086 }
13087 }
13088 return false;
13089}
13090
13091// If a constant has all zeroes or all ones within each byte return it.
13092// Otherwise return 0.
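// Illustrative example (not part of the original source):
//   getConstantPermuteMask(0x00ff00ff) == 0x00ff00ff  (every byte is 0x00 or 0xff)
//   getConstantPermuteMask(0x00f0ffff) == 0           (byte 2 is only partially set)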
13093 static uint32_t getConstantPermuteMask(uint32_t C) {
13094 // 0xff for any zero byte in the mask
13095 uint32_t ZeroByteMask = 0;
13096 if (!(C & 0x000000ff))
13097 ZeroByteMask |= 0x000000ff;
13098 if (!(C & 0x0000ff00))
13099 ZeroByteMask |= 0x0000ff00;
13100 if (!(C & 0x00ff0000))
13101 ZeroByteMask |= 0x00ff0000;
13102 if (!(C & 0xff000000))
13103 ZeroByteMask |= 0xff000000;
13104 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13105 if ((NonZeroByteMask & C) != NonZeroByteMask)
13106 return 0; // Partial bytes selected.
13107 return C;
13108}
13109
13110// Check if a node selects whole bytes from its operand 0 starting at a byte
13111 // boundary while masking the rest. Returns the select mask as used by
13112 // v_perm_b32, or ~0u if the pattern does not match.
13113// Note byte select encoding:
13114// value 0-3 selects corresponding source byte;
13115// value 0xc selects zero;
13116// value 0xff selects 0xff.
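// Illustrative example (not part of the original source): for (shl x, 8) the
// returned mask is 0x0201000c, i.e. destination byte 0 is forced to zero
// (0x0c) and destination bytes 1-3 select source bytes 0-2.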
13117 static uint32_t getPermuteMask(SDValue V) {
13118 assert(V.getValueSizeInBits() == 32);
13119
13120 if (V.getNumOperands() != 2)
13121 return ~0;
13122
13123 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13124 if (!N1)
13125 return ~0;
13126
13127 uint32_t C = N1->getZExtValue();
13128
13129 switch (V.getOpcode()) {
13130 default:
13131 break;
13132 case ISD::AND:
13133 if (uint32_t ConstMask = getConstantPermuteMask(C))
13134 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13135 break;
13136
13137 case ISD::OR:
13138 if (uint32_t ConstMask = getConstantPermuteMask(C))
13139 return (0x03020100 & ~ConstMask) | ConstMask;
13140 break;
13141
13142 case ISD::SHL:
13143 if (C % 8)
13144 return ~0;
13145
13146 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13147
13148 case ISD::SRL:
13149 if (C % 8)
13150 return ~0;
13151
13152 return uint32_t(0x0c0c0c0c03020100ull >> C);
13153 }
13154
13155 return ~0;
13156}
13157
13158SDValue SITargetLowering::performAndCombine(SDNode *N,
13159 DAGCombinerInfo &DCI) const {
13160 if (DCI.isBeforeLegalize())
13161 return SDValue();
13162
13163 SelectionDAG &DAG = DCI.DAG;
13164 EVT VT = N->getValueType(0);
13165 SDValue LHS = N->getOperand(0);
13166 SDValue RHS = N->getOperand(1);
13167
13168 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13169 if (VT == MVT::i64 && CRHS) {
13170 if (SDValue Split =
13171 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13172 return Split;
13173 }
13174
13175 if (CRHS && VT == MVT::i32) {
13176 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13177 // nb = number of trailing zeroes in mask
13178 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13179 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
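// Illustrative example (not part of the original source): for
// (and (srl x, 8), 0xff00), nb = 8 and the combine produces
// (shl (bfe x, 16, 8), 8), i.e. bits [23:16] of x repositioned at [15:8].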
13180 uint64_t Mask = CRHS->getZExtValue();
13181 unsigned Bits = llvm::popcount(Mask);
13182 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13183 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13184 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13185 unsigned Shift = CShift->getZExtValue();
13186 unsigned NB = CRHS->getAPIntValue().countr_zero();
13187 unsigned Offset = NB + Shift;
13188 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13189 SDLoc SL(N);
13190 SDValue BFE =
13191 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13192 DAG.getConstant(Offset, SL, MVT::i32),
13193 DAG.getConstant(Bits, SL, MVT::i32));
13194 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13195 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13196 DAG.getValueType(NarrowVT));
13197 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13198 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13199 return Shl;
13200 }
13201 }
13202 }
13203
13204 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13205 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13206 isa<ConstantSDNode>(LHS.getOperand(2))) {
13207 uint32_t Sel = getConstantPermuteMask(Mask);
13208 if (!Sel)
13209 return SDValue();
13210
13211 // Select 0xc for all zero bytes
13212 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13213 SDLoc DL(N);
13214 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13215 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13216 }
13217 }
13218
13219 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13220 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
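// Illustrative note (not part of the original source): the complement of the
// NaN and infinity bits within the 10-bit fp_class mask is exactly the six
// finite classes (+-normal, +-subnormal, +-zero), which is what the Mask
// below encodes and what the static_assert checks.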
13221 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13222 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13223 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13224
13225 SDValue X = LHS.getOperand(0);
13226 SDValue Y = RHS.getOperand(0);
13227 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13228 !isTypeLegal(X.getValueType()))
13229 return SDValue();
13230
13231 if (LCC == ISD::SETO) {
13232 if (X != LHS.getOperand(1))
13233 return SDValue();
13234
13235 if (RCC == ISD::SETUNE) {
13236 const ConstantFPSDNode *C1 =
13237 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13238 if (!C1 || !C1->isInfinity() || C1->isNegative())
13239 return SDValue();
13240
13241 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13242 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13243 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13244 SIInstrFlags::P_NORMAL;
13245
13246 static_assert(
13247 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13248 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13249 0x3ff) == Mask,
13250 "mask not equal");
13251
13252 SDLoc DL(N);
13253 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13254 DAG.getConstant(Mask, DL, MVT::i32));
13255 }
13256 }
13257 }
13258
13259 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13260 std::swap(LHS, RHS);
13261
13262 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13263 RHS.hasOneUse()) {
13264 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13265 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
13266 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13267 //
13268 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13269 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13270 (RHS.getOperand(0) == LHS.getOperand(0) &&
13271 LHS.getOperand(0) == LHS.getOperand(1))) {
13272 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13273 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13274 : Mask->getZExtValue() & OrdMask;
13275
13276 SDLoc DL(N);
13277 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13278 DAG.getConstant(NewMask, DL, MVT::i32));
13279 }
13280 }
13281
13282 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13283 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13284 // and x, (sext cc from i1) => select cc, x, 0
13285 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13286 std::swap(LHS, RHS);
13287 if (isBoolSGPR(RHS.getOperand(0)))
13288 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13289 DAG.getConstant(0, SDLoc(N), MVT::i32));
13290 }
13291
13292 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13293 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13294 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13295 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13296 uint32_t LHSMask = getPermuteMask(LHS);
13297 uint32_t RHSMask = getPermuteMask(RHS);
13298 if (LHSMask != ~0u && RHSMask != ~0u) {
13299 // Canonicalize the expression in an attempt to have fewer unique masks
13300 // and therefore fewer registers used to hold the masks.
13301 if (LHSMask > RHSMask) {
13302 std::swap(LHSMask, RHSMask);
13303 std::swap(LHS, RHS);
13304 }
13305
13306 // Select 0xc for each lane used from the source operand. Zero bytes have 0xc
13307 // in the mask, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
13308 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13309 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13310
13311 // Check if we need to combine values from two sources within a byte.
13312 if (!(LHSUsedLanes & RHSUsedLanes) &&
13313 // If we select high and lower word keep it for SDWA.
13314 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13315 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13316 // Each byte in each mask is either a selector value 0-3, or has higher
13317 // bits set in either of the masks: 0xff for a 0xff byte or 0x0c for a
13318 // zero byte. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13319 // the mask which is not 0xff wins. By and-ing both masks we get a correct
13320 // result, except that 0x0c must still be corrected to give exactly 0x0c.
13321 uint32_t Mask = LHSMask & RHSMask;
13322 for (unsigned I = 0; I < 32; I += 8) {
13323 uint32_t ByteSel = 0xff << I;
13324 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13325 Mask &= (0x0c << I) & 0xffffffff;
13326 }
13327
13328 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13329 // or 0x0c.
13330 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13331 SDLoc DL(N);
13332
13333 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13334 RHS.getOperand(0),
13335 DAG.getConstant(Sel, DL, MVT::i32));
13336 }
13337 }
13338 }
13339
13340 return SDValue();
13341}
13342
13343// A key component of v_perm is a mapping between byte position of the src
13344// operands, and the byte position of the dest. To provide such, we need: 1. the
13345// node that provides x byte of the dest of the OR, and 2. the byte of the node
13346// used to provide that x byte. calculateByteProvider finds which node provides
13347// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13348 // and finds an ultimate src and byte position. For example: the supported
13349// LoadCombine pattern for vector loads is as follows
13350// t1
13351// or
13352// / \
13353// t2 t3
13354// zext shl
13355// | | \
13356// t4 t5 16
13357// or anyext
13358// / \ |
13359// t6 t7 t8
13360// srl shl or
13361// / | / \ / \
13362// t9 t10 t11 t12 t13 t14
13363// trunc* 8 trunc* 8 and and
13364// | | / | | \
13365// t15 t16 t17 t18 t19 t20
13366// trunc* 255 srl -256
13367// | / \
13368// t15 t15 16
13369//
13370// *In this example, the truncs are from i32->i16
13371//
13372// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13373// respectively. calculateSrcByte would find (given node) -> ultimate src &
13374 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13375// After finding the mapping, we can combine the tree into vperm t15, t16,
13376// 0x05000407
13377
13378// Find the source and byte position from a node.
13379// \p DestByte is the byte position of the dest of the or that the src
13380// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13381// dest of the or byte. \p Depth tracks how many recursive iterations we have
13382// performed.
13383static const std::optional<ByteProvider<SDValue>>
13384calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13385 unsigned Depth = 0) {
13386 // We may need to recursively traverse a series of SRLs
13387 if (Depth >= 6)
13388 return std::nullopt;
13389
13390 if (Op.getValueSizeInBits() < 8)
13391 return std::nullopt;
13392
13393 if (Op.getValueType().isVector())
13394 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13395
13396 switch (Op->getOpcode()) {
13397 case ISD::TRUNCATE: {
13398 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13399 }
13400
13401 case ISD::SIGN_EXTEND:
13402 case ISD::ZERO_EXTEND:
13403 case ISD::SIGN_EXTEND_INREG: {
13404 SDValue NarrowOp = Op->getOperand(0);
13405 auto NarrowVT = NarrowOp.getValueType();
13406 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13407 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13408 NarrowVT = VTSign->getVT();
13409 }
13410 if (!NarrowVT.isByteSized())
13411 return std::nullopt;
13412 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13413
13414 if (SrcIndex >= NarrowByteWidth)
13415 return std::nullopt;
13416 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13417 }
13418
13419 case ISD::SRA:
13420 case ISD::SRL: {
13421 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13422 if (!ShiftOp)
13423 return std::nullopt;
13424
13425 uint64_t BitShift = ShiftOp->getZExtValue();
13426
13427 if (BitShift % 8 != 0)
13428 return std::nullopt;
13429
13430 SrcIndex += BitShift / 8;
13431
13432 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13433 }
13434
13435 default: {
13436 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13437 }
13438 }
13439 llvm_unreachable("fully handled switch");
13440}
13441
13442// For a byte position in the result of an Or, traverse the tree and find the
13443// node (and the byte of the node) which ultimately provides this {Or,
13444// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13445// the byte position of the Op that corresponds with the originally requested
13446 // byte of the Or. \p Depth tracks how many recursive iterations we have
13447// performed. \p StartingIndex is the originally requested byte of the Or
13448static const std::optional<ByteProvider<SDValue>>
13449calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13450 unsigned StartingIndex = 0) {
13451 // Finding Src tree of RHS of or typically requires at least 1 additional
13452 // depth
13453 if (Depth > 6)
13454 return std::nullopt;
13455
13456 unsigned BitWidth = Op.getScalarValueSizeInBits();
13457 if (BitWidth % 8 != 0)
13458 return std::nullopt;
13459 if (Index > BitWidth / 8 - 1)
13460 return std::nullopt;
13461
13462 bool IsVec = Op.getValueType().isVector();
13463 switch (Op.getOpcode()) {
13464 case ISD::OR: {
13465 if (IsVec)
13466 return std::nullopt;
13467
13468 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13469 StartingIndex);
13470 if (!RHS)
13471 return std::nullopt;
13472 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13473 StartingIndex);
13474 if (!LHS)
13475 return std::nullopt;
13476 // A well formed Or will have two ByteProviders for each byte, one of which
13477 // is constant zero
13478 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13479 return std::nullopt;
13480 if (!LHS || LHS->isConstantZero())
13481 return RHS;
13482 if (!RHS || RHS->isConstantZero())
13483 return LHS;
13484 return std::nullopt;
13485 }
13486
13487 case ISD::AND: {
13488 if (IsVec)
13489 return std::nullopt;
13490
13491 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13492 if (!BitMaskOp)
13493 return std::nullopt;
13494
13495 uint32_t BitMask = BitMaskOp->getZExtValue();
13496 // Bits we expect for our StartingIndex
13497 uint32_t IndexMask = 0xFF << (Index * 8);
13498
13499 if ((IndexMask & BitMask) != IndexMask) {
13500 // If the result of the and partially provides the byte, then it
13501 // is not well formatted
13502 if (IndexMask & BitMask)
13503 return std::nullopt;
13503
13504 return ByteProvider<SDValue>::getConstantZero();
13505 }
13506
13507 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13508 }
13509
13510 case ISD::FSHR: {
13511 if (IsVec)
13512 return std::nullopt;
13513
13514 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13515 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13516 if (!ShiftOp || Op.getValueType().isVector())
13517 return std::nullopt;
13518
13519 uint64_t BitsProvided = Op.getValueSizeInBits();
13520 if (BitsProvided % 8 != 0)
13521 return std::nullopt;
13522
13523 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13524 if (BitShift % 8)
13525 return std::nullopt;
13526
13527 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13528 uint64_t ByteShift = BitShift / 8;
13529
13530 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13531 uint64_t BytesProvided = BitsProvided / 8;
13532 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13533 NewIndex %= BytesProvided;
13534 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13535 }
13536
13537 case ISD::SRA:
13538 case ISD::SRL: {
13539 if (IsVec)
13540 return std::nullopt;
13541
13542 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13543 if (!ShiftOp)
13544 return std::nullopt;
13545
13546 uint64_t BitShift = ShiftOp->getZExtValue();
13547 if (BitShift % 8)
13548 return std::nullopt;
13549
13550 auto BitsProvided = Op.getScalarValueSizeInBits();
13551 if (BitsProvided % 8 != 0)
13552 return std::nullopt;
13553
13554 uint64_t BytesProvided = BitsProvided / 8;
13555 uint64_t ByteShift = BitShift / 8;
13556 // The dest of the shift will have good bytes [0 : (BytesProvided - ByteShift)).
13557 // If the byte we are trying to provide (as tracked by index) falls in this
13558 // range, then the SRL provides the byte. The byte of interest of the src of
13559 // the SRL is Index + ByteShift
13560 return BytesProvided - ByteShift > Index
13561 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13562 Index + ByteShift)
13563 : ByteProvider<SDValue>::getConstantZero();
13564 }
13565
13566 case ISD::SHL: {
13567 if (IsVec)
13568 return std::nullopt;
13569
13570 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13571 if (!ShiftOp)
13572 return std::nullopt;
13573
13574 uint64_t BitShift = ShiftOp->getZExtValue();
13575 if (BitShift % 8 != 0)
13576 return std::nullopt;
13577 uint64_t ByteShift = BitShift / 8;
13578
13579 // If we are shifting by an amount greater than (or equal to)
13580 // the index we are trying to provide, then it provides 0s. If not,
13581 // then these bytes are not definitively 0s, and the corresponding byte
13582 // of interest is Index - ByteShift of the src
13583 return Index < ByteShift
13584 ? ByteProvider<SDValue>::getConstantZero()
13585 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13586 Depth + 1, StartingIndex);
13587 }
13588 case ISD::ANY_EXTEND:
13589 case ISD::SIGN_EXTEND:
13590 case ISD::ZERO_EXTEND:
13591 case ISD::SIGN_EXTEND_INREG:
13592 case ISD::AssertZext:
13593 case ISD::AssertSext: {
13594 if (IsVec)
13595 return std::nullopt;
13596
13597 SDValue NarrowOp = Op->getOperand(0);
13598 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13599 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13600 Op->getOpcode() == ISD::AssertZext ||
13601 Op->getOpcode() == ISD::AssertSext) {
13602 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13603 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13604 }
13605 if (NarrowBitWidth % 8 != 0)
13606 return std::nullopt;
13607 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13608
13609 if (Index >= NarrowByteWidth)
13610 return Op.getOpcode() == ISD::ZERO_EXTEND
13611 ? std::optional<ByteProvider<SDValue>>(
13612 ByteProvider<SDValue>::getConstantZero())
13613 : std::nullopt;
13614 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13615 }
13616
13617 case ISD::TRUNCATE: {
13618 if (IsVec)
13619 return std::nullopt;
13620
13621 uint64_t NarrowByteWidth = BitWidth / 8;
13622
13623 if (NarrowByteWidth >= Index) {
13624 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13625 StartingIndex);
13626 }
13627
13628 return std::nullopt;
13629 }
13630
13631 case ISD::CopyFromReg: {
13632 if (BitWidth / 8 > Index)
13633 return calculateSrcByte(Op, StartingIndex, Index);
13634
13635 return std::nullopt;
13636 }
13637
13638 case ISD::LOAD: {
13639 auto *L = cast<LoadSDNode>(Op.getNode());
13640
13641 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13642 if (NarrowBitWidth % 8 != 0)
13643 return std::nullopt;
13644 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13645
13646 // If the width of the load does not reach the byte we are trying to provide
13647 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13648 // question
13649 if (Index >= NarrowByteWidth) {
13650 return L->getExtensionType() == ISD::ZEXTLOAD
13651 ? std::optional<ByteProvider<SDValue>>(
13652 ByteProvider<SDValue>::getConstantZero())
13653 : std::nullopt;
13654 }
13655
13656 if (NarrowByteWidth > Index) {
13657 return calculateSrcByte(Op, StartingIndex, Index);
13658 }
13659
13660 return std::nullopt;
13661 }
13662
13663 case ISD::BSWAP: {
13664 if (IsVec)
13665 return std::nullopt;
13666
13667 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13668 Depth + 1, StartingIndex);
13669 }
13670
13671 case ISD::EXTRACT_VECTOR_ELT: {
13672 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13673 if (!IdxOp)
13674 return std::nullopt;
13675 auto VecIdx = IdxOp->getZExtValue();
13676 auto ScalarSize = Op.getScalarValueSizeInBits();
13677 if (ScalarSize < 32)
13678 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13679 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13680 StartingIndex, Index);
13681 }
13682
13683 case AMDGPUISD::PERM: {
13684 if (IsVec)
13685 return std::nullopt;
13686
13687 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13688 if (!PermMask)
13689 return std::nullopt;
13690
13691 auto IdxMask =
13692 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13693 if (IdxMask > 0x07 && IdxMask != 0x0c)
13694 return std::nullopt;
13695
13696 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13697 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13698
13699 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13700 : ByteProvider<SDValue>(
13701 ByteProvider<SDValue>::getConstantZero());
13702 }
13703
13704 default: {
13705 return std::nullopt;
13706 }
13707 }
13708
13709 llvm_unreachable("fully handled switch");
13710}
13711
13712 // Returns true if the Operand is a scalar extended from a 16-bit value
13713static bool isExtendedFrom16Bits(SDValue &Operand) {
13714
13715 switch (Operand.getOpcode()) {
13716 case ISD::ANY_EXTEND:
13717 case ISD::SIGN_EXTEND:
13718 case ISD::ZERO_EXTEND: {
13719 auto OpVT = Operand.getOperand(0).getValueType();
13720 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13721 }
13722 case ISD::LOAD: {
13723 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13724 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13725 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13726 ExtType == ISD::EXTLOAD) {
13727 auto MemVT = L->getMemoryVT();
13728 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13729 }
13730 return L->getMemoryVT().getSizeInBits() == 16;
13731 }
13732 default:
13733 return false;
13734 }
13735}
13736
13737 // Returns true if the mask matches consecutive bytes, and the first byte
13738 // begins at an even (16-bit aligned) byte offset from the 0th byte
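// Illustrative example (not part of the original source): a byte-pair mask of
// 0x0302 (Low8 = 2, Hi8 = 3) selects a contiguous, 2-byte-aligned 16-bit field
// and returns true; 0x0201 is contiguous but starts at an odd byte, so it
// returns false.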
13739static bool addresses16Bits(int Mask) {
13740 int Low8 = Mask & 0xff;
13741 int Hi8 = (Mask & 0xff00) >> 8;
13742
13743 assert(Low8 < 8 && Hi8 < 8);
13744 // Are the bytes contiguous in the order of increasing addresses.
13745 bool IsConsecutive = (Hi8 - Low8 == 1);
13746 // Is the first byte at a location that is aligned for 16-bit instructions?
13747 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13748 // In this case, we still need code to extract the 16 bit operand, so it
13749 // is better to use i8 v_perm
13750 bool Is16Aligned = !(Low8 % 2);
13751
13752 return IsConsecutive && Is16Aligned;
13753}
13754
13755// Do not lower into v_perm if the operands are actually 16 bit
13756// and the selected bits (based on PermMask) correspond with two
13757// easily addressable 16 bit operands.
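// Illustrative example (not part of the original source): if both operands are
// 16-bit values and PermMask is 0x07060302, each half of the result selects a
// whole, aligned 16-bit operand half, so this returns false and the v_perm
// lowering is skipped in favor of 16-bit ops.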
13758 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13759 SDValue &OtherOp) {
13760 int Low16 = PermMask & 0xffff;
13761 int Hi16 = (PermMask & 0xffff0000) >> 16;
13762
13763 auto TempOp = peekThroughBitcasts(Op);
13764 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13765
13766 auto OpIs16Bit =
13767 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13768 if (!OpIs16Bit)
13769 return true;
13770
13771 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13772 isExtendedFrom16Bits(TempOtherOp);
13773 if (!OtherOpIs16Bit)
13774 return true;
13775
13776 // Do we cleanly address both
13777 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13778}
13779
13780 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13781 unsigned DWordOffset) {
13782 SDValue Ret;
13783
13784 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13785 // ByteProvider must be at least 8 bits
13786 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13787
13788 if (TypeSize <= 32)
13789 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13790
13791 if (Src.getValueType().isVector()) {
13792 auto ScalarTySize = Src.getScalarValueSizeInBits();
13793 auto ScalarTy = Src.getValueType().getScalarType();
13794 if (ScalarTySize == 32) {
13795 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13796 DAG.getConstant(DWordOffset, SL, MVT::i32));
13797 }
13798 if (ScalarTySize > 32) {
13799 Ret = DAG.getNode(
13800 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13801 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13802 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13803 if (ShiftVal)
13804 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13805 DAG.getConstant(ShiftVal, SL, MVT::i32));
13806 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13807 }
13808
13809 assert(ScalarTySize < 32);
13810 auto NumElements = TypeSize / ScalarTySize;
13811 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13812 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13813 auto NumElementsIn32 = 32 / ScalarTySize;
13814 auto NumAvailElements = DWordOffset < Trunc32Elements
13815 ? NumElementsIn32
13816 : NumElements - NormalizedTrunc;
13817
13818 SmallVector<SDValue, 4> VecSrcs;
13819 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13820 NumAvailElements);
13821
13822 Ret = DAG.getBuildVector(
13823 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13824 VecSrcs);
13825 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13826 }
13827
13828 /// Scalar Type
13829 auto ShiftVal = 32 * DWordOffset;
13830 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13831 DAG.getConstant(ShiftVal, SL, MVT::i32));
13832 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13833}
13834
13835 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13836 SelectionDAG &DAG = DCI.DAG;
13837 [[maybe_unused]] EVT VT = N->getValueType(0);
13838 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13839
13840 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13841 assert(VT == MVT::i32);
13842 for (int i = 0; i < 4; i++) {
13843 // Find the ByteProvider that provides the ith byte of the result of OR
13844 std::optional<ByteProvider<SDValue>> P =
13845 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13846 // TODO support constantZero
13847 if (!P || P->isConstantZero())
13848 return SDValue();
13849
13850 PermNodes.push_back(*P);
13851 }
13852 if (PermNodes.size() != 4)
13853 return SDValue();
13854
13855 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13856 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13857 uint64_t PermMask = 0x00000000;
13858 for (size_t i = 0; i < PermNodes.size(); i++) {
13859 auto PermOp = PermNodes[i];
13860 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13861 // by sizeof(Src2) = 4
13862 int SrcByteAdjust = 4;
13863
13864 // If the Src uses a byte from a different DWORD, then it corresponds
13865 // with a different source
13866 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13867 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13868 if (SecondSrc)
13869 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13870 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13871 return SDValue();
13872
13873 // Set the index of the second distinct Src node
13874 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13875 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13876 SrcByteAdjust = 0;
13877 }
13878 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13879 assert(!DAG.getDataLayout().isBigEndian());
13880 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13881 }
13882 SDLoc DL(N);
13883 SDValue Op = *PermNodes[FirstSrc.first].Src;
13884 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13885 assert(Op.getValueSizeInBits() == 32);
13886
13887 // Check that we are not just extracting the bytes in order from an op
13888 if (!SecondSrc) {
13889 int Low16 = PermMask & 0xffff;
13890 int Hi16 = (PermMask & 0xffff0000) >> 16;
13891
13892 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13893 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13894
13895 // The perm op would really just produce Op. So combine into Op
13896 if (WellFormedLow && WellFormedHi)
13897 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13898 }
13899
13900 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13901
13902 if (SecondSrc) {
13903 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13904 assert(OtherOp.getValueSizeInBits() == 32);
13905 }
13906
13907 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13908
13909 assert(Op.getValueType().isByteSized() &&
13910 OtherOp.getValueType().isByteSized());
13911
13912 // If the ultimate src is less than 32 bits, then we will only be
13913 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13914 // CalculateByteProvider would not have returned Op as source if we
13915 // used a byte that is outside its ValueType. Thus, we are free to
13916 // ANY_EXTEND as the extended bits are dont-cares.
13917 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13918 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13919
13920 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13921 DAG.getConstant(PermMask, DL, MVT::i32));
13922 }
13923 return SDValue();
13924}
13925
13926SDValue SITargetLowering::performOrCombine(SDNode *N,
13927 DAGCombinerInfo &DCI) const {
13928 SelectionDAG &DAG = DCI.DAG;
13929 SDValue LHS = N->getOperand(0);
13930 SDValue RHS = N->getOperand(1);
13931
13932 EVT VT = N->getValueType(0);
13933 if (VT == MVT::i1) {
13934 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13935 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13936 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13937 SDValue Src = LHS.getOperand(0);
13938 if (Src != RHS.getOperand(0))
13939 return SDValue();
13940
13941 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13942 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13943 if (!CLHS || !CRHS)
13944 return SDValue();
13945
13946 // Only 10 bits are used.
13947 static const uint32_t MaxMask = 0x3ff;
13948
13949 uint32_t NewMask =
13950 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13951 SDLoc DL(N);
13952 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13953 DAG.getConstant(NewMask, DL, MVT::i32));
13954 }
13955
13956 return SDValue();
13957 }
13958
13959 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13960 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13961 LHS.getOpcode() == AMDGPUISD::PERM &&
13962 isa<ConstantSDNode>(LHS.getOperand(2))) {
13963 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13964 if (!Sel)
13965 return SDValue();
13966
13967 Sel |= LHS.getConstantOperandVal(2);
13968 SDLoc DL(N);
13969 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13970 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13971 }
13972
13973 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
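// Illustrative example (not part of the original source):
//   (or (and x, 0xff00ff00), (and y, 0x00ff00ff))
// gives LHSMask = 0x030c010c and RHSMask = 0x0c020c00; after killing the zero
// lanes and adding 4 to the active LHS lanes the combined selector is
// 0x07020500, i.e. v_perm_b32 x, y, 0x07020500 producing
// { x.byte3, y.byte2, x.byte1, y.byte0 }.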
13974 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13975 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13976 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13977
13978 // If all the uses of an or need to extract the individual elements, do not
13979 // attempt to lower into v_perm
13980 auto usesCombinedOperand = [](SDNode *OrUse) {
13981 // If the use is not a bitcast to a vector type, it is a candidate for v_perm
13982 if (OrUse->getOpcode() != ISD::BITCAST ||
13983 !OrUse->getValueType(0).isVector())
13984 return true;
13985
13986 // If we have any non-vectorized use, then it is a candidate for v_perm
13987 for (auto *VUser : OrUse->users()) {
13988 if (!VUser->getValueType(0).isVector())
13989 return true;
13990
13991 // If the use of a vector is a store, then combining via a v_perm
13992 // is beneficial.
13993 // TODO -- whitelist more uses
13994 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13995 if (VUser->getOpcode() == VectorwiseOp)
13996 return true;
13997 }
13998 return false;
13999 };
14000
14001 if (!any_of(N->users(), usesCombinedOperand))
14002 return SDValue();
14003
14004 uint32_t LHSMask = getPermuteMask(LHS);
14005 uint32_t RHSMask = getPermuteMask(RHS);
14006
14007 if (LHSMask != ~0u && RHSMask != ~0u) {
14008 // Canonicalize the expression in an attempt to have fewer unique masks
14009 // and therefore fewer registers used to hold the masks.
14010 if (LHSMask > RHSMask) {
14011 std::swap(LHSMask, RHSMask);
14012 std::swap(LHS, RHS);
14013 }
14014
14015 // Select 0xc for each lane used from the source operand. Zero bytes have 0xc
14016 // in the mask, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
14017 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14018 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14019
14020 // Check if we need to combine values from two sources within a byte.
14021 if (!(LHSUsedLanes & RHSUsedLanes) &&
14022 // If we select high and lower word keep it for SDWA.
14023 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14024 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14025 // Kill zero bytes selected by other mask. Zero value is 0xc.
14026 LHSMask &= ~RHSUsedLanes;
14027 RHSMask &= ~LHSUsedLanes;
14028 // Add 4 to each active LHS lane
14029 LHSMask |= LHSUsedLanes & 0x04040404;
14030 // Combine masks
14031 uint32_t Sel = LHSMask | RHSMask;
14032 SDLoc DL(N);
14033
14034 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14035 RHS.getOperand(0),
14036 DAG.getConstant(Sel, DL, MVT::i32));
14037 }
14038 }
14039 if (LHSMask == ~0u || RHSMask == ~0u) {
14040 if (SDValue Perm = matchPERM(N, DCI))
14041 return Perm;
14042 }
14043 }
14044
14045 // Detect identity v2i32 OR and replace with identity source node.
14046 // Specifically an Or that has operands constructed from the same source node
14047 // via extract_vector_elt and build_vector. I.E.
14048 // v2i32 or(
14049 // v2i32 build_vector(
14050 // i32 extract_elt(%IdentitySrc, 0),
14051 // i32 0
14052 // ),
14053 // v2i32 build_vector(
14054 // i32 0,
14055 // i32 extract_elt(%IdentitySrc, 1)
14056 // ) )
14057 // =>
14058 // v2i32 %IdentitySrc
14059
14060 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14061 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14062
14063 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14064 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14065
14066 // Test for and normalise build vectors.
14067 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14068
14069 // Get the extract_vector_element operands.
14070 SDValue LEVE = LHS->getOperand(0);
14071 SDValue REVE = RHS->getOperand(1);
14072
14073 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14074 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14075 // Check that different elements from the same vector are
14076 // extracted.
14077 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14078 LEVE->getOperand(1) != REVE->getOperand(1)) {
14079 SDValue IdentitySrc = LEVE.getOperand(0);
14080 return IdentitySrc;
14081 }
14082 }
14083 }
14084 }
14085
14086 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14087 return SDValue();
14088
14089 // TODO: This could be a generic combine with a predicate for extracting the
14090 // high half of an integer being free.
14091
14092 // (or i64:x, (zero_extend i32:y)) ->
14093 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14094 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14095 RHS.getOpcode() != ISD::ZERO_EXTEND)
14096 std::swap(LHS, RHS);
14097
14098 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14099 SDValue ExtSrc = RHS.getOperand(0);
14100 EVT SrcVT = ExtSrc.getValueType();
14101 if (SrcVT == MVT::i32) {
14102 SDLoc SL(N);
14103 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14104 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14105
14106 DCI.AddToWorklist(LowOr.getNode());
14107 DCI.AddToWorklist(HiBits.getNode());
14108
14109 SDValue Vec =
14110 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14111 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14112 }
14113 }
14114
14115 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14116 if (CRHS) {
14117 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14118 N->getOperand(0), CRHS))
14119 return Split;
14120 }
14121
14122 return SDValue();
14123}
14124
14125SDValue SITargetLowering::performXorCombine(SDNode *N,
14126 DAGCombinerInfo &DCI) const {
14127 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14128 return RV;
14129
14130 SDValue LHS = N->getOperand(0);
14131 SDValue RHS = N->getOperand(1);
14132
14133 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14134 SelectionDAG &DAG = DCI.DAG;
14135
14136 EVT VT = N->getValueType(0);
14137 if (CRHS && VT == MVT::i64) {
14138 if (SDValue Split =
14139 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14140 return Split;
14141 }
14142
14143 // v2i32 (xor (vselect cc, x, y), K) ->
14144 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14145 // replaced with source modifiers when the select is lowered to CNDMASK.
14146 unsigned Opc = LHS.getOpcode();
14147 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14148 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14149 CRHS && CRHS->getAPIntValue().isSignMask()) {
14150 SDValue CC = LHS->getOperand(0);
14151 SDValue TRUE = LHS->getOperand(1);
14152 SDValue FALSE = LHS->getOperand(2);
14153 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14154 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14155 SDValue XSelect =
14156 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14157 return XSelect;
14158 }
14159
14160 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14161 // fneg-like xors into 64-bit select.
14162 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14163 // This looks like an fneg, try to fold as a source modifier.
14164 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14165 shouldFoldFNegIntoSrc(N, LHS)) {
14166 // xor (select c, a, b), 0x80000000 ->
14167 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14168 SDLoc DL(N);
14169 SDValue CastLHS =
14170 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14171 SDValue CastRHS =
14172 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14173 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14174 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14175 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14176 LHS->getOperand(0), FNegLHS, FNegRHS);
14177 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14178 }
14179 }
14180
14181 return SDValue();
14182}
14183
14184SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14185 DAGCombinerInfo &DCI) const {
14186 if (!Subtarget->has16BitInsts() ||
14187 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14188 return SDValue();
14189
14190 EVT VT = N->getValueType(0);
14191 if (VT != MVT::i32)
14192 return SDValue();
14193
14194 SDValue Src = N->getOperand(0);
14195 if (Src.getValueType() != MVT::i16)
14196 return SDValue();
14197
14198 return SDValue();
14199}
14200
14201SDValue
14202SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14203 DAGCombinerInfo &DCI) const {
14204 SDValue Src = N->getOperand(0);
14205 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14206
14207 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14208 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14209 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14210 VTSign->getVT() == MVT::i8) ||
14211 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14212 VTSign->getVT() == MVT::i16))) {
14213 assert(Subtarget->hasScalarSubwordLoads() &&
14214 "s_buffer_load_{u8, i8} are supported "
14215 "in GFX12 (or newer) architectures.");
14216 EVT VT = Src.getValueType();
14217 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14218 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14219 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14220 SDLoc DL(N);
14221 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14222 SDValue Ops[] = {
14223 Src.getOperand(0), // source register
14224 Src.getOperand(1), // offset
14225 Src.getOperand(2) // cachePolicy
14226 };
14227 auto *M = cast<MemSDNode>(Src);
14228 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14229 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14230 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14231 return LoadVal;
14232 }
14233 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14234 VTSign->getVT() == MVT::i8) ||
14235 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14236 VTSign->getVT() == MVT::i16)) &&
14237 Src.hasOneUse()) {
14238 auto *M = cast<MemSDNode>(Src);
14239 SDValue Ops[] = {Src.getOperand(0), // Chain
14240 Src.getOperand(1), // rsrc
14241 Src.getOperand(2), // vindex
14242 Src.getOperand(3), // voffset
14243 Src.getOperand(4), // soffset
14244 Src.getOperand(5), // offset
14245 Src.getOperand(6), Src.getOperand(7)};
14246 // replace with BUFFER_LOAD_BYTE/SHORT
14247 SDVTList ResList =
14248 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14249 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14250 ? AMDGPUISD::BUFFER_LOAD_BYTE
14251 : AMDGPUISD::BUFFER_LOAD_SHORT;
14252 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14253 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14254 return DCI.DAG.getMergeValues(
14255 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14256 }
14257 return SDValue();
14258}
14259
14260SDValue SITargetLowering::performClassCombine(SDNode *N,
14261 DAGCombinerInfo &DCI) const {
14262 SelectionDAG &DAG = DCI.DAG;
14263 SDValue Mask = N->getOperand(1);
14264
14265 // fp_class x, 0 -> false
14266 if (isNullConstant(Mask))
14267 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14268
14269 if (N->getOperand(0).isUndef())
14270 return DAG.getUNDEF(MVT::i1);
14271
14272 return SDValue();
14273}
14274
14275SDValue SITargetLowering::performRcpCombine(SDNode *N,
14276 DAGCombinerInfo &DCI) const {
14277 EVT VT = N->getValueType(0);
14278 SDValue N0 = N->getOperand(0);
14279
14280 if (N0.isUndef()) {
14281 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14282 SDLoc(N), VT);
14283 }
14284
14285 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14286 N0.getOpcode() == ISD::SINT_TO_FP)) {
14287 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14288 N->getFlags());
14289 }
14290
14291 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14292 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14293 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14294 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14295 N->getFlags());
14296 }
14297
14299}
14300
14301 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14302 unsigned MaxDepth) const {
14303 unsigned Opcode = Op.getOpcode();
14304 if (Opcode == ISD::FCANONICALIZE)
14305 return true;
14306
14307 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14308 const auto &F = CFP->getValueAPF();
14309 if (F.isNaN() && F.isSignaling())
14310 return false;
14311 if (!F.isDenormal())
14312 return true;
14313
14314 DenormalMode Mode =
14315 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14316 return Mode == DenormalMode::getIEEE();
14317 }
14318
14319 // If source is a result of another standard FP operation it is already in
14320 // canonical form.
14321 if (MaxDepth == 0)
14322 return false;
14323
14324 switch (Opcode) {
14325 // These will flush denorms if required.
14326 case ISD::FADD:
14327 case ISD::FSUB:
14328 case ISD::FMUL:
14329 case ISD::FCEIL:
14330 case ISD::FFLOOR:
14331 case ISD::FMA:
14332 case ISD::FMAD:
14333 case ISD::FSQRT:
14334 case ISD::FDIV:
14335 case ISD::FREM:
14336 case ISD::FP_ROUND:
14337 case ISD::FP_EXTEND:
14338 case ISD::FP16_TO_FP:
14339 case ISD::FP_TO_FP16:
14340 case ISD::BF16_TO_FP:
14341 case ISD::FP_TO_BF16:
14342 case ISD::FLDEXP:
14343 case AMDGPUISD::FMUL_LEGACY:
14344 case AMDGPUISD::FMAD_FTZ:
14345 case AMDGPUISD::RCP:
14346 case AMDGPUISD::RSQ:
14347 case AMDGPUISD::RSQ_CLAMP:
14348 case AMDGPUISD::RCP_LEGACY:
14349 case AMDGPUISD::RCP_IFLAG:
14350 case AMDGPUISD::LOG:
14351 case AMDGPUISD::EXP:
14352 case AMDGPUISD::DIV_SCALE:
14353 case AMDGPUISD::DIV_FMAS:
14354 case AMDGPUISD::DIV_FIXUP:
14355 case AMDGPUISD::FRACT:
14356 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14357 case AMDGPUISD::CVT_F32_UBYTE0:
14358 case AMDGPUISD::CVT_F32_UBYTE1:
14359 case AMDGPUISD::CVT_F32_UBYTE2:
14360 case AMDGPUISD::CVT_F32_UBYTE3:
14361 case AMDGPUISD::FP_TO_FP16:
14362 case AMDGPUISD::SIN_HW:
14363 case AMDGPUISD::COS_HW:
14364 return true;
14365
14366 // It can/will be lowered or combined as a bit operation.
14367 // Need to check their input recursively to handle.
14368 case ISD::FNEG:
14369 case ISD::FABS:
14370 case ISD::FCOPYSIGN:
14371 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14372
14373 case ISD::AND:
14374 if (Op.getValueType() == MVT::i32) {
14375 // Be careful as we only know it is a bitcast floating point type. It
14376 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14377 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14378 // is valid to optimize for all types.
14379 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14380 if (RHS->getZExtValue() == 0xffff0000) {
14381 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14382 }
14383 }
14384 }
14385 break;
14386
14387 case ISD::FSIN:
14388 case ISD::FCOS:
14389 case ISD::FSINCOS:
14390 return Op.getValueType().getScalarType() != MVT::f16;
14391
14392 case ISD::FMINNUM:
14393 case ISD::FMAXNUM:
14394 case ISD::FMINNUM_IEEE:
14395 case ISD::FMAXNUM_IEEE:
14396 case ISD::FMINIMUM:
14397 case ISD::FMAXIMUM:
14398 case ISD::FMINIMUMNUM:
14399 case ISD::FMAXIMUMNUM:
14400 case AMDGPUISD::CLAMP:
14401 case AMDGPUISD::FMED3:
14402 case AMDGPUISD::FMAX3:
14403 case AMDGPUISD::FMIN3:
14404 case AMDGPUISD::FMAXIMUM3:
14405 case AMDGPUISD::FMINIMUM3: {
14406 // FIXME: Shouldn't treat the generic operations differently based on these.
14407 // However, we aren't really required to flush the result from
14408 // minnum/maxnum..
14409
14410 // snans will be quieted, so we only need to worry about denormals.
14411 if (Subtarget->supportsMinMaxDenormModes() ||
14412 // FIXME: denormalsEnabledForType is broken for dynamic
14413 denormalsEnabledForType(DAG, Op.getValueType()))
14414 return true;
14415
14416 // Flushing may be required.
14417 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14418 // targets need to check their input recursively.
14419
14420 // FIXME: Does this apply with clamp? It's implemented with max.
14421 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14422 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14423 return false;
14424 }
14425
14426 return true;
14427 }
14428 case ISD::SELECT: {
14429 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14430 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14431 }
14432 case ISD::BUILD_VECTOR: {
14433 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14434 SDValue SrcOp = Op.getOperand(i);
14435 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14436 return false;
14437 }
14438
14439 return true;
14440 }
14443 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14444 }
14446 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14447 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14448 }
14449 case ISD::UNDEF:
14450 // Could be anything.
14451 return false;
14452
14453 case ISD::BITCAST:
14454 // TODO: This is incorrect as it loses track of the operand's type. We may
14455 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14456 // same bits that are canonicalized in one type need not be in the other.
14457 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14458 case ISD::TRUNCATE: {
14459 // Hack around the mess we make when legalizing extract_vector_elt.
14460 if (Op.getValueType() == MVT::i16) {
14461 SDValue TruncSrc = Op.getOperand(0);
14462 if (TruncSrc.getValueType() == MVT::i32 &&
14463 TruncSrc.getOpcode() == ISD::BITCAST &&
14464 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14465 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14466 }
14467 }
14468 return false;
14469 }
14470 case ISD::INTRINSIC_WO_CHAIN: {
14471 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14472 // TODO: Handle more intrinsics
14473 switch (IntrinsicID) {
14474 case Intrinsic::amdgcn_cvt_pkrtz:
14475 case Intrinsic::amdgcn_cubeid:
14476 case Intrinsic::amdgcn_frexp_mant:
14477 case Intrinsic::amdgcn_fdot2:
14478 case Intrinsic::amdgcn_rcp:
14479 case Intrinsic::amdgcn_rsq:
14480 case Intrinsic::amdgcn_rsq_clamp:
14481 case Intrinsic::amdgcn_rcp_legacy:
14482 case Intrinsic::amdgcn_rsq_legacy:
14483 case Intrinsic::amdgcn_trig_preop:
14484 case Intrinsic::amdgcn_tanh:
14485 case Intrinsic::amdgcn_log:
14486 case Intrinsic::amdgcn_exp2:
14487 case Intrinsic::amdgcn_sqrt:
14488 return true;
14489 default:
14490 break;
14491 }
14492
14493 break;
14494 }
14495 default:
14496 break;
14497 }
14498
14499 // FIXME: denormalsEnabledForType is broken for dynamic
14500 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14501 DAG.isKnownNeverSNaN(Op);
14502}
14503
14504 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14505 unsigned MaxDepth) const {
14506 const MachineRegisterInfo &MRI = MF.getRegInfo();
14507 MachineInstr *MI = MRI.getVRegDef(Reg);
14508 unsigned Opcode = MI->getOpcode();
14509
14510 if (Opcode == AMDGPU::G_FCANONICALIZE)
14511 return true;
14512
14513 std::optional<FPValueAndVReg> FCR;
14514 // Constant splat (can be padded with undef) or scalar constant.
14515 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14516 if (FCR->Value.isSignaling())
14517 return false;
14518 if (!FCR->Value.isDenormal())
14519 return true;
14520
14521 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14522 return Mode == DenormalMode::getIEEE();
14523 }
14524
14525 if (MaxDepth == 0)
14526 return false;
14527
14528 switch (Opcode) {
14529 case AMDGPU::G_FADD:
14530 case AMDGPU::G_FSUB:
14531 case AMDGPU::G_FMUL:
14532 case AMDGPU::G_FCEIL:
14533 case AMDGPU::G_FFLOOR:
14534 case AMDGPU::G_FRINT:
14535 case AMDGPU::G_FNEARBYINT:
14536 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14537 case AMDGPU::G_INTRINSIC_TRUNC:
14538 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14539 case AMDGPU::G_FMA:
14540 case AMDGPU::G_FMAD:
14541 case AMDGPU::G_FSQRT:
14542 case AMDGPU::G_FDIV:
14543 case AMDGPU::G_FREM:
14544 case AMDGPU::G_FPOW:
14545 case AMDGPU::G_FPEXT:
14546 case AMDGPU::G_FLOG:
14547 case AMDGPU::G_FLOG2:
14548 case AMDGPU::G_FLOG10:
14549 case AMDGPU::G_FPTRUNC:
14550 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14551 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14552 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14553 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14554 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14555 return true;
14556 case AMDGPU::G_FNEG:
14557 case AMDGPU::G_FABS:
14558 case AMDGPU::G_FCOPYSIGN:
14559 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14560 case AMDGPU::G_FMINNUM:
14561 case AMDGPU::G_FMAXNUM:
14562 case AMDGPU::G_FMINNUM_IEEE:
14563 case AMDGPU::G_FMAXNUM_IEEE:
14564 case AMDGPU::G_FMINIMUM:
14565 case AMDGPU::G_FMAXIMUM:
14566 case AMDGPU::G_FMINIMUMNUM:
14567 case AMDGPU::G_FMAXIMUMNUM: {
14568 if (Subtarget->supportsMinMaxDenormModes() ||
14569 // FIXME: denormalsEnabledForType is broken for dynamic
14570 denormalsEnabledForType(MRI.getType(Reg), MF))
14571 return true;
14572
14573 [[fallthrough]];
14574 }
14575 case AMDGPU::G_BUILD_VECTOR:
14576 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14577 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14578 return false;
14579 return true;
14580 case AMDGPU::G_INTRINSIC:
14581 case AMDGPU::G_INTRINSIC_CONVERGENT:
14582 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14583 case Intrinsic::amdgcn_fmul_legacy:
14584 case Intrinsic::amdgcn_fmad_ftz:
14585 case Intrinsic::amdgcn_sqrt:
14586 case Intrinsic::amdgcn_fmed3:
14587 case Intrinsic::amdgcn_sin:
14588 case Intrinsic::amdgcn_cos:
14589 case Intrinsic::amdgcn_log:
14590 case Intrinsic::amdgcn_exp2:
14591 case Intrinsic::amdgcn_log_clamp:
14592 case Intrinsic::amdgcn_rcp:
14593 case Intrinsic::amdgcn_rcp_legacy:
14594 case Intrinsic::amdgcn_rsq:
14595 case Intrinsic::amdgcn_rsq_clamp:
14596 case Intrinsic::amdgcn_rsq_legacy:
14597 case Intrinsic::amdgcn_div_scale:
14598 case Intrinsic::amdgcn_div_fmas:
14599 case Intrinsic::amdgcn_div_fixup:
14600 case Intrinsic::amdgcn_fract:
14601 case Intrinsic::amdgcn_cvt_pkrtz:
14602 case Intrinsic::amdgcn_cubeid:
14603 case Intrinsic::amdgcn_cubema:
14604 case Intrinsic::amdgcn_cubesc:
14605 case Intrinsic::amdgcn_cubetc:
14606 case Intrinsic::amdgcn_frexp_mant:
14607 case Intrinsic::amdgcn_fdot2:
14608 case Intrinsic::amdgcn_trig_preop:
14609 case Intrinsic::amdgcn_tanh:
14610 return true;
14611 default:
14612 break;
14613 }
14614
14615 [[fallthrough]];
14616 default:
14617 return false;
14618 }
14619
14620 llvm_unreachable("invalid operation");
14621}
14622
14623// Constant fold canonicalize.
14624SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14625 const SDLoc &SL, EVT VT,
14626 const APFloat &C) const {
14627 // Flush denormals to 0 if not enabled.
14628 if (C.isDenormal()) {
14629 DenormalMode Mode =
14630 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14631 if (Mode == DenormalMode::getPreserveSign()) {
14632 return DAG.getConstantFP(
14633 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14634 }
14635
14636 if (Mode != DenormalMode::getIEEE())
14637 return SDValue();
14638 }
14639
14640 if (C.isNaN()) {
14641 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14642 if (C.isSignaling()) {
14643 // Quiet a signaling NaN.
14644 // FIXME: Is this supposed to preserve payload bits?
14645 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14646 }
14647
14648 // Make sure it is the canonical NaN bitpattern.
14649 //
14650 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14651 // immediate?
14652 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14653 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14654 }
14655
14656 // Already canonical.
14657 return DAG.getConstantFP(C, SL, VT);
14658}
14659
14661 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14662}
14663
14664SDValue
14665SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14666 DAGCombinerInfo &DCI) const {
14667 SelectionDAG &DAG = DCI.DAG;
14668 SDValue N0 = N->getOperand(0);
14669 EVT VT = N->getValueType(0);
14670
14671 // fcanonicalize undef -> qnan
14672 if (N0.isUndef()) {
14674 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14675 }
14676
14677 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14678 EVT VT = N->getValueType(0);
14679 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14680 }
14681
14682 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14683 // (fcanonicalize k)
14684 //
14685 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14686
14687 // TODO: This could be better with wider vectors that will be split to v2f16,
14688 // and to consider uses since there aren't that many packed operations.
14689 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14690 isTypeLegal(MVT::v2f16)) {
14691 SDLoc SL(N);
14692 SDValue NewElts[2];
14693 SDValue Lo = N0.getOperand(0);
14694 SDValue Hi = N0.getOperand(1);
14695 EVT EltVT = Lo.getValueType();
14696
14698 for (unsigned I = 0; I != 2; ++I) {
14699 SDValue Op = N0.getOperand(I);
14700 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14701 NewElts[I] =
14702 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14703 } else if (Op.isUndef()) {
14704 // Handled below based on what the other operand is.
14705 NewElts[I] = Op;
14706 } else {
14707 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14708 }
14709 }
14710
14711 // If one half is undef, and one is constant, prefer a splat vector rather
14712 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14713 // cheaper to use and may be free with a packed operation.
14714 if (NewElts[0].isUndef()) {
14716 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14717 ? NewElts[1]
14718 : DAG.getConstantFP(0.0f, SL, EltVT);
14719 }
14720
14721 if (NewElts[1].isUndef()) {
14722 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14723 ? NewElts[0]
14724 : DAG.getConstantFP(0.0f, SL, EltVT);
14725 }
14726
14727 return DAG.getBuildVector(VT, SL, NewElts);
14728 }
14729 }
14730
14731 return SDValue();
14732}
14733
14734static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14735 switch (Opc) {
14736 case ISD::FMAXNUM:
14737 case ISD::FMAXNUM_IEEE:
14738 case ISD::FMAXIMUMNUM:
14739 return AMDGPUISD::FMAX3;
14740 case ISD::FMAXIMUM:
14741 return AMDGPUISD::FMAXIMUM3;
14742 case ISD::SMAX:
14743 return AMDGPUISD::SMAX3;
14744 case ISD::UMAX:
14745 return AMDGPUISD::UMAX3;
14746 case ISD::FMINNUM:
14747 case ISD::FMINNUM_IEEE:
14748 case ISD::FMINIMUMNUM:
14749 return AMDGPUISD::FMIN3;
14750 case ISD::FMINIMUM:
14751 return AMDGPUISD::FMINIMUM3;
14752 case ISD::SMIN:
14753 return AMDGPUISD::SMIN3;
14754 case ISD::UMIN:
14755 return AMDGPUISD::UMIN3;
14756 default:
14757 llvm_unreachable("Not a min/max opcode");
14758 }
14759}
14760
14761SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14762 const SDLoc &SL, SDValue Src,
14763 SDValue MinVal,
14764 SDValue MaxVal,
14765 bool Signed) const {
14766
14767 // med3 comes from
14768 // min(max(x, K0), K1), K0 < K1
14769 // max(min(x, K0), K1), K1 < K0
14770 //
14771 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14772 // min/max op.
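// For example, smin(smax(x, 2), 7): Src = x, MaxVal = 2 (rhs of the smax),
// MinVal = 7 (rhs of the smin); since 2 < 7 this becomes smed3(x, 2, 7),
// which clamps x to the range [2, 7].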
14773 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14774 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14775
14776 if (!MinK || !MaxK)
14777 return SDValue();
14778
14779 if (Signed) {
14780 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14781 return SDValue();
14782 } else {
14783 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14784 return SDValue();
14785 }
14786
14787 EVT VT = MinK->getValueType(0);
14788 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14789 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14790 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14791
14792 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14793 // not available, but this is unlikely to be profitable as constants
14794 // will often need to be materialized & extended, especially on
14795 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14796 return SDValue();
14797}
14798
14799 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14800 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14801 return C;
14802
14803 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14804 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14805 return C;
14806 }
14807
14808 return nullptr;
14809}
14810
14811SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14812 const SDLoc &SL, SDValue Op0,
14813 SDValue Op1) const {
14814 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14815 if (!K1)
14816 return SDValue();
14817
14818 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14819 if (!K0)
14820 return SDValue();
14821
14822 // Ordered >= (although NaN inputs should have folded away by now).
14823 if (K0->getValueAPF() > K1->getValueAPF())
14824 return SDValue();
14825
14826 // med3 with a nan input acts like
14827 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14828 //
14829 // So the result depends on whether the IEEE mode bit is enabled or not with a
14830 // signaling nan input.
14831 // ieee=1
14832 // s0 snan: yields s2
14833 // s1 snan: yields s2
14834 // s2 snan: qnan
14835
14836 // s0 qnan: min(s1, s2)
14837 // s1 qnan: min(s0, s2)
14838 // s2 qnan: min(s0, s1)
14839
14840 // ieee=0
14841 // s0 snan: min(s1, s2)
14842 // s1 snan: min(s0, s2)
14843 // s2 snan: qnan
14844
14845 // s0 qnan: min(s1, s2)
14846 // s1 qnan: min(s0, s2)
14847 // s2 qnan: min(s0, s1)
14848 const MachineFunction &MF = DAG.getMachineFunction();
14849 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14850
14851 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14852 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14853 // can only form it from fmaxnum_ieee if IEEE=1.
14854 EVT VT = Op0.getValueType();
14855 if (Info->getMode().DX10Clamp) {
14856 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14857 // hardware fmed3 behavior converting to a min.
14858 // FIXME: Should this be allowing -0.0?
14859 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14860 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14861 }
14862
14863 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14864 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14865 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14866 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14867 // then give the other result, which is different from med3 with a NaN
14868 // input.
14869 SDValue Var = Op0.getOperand(0);
14870 if (!DAG.isKnownNeverSNaN(Var))
14871 return SDValue();
14872
14873 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14874
14875 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14876 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14877 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14878 SDValue(K0, 0), SDValue(K1, 0));
14879 }
14880 }
14881
14882 return SDValue();
14883}
14884
14885/// \return true if the subtarget supports minimum3 and maximum3 with the given
14886/// base min/max opcode \p Opc for type \p VT.
14887static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14888 EVT VT) {
14889 switch (Opc) {
14890 case ISD::FMINNUM:
14891 case ISD::FMAXNUM:
14892 case ISD::FMINNUM_IEEE:
14893 case ISD::FMAXNUM_IEEE:
14894 case ISD::FMINIMUMNUM:
14895 case ISD::FMAXIMUMNUM:
14898 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14899 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14900 case ISD::FMINIMUM:
14901 case ISD::FMAXIMUM:
14902 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14903 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14904 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14905 case ISD::SMAX:
14906 case ISD::SMIN:
14907 case ISD::UMAX:
14908 case ISD::UMIN:
14909 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14910 default:
14911 return false;
14912 }
14913
14914 llvm_unreachable("not a min/max opcode");
14915}
14916
14917SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14918 DAGCombinerInfo &DCI) const {
14919 SelectionDAG &DAG = DCI.DAG;
14920
14921 EVT VT = N->getValueType(0);
14922 unsigned Opc = N->getOpcode();
14923 SDValue Op0 = N->getOperand(0);
14924 SDValue Op1 = N->getOperand(1);
14925
14926 // Only do this if the inner op has one use since this will just increase
14927 // register pressure for no benefit.
14928
14929 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14930 // max(max(a, b), c) -> max3(a, b, c)
14931 // min(min(a, b), c) -> min3(a, b, c)
14932 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14933 SDLoc DL(N);
14934 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14935 Op0.getOperand(0), Op0.getOperand(1), Op1);
14936 }
14937
14938 // Try commuted.
14939 // max(a, max(b, c)) -> max3(a, b, c)
14940 // min(a, min(b, c)) -> min3(a, b, c)
14941 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14942 SDLoc DL(N);
14943 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14944 Op0, Op1.getOperand(0), Op1.getOperand(1));
14945 }
14946 }
14947
14948 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14949 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14950 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14951 if (SDValue Med3 = performIntMed3ImmCombine(
14952 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14953 return Med3;
14954 }
14955 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14956 if (SDValue Med3 = performIntMed3ImmCombine(
14957 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14958 return Med3;
14959 }
14960
14961 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14962 if (SDValue Med3 = performIntMed3ImmCombine(
14963 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14964 return Med3;
14965 }
14966 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14967 if (SDValue Med3 = performIntMed3ImmCombine(
14968 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14969 return Med3;
14970 }
14971
14972 // if !is_snan(x):
14973 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14974 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14975 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14976 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14977 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14978 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14979 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14980 (Opc == AMDGPUISD::FMIN_LEGACY &&
14981 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14982 (VT == MVT::f32 || VT == MVT::f64 ||
14983 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14984 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14985 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14986 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14987 Op0.hasOneUse()) {
14988 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14989 return Res;
14990 }
14991
14992 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14993 // for some types, but at a higher cost since it's implemented with a 3
14994 // operand form.
14995 const SDNodeFlags Flags = N->getFlags();
14996 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14997 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14998 unsigned NewOpc =
14999 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15000 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15001 }
15002
15003 return SDValue();
15004}
15005
15006 static bool isClampZeroToOne(SDValue A, SDValue B) {
15007 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15008 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15009 // FIXME: Should this be allowing -0.0?
15010 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15011 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15012 }
15013 }
15014
15015 return false;
15016}
15017
15018// FIXME: Should only worry about snans for version with chain.
15019SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15020 DAGCombinerInfo &DCI) const {
15021 EVT VT = N->getValueType(0);
15022 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15023 // NaNs. With a NaN input, the order of the operands may change the result.
15024
15025 SelectionDAG &DAG = DCI.DAG;
15026 SDLoc SL(N);
15027
15028 SDValue Src0 = N->getOperand(0);
15029 SDValue Src1 = N->getOperand(1);
15030 SDValue Src2 = N->getOperand(2);
15031
15032 if (isClampZeroToOne(Src0, Src1)) {
15033 // const_a, const_b, x -> clamp is safe in all cases including signaling
15034 // nans.
15035 // FIXME: Should this be allowing -0.0?
15036 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15037 }
15038
15039 const MachineFunction &MF = DAG.getMachineFunction();
15040 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15041
15042 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15043 // handling no dx10-clamp?
15044 if (Info->getMode().DX10Clamp) {
15045 // If NaN is clamped to 0, we are free to reorder the inputs.
15046
15047 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15048 std::swap(Src0, Src1);
15049
15050 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15051 std::swap(Src1, Src2);
15052
15053 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15054 std::swap(Src0, Src1);
15055
15056 if (isClampZeroToOne(Src1, Src2))
15057 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15058 }
15059
15060 return SDValue();
15061}
15062
15063SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15064 DAGCombinerInfo &DCI) const {
15065 SDValue Src0 = N->getOperand(0);
15066 SDValue Src1 = N->getOperand(1);
15067 if (Src0.isUndef() && Src1.isUndef())
15068 return DCI.DAG.getUNDEF(N->getValueType(0));
15069 return SDValue();
15070}
15071
15072// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15073// expanded into a set of cmp/select instructions.
15074 static bool shouldExpandVectorDynExt(unsigned EltSize,
15075 unsigned NumElem,
15076 bool IsDivergentIdx,
15077 const GCNSubtarget *Subtarget) {
15078 if (UseDivergentRegisterIndexing)
15079 return false;
15080
15081 unsigned VecSize = EltSize * NumElem;
15082
15083 // Sub-dword vectors of size 2 dword or less have better implementation.
15084 if (VecSize <= 64 && EltSize < 32)
15085 return false;
15086
15087 // Always expand the remaining sub-dword operations, otherwise they will be
15088 // lowered via memory.
15089 if (EltSize < 32)
15090 return true;
15091
15092 // Always do this if var-idx is divergent, otherwise it will become a loop.
15093 if (IsDivergentIdx)
15094 return true;
15095
15096 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15097 unsigned NumInsts = NumElem /* Number of compares */ +
15098 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
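// For example, a v8i32 vector with a dynamic index needs 8 compares plus
// 8 v_cndmask_b32, i.e. NumInsts = 16: that is expanded in VGPR-index mode
// (16 <= 16), but left to movrel where movrel is available (16 > 15).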
15099
15100 // On some architectures (GFX9) movrel is not available and it's better
15101 // to expand.
15102 if (Subtarget->useVGPRIndexMode())
15103 return NumInsts <= 16;
15104
15105 // If movrel is available, use it instead of expanding for vector of 8
15106 // elements.
15107 if (Subtarget->hasMovrel())
15108 return NumInsts <= 15;
15109
15110 return true;
15111}
15112
15113 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15114 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15115 if (isa<ConstantSDNode>(Idx))
15116 return false;
15117
15118 SDValue Vec = N->getOperand(0);
15119 EVT VecVT = Vec.getValueType();
15120 EVT EltVT = VecVT.getVectorElementType();
15121 unsigned EltSize = EltVT.getSizeInBits();
15122 unsigned NumElem = VecVT.getVectorNumElements();
15123
15124 return ::shouldExpandVectorDynExt(
15125 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15126}
15127
15128SDValue
15129SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15130 DAGCombinerInfo &DCI) const {
15131 SDValue Vec = N->getOperand(0);
15132 SelectionDAG &DAG = DCI.DAG;
15133
15134 EVT VecVT = Vec.getValueType();
15135 EVT VecEltVT = VecVT.getVectorElementType();
15136 EVT ResVT = N->getValueType(0);
15137
15138 unsigned VecSize = VecVT.getSizeInBits();
15139 unsigned VecEltSize = VecEltVT.getSizeInBits();
15140
15141 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15143 SDLoc SL(N);
15144 SDValue Idx = N->getOperand(1);
15145 SDValue Elt =
15146 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15147 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15148 }
15149
15150 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15151 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15152 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15153 // depending on the shift operand. See e.g. performSraCombine().
15154 // This combine ensures that the optimisation is compatible with v2i32
15155 // legalised AND.
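// E.g. (extract_vector_elt (and v2i32:x, (build_vector 0x1f, 0x1f)), 1)
// becomes (and (extract_vector_elt x, 1), 0x1f), so the scalar AND can feed
// those 32-bit shift combines again.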
15156 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15157 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15158
15160 if (!C || C->getZExtValue() != 0x1f)
15161 return SDValue();
15162
15163 SDLoc SL(N);
15164 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15165 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15166 Vec->getOperand(0), N->getOperand(1));
15167 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15168 DAG.ReplaceAllUsesWith(N, A.getNode());
15169 }
15170
15171 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15172 // =>
15173 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15174 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15175 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15176 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15177 SDLoc SL(N);
15178 SDValue Idx = N->getOperand(1);
15179 unsigned Opc = Vec.getOpcode();
15180
15181 switch (Opc) {
15182 default:
15183 break;
15184 // TODO: Support other binary operations.
15185 case ISD::FADD:
15186 case ISD::FSUB:
15187 case ISD::FMUL:
15188 case ISD::ADD:
15189 case ISD::UMIN:
15190 case ISD::UMAX:
15191 case ISD::SMIN:
15192 case ISD::SMAX:
15193 case ISD::FMAXNUM:
15194 case ISD::FMINNUM:
15195 case ISD::FMAXNUM_IEEE:
15196 case ISD::FMINNUM_IEEE:
15197 case ISD::FMAXIMUM:
15198 case ISD::FMINIMUM: {
15199 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15200 Vec.getOperand(0), Idx);
15201 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15202 Vec.getOperand(1), Idx);
15203
15204 DCI.AddToWorklist(Elt0.getNode());
15205 DCI.AddToWorklist(Elt1.getNode());
15206 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15207 }
15208 }
15209 }
15210
15211 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15212 if (shouldExpandVectorDynExt(N)) {
15213 SDLoc SL(N);
15214 SDValue Idx = N->getOperand(1);
15215 SDValue V;
15216 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15217 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15218 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15219 if (I == 0)
15220 V = Elt;
15221 else
15222 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15223 }
15224 return V;
15225 }
15226
15227 if (!DCI.isBeforeLegalize())
15228 return SDValue();
15229
15230 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15231 // elements. This exposes more load reduction opportunities by replacing
15232 // multiple small extract_vector_elements with a single 32-bit extract.
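// E.g. extracting element 5 of a loaded v8i8: BitIndex = 40, so the code
// below bitcasts to v2i32, extracts dword 1, shifts right by 8 bits and
// truncates to i8.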
15233 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15234 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15235 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15236 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15237
15238 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15239 unsigned EltIdx = BitIndex / 32;
15240 unsigned LeftoverBitIdx = BitIndex % 32;
15241 SDLoc SL(N);
15242
15243 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15244 DCI.AddToWorklist(Cast.getNode());
15245
15246 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15247 DAG.getConstant(EltIdx, SL, MVT::i32));
15248 DCI.AddToWorklist(Elt.getNode());
15249 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15250 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15251 DCI.AddToWorklist(Srl.getNode());
15252
15253 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15254 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15255 DCI.AddToWorklist(Trunc.getNode());
15256
15257 if (VecEltVT == ResVT) {
15258 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15259 }
15260
15261 assert(ResVT.isScalarInteger());
15262 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15263 }
15264
15265 return SDValue();
15266}
15267
15268SDValue
15269SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15270 DAGCombinerInfo &DCI) const {
15271 SDValue Vec = N->getOperand(0);
15272 SDValue Idx = N->getOperand(2);
15273 EVT VecVT = Vec.getValueType();
15274 EVT EltVT = VecVT.getVectorElementType();
15275
15276 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15277 // => BUILD_VECTOR n x select (e, const-idx)
15278 if (!shouldExpandVectorDynExt(N))
15279 return SDValue();
15280
15281 SelectionDAG &DAG = DCI.DAG;
15282 SDLoc SL(N);
15283 SDValue Ins = N->getOperand(1);
15284 EVT IdxVT = Idx.getValueType();
15285
15286 SmallVector<SDValue, 16> Ops;
15287 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15288 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15289 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15290 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15291 Ops.push_back(V);
15292 }
15293
15294 return DAG.getBuildVector(VecVT, SL, Ops);
15295}
15296
15297/// Return the source of an fp_extend from f16 to f32, or a converted FP
15298/// constant.
15299 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15300 if (Src.getOpcode() == ISD::FP_EXTEND &&
15301 Src.getOperand(0).getValueType() == MVT::f16) {
15302 return Src.getOperand(0);
15303 }
15304
15305 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15306 APFloat Val = CFP->getValueAPF();
15307 bool LosesInfo = true;
15308 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15309 if (!LosesInfo)
15310 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15311 }
15312
15313 return SDValue();
15314}
15315
15316SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15317 DAGCombinerInfo &DCI) const {
15318 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15319 "combine only useful on gfx8");
15320
15321 SDValue TruncSrc = N->getOperand(0);
15322 EVT VT = N->getValueType(0);
15323 if (VT != MVT::f16)
15324 return SDValue();
15325
15326 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15327 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15328 return SDValue();
15329
15330 SelectionDAG &DAG = DCI.DAG;
15331 SDLoc SL(N);
15332
15333 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15334 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15335 // casting back.
15336
15337 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15338 // fmin(fmax(a, b), fmax(fmin(a, b), c))
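// (This is the usual median-of-three identity: the median of {a, b, c}
// equals min(max(a, b), max(min(a, b), c)).)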
15339 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15340 if (!A)
15341 return SDValue();
15342
15343 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15344 if (!B)
15345 return SDValue();
15346
15347 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15348 if (!C)
15349 return SDValue();
15350
15351 // This changes signaling nan behavior. If an input is a signaling nan, it
15352 // would have been quieted by the fpext originally. We don't care because
15353 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15354 // we would be worse off than just doing the promotion.
15355 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15356 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15357 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15358 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15359}
15360
15361unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15362 const SDNode *N0,
15363 const SDNode *N1) const {
15364 EVT VT = N0->getValueType(0);
15365
15366 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15367 // support denormals ever.
15368 if (((VT == MVT::f32 &&
15370 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15373 return ISD::FMAD;
15374
15375 const TargetOptions &Options = DAG.getTarget().Options;
15376 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15377 (N0->getFlags().hasAllowContract() &&
15378 N1->getFlags().hasAllowContract())) &&
15380 return ISD::FMA;
15381 }
15382
15383 return 0;
15384}
15385
15386// For a reassociatable opcode perform:
15387// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15388SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15389 SelectionDAG &DAG) const {
15390 EVT VT = N->getValueType(0);
15391 if (VT != MVT::i32 && VT != MVT::i64)
15392 return SDValue();
15393
15394 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15395 return SDValue();
15396
15397 unsigned Opc = N->getOpcode();
15398 SDValue Op0 = N->getOperand(0);
15399 SDValue Op1 = N->getOperand(1);
15400
15401 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15402 return SDValue();
15403
15404 if (Op0->isDivergent())
15405 std::swap(Op0, Op1);
15406
15407 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15408 return SDValue();
15409
15410 SDValue Op2 = Op1.getOperand(1);
15411 Op1 = Op1.getOperand(0);
15412 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15413 return SDValue();
15414
15415 if (Op1->isDivergent())
15416 std::swap(Op1, Op2);
15417
15418 SDLoc SL(N);
15419 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15420 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15421}
15422
15423static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15424 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15425 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15426 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15427 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15428 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15429}
15430
15431// Fold
15432// y = lshr i64 x, 32
15433// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15434// with Const.hi == -1
15435// To
15436 // res = mad_u64_u32 y.lo, Const.lo, x.lo
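// Why this is valid: Const.hi == -1 means Const == Const.lo - 2^32 (mod 2^64),
// and y == x >> 32 == zext(x.hi), so
//   y * Const + x == x.hi * Const.lo - x.hi * 2^32 + x
//                 == x.hi * Const.lo + x.lo            (mod 2^64),
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)).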
15438 SDValue MulLHS, SDValue MulRHS,
15439 SDValue AddRHS) {
15440 if (MulRHS.getOpcode() == ISD::SRL)
15441 std::swap(MulLHS, MulRHS);
15442
15443 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15444 return SDValue();
15445
15446 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15447 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15448 MulLHS.getOperand(0) != AddRHS)
15449 return SDValue();
15450
15451 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
15452 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15453 return SDValue();
15454
15455 SDValue ConstMul =
15456 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15457 return getMad64_32(DAG, SL, MVT::i64,
15458 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15459 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15460}
15461
15462// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15463// multiplies, if any.
15464//
15465// Full 64-bit multiplies that feed into an addition are lowered here instead
15466// of using the generic expansion. The generic expansion ends up with
15467// a tree of ADD nodes that prevents us from using the "add" part of the
15468// MAD instruction. The expansion produced here results in a chain of ADDs
15469// instead of a tree.
15470SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15471 DAGCombinerInfo &DCI) const {
15472 assert(N->isAnyAdd());
15473
15474 SelectionDAG &DAG = DCI.DAG;
15475 EVT VT = N->getValueType(0);
15476 SDLoc SL(N);
15477 SDValue LHS = N->getOperand(0);
15478 SDValue RHS = N->getOperand(1);
15479
15480 if (VT.isVector())
15481 return SDValue();
15482
15483 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15484 // result in scalar registers for uniform values.
15485 if (!N->isDivergent() && Subtarget->hasSMulHi())
15486 return SDValue();
15487
15488 unsigned NumBits = VT.getScalarSizeInBits();
15489 if (NumBits <= 32 || NumBits > 64)
15490 return SDValue();
15491
15492 if (LHS.getOpcode() != ISD::MUL) {
15493 assert(RHS.getOpcode() == ISD::MUL);
15494 std::swap(LHS, RHS);
15495 }
15496
15497 // Avoid the fold if it would unduly increase the number of multiplies due to
15498 // multiple uses, except on hardware with full-rate multiply-add (which is
15499 // part of full-rate 64-bit ops).
15500 if (!Subtarget->hasFullRate64Ops()) {
15501 unsigned NumUsers = 0;
15502 for (SDNode *User : LHS->users()) {
15503 // There is a use that does not feed into addition, so the multiply can't
15504 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15505 if (!User->isAnyAdd())
15506 return SDValue();
15507
15508 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15509 // MUL + 3xADD + 3xADDC over 3xMAD.
15510 ++NumUsers;
15511 if (NumUsers >= 3)
15512 return SDValue();
15513 }
15514 }
15515
15516 SDValue MulLHS = LHS.getOperand(0);
15517 SDValue MulRHS = LHS.getOperand(1);
15518 SDValue AddRHS = RHS;
15519
15520 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15521 return FoldedMAD;
15522
15523 // Always check whether operands are small unsigned values, since that
15524 // knowledge is useful in more cases. Check for small signed values only if
15525 // doing so can unlock a shorter code sequence.
15526 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15527 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15528
15529 bool MulSignedLo = false;
15530 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15531 MulSignedLo =
15532 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15533 }
15534
15535 // The operands and final result all have the same number of bits. If
15536 // operands need to be extended, they can be extended with garbage. The
15537 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15538 // truncated away in the end.
15539 if (VT != MVT::i64) {
15540 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15541 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15542 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15543 }
15544
15545 // The basic code generated is conceptually straightforward. Pseudo code:
15546 //
15547 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15548 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15549 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15550 //
15551 // The second and third lines are optional, depending on whether the factors
15552 // are {sign,zero}-extended or not.
15553 //
15554 // The actual DAG is noisier than the pseudo code, but only due to
15555 // instructions that disassemble values into low and high parts, and
15556 // assemble the final result.
15557 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15558
15559 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15560 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15561 SDValue Accum =
15562 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15563
15564 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15565 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15566
15567 if (!MulLHSUnsigned32) {
15568 auto MulLHSHi =
15569 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15570 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15571 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15572 }
15573
15574 if (!MulRHSUnsigned32) {
15575 auto MulRHSHi =
15576 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15577 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15578 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15579 }
15580
15581 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15582 Accum = DAG.getBitcast(MVT::i64, Accum);
15583 }
15584
15585 if (VT != MVT::i64)
15586 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15587 return Accum;
15588}
15589
15590SDValue
15591SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15592 DAGCombinerInfo &DCI) const {
15593 SDValue RHS = N->getOperand(1);
15594 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15595 if (!CRHS)
15596 return SDValue();
15597
15598 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15599 // common.
15600 uint64_t Val = CRHS->getZExtValue();
15601 if (countr_zero(Val) >= 32) {
15602 SelectionDAG &DAG = DCI.DAG;
15603 SDLoc SL(N);
15604 SDValue LHS = N->getOperand(0);
15605
15606 // Avoid carry machinery if we know the low half of the add does not
15607 // contribute to the final result.
15608 //
15609 // add i64:x, K if computeTrailingZeros(K) >= 32
15610 // => build_pair (add x.hi, K.hi), x.lo
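// E.g. add i64 %x, 0x1200000000 becomes build_pair (add x.hi, 0x12), x.lo.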
15611
15612 // Breaking the 64-bit add here with this strange constant is unlikely
15613 // to interfere with addressing mode patterns.
15614
15615 SDValue Hi = getHiHalf64(LHS, DAG);
15616 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15617 unsigned Opcode = N->getOpcode();
15618 if (Opcode == ISD::PTRADD)
15619 Opcode = ISD::ADD;
15620 SDValue AddHi =
15621 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15622
15623 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15624 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15625 }
15626
15627 return SDValue();
15628}
15629
15630// Collect the ultimate src of each of the mul node's operands, and confirm
15631 // each operand is a single byte (8 bits) wide.
15632static std::optional<ByteProvider<SDValue>>
15633handleMulOperand(const SDValue &MulOperand) {
15634 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15635 if (!Byte0 || Byte0->isConstantZero()) {
15636 return std::nullopt;
15637 }
15638 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15639 if (Byte1 && !Byte1->isConstantZero()) {
15640 return std::nullopt;
15641 }
15642 return Byte0;
15643}
15644
15645static unsigned addPermMasks(unsigned First, unsigned Second) {
15646 unsigned FirstCs = First & 0x0c0c0c0c;
15647 unsigned SecondCs = Second & 0x0c0c0c0c;
15648 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15649 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15650
15651 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15652 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15653 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15654 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15655
15656 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15657}
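// Each byte of these masks is a v_perm_b32 selector; the selector value 0x0c
// produces a constant zero byte, so 0x0c marks an unused lane. addPermMasks
// merges two masks whose used bytes do not overlap, e.g.
//   addPermMasks(0x010c0c0c, 0x0c0c0c02) == 0x010c0c02.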
15658
15659 struct DotSrc {
15660 SDValue SrcOp;
15661 int64_t PermMask;
15662 int64_t DWordOffset;
15663};
15664
15665 static void placeSources(ByteProvider<SDValue> &Src0,
15666 ByteProvider<SDValue> &Src1,
15667 SmallVectorImpl<DotSrc> &Src0s,
15668 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15669
15670 assert(Src0.Src.has_value() && Src1.Src.has_value());
15671 // Src0s and Src1s are empty, just place arbitrarily.
15672 if (Step == 0) {
15673 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15674 Src0.SrcOffset / 4});
15675 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15676 Src1.SrcOffset / 4});
15677 return;
15678 }
15679
15680 for (int BPI = 0; BPI < 2; BPI++) {
15681 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15682 if (BPI == 1) {
15683 BPP = {Src1, Src0};
15684 }
15685 unsigned ZeroMask = 0x0c0c0c0c;
15686 unsigned FMask = 0xFF << (8 * (3 - Step));
15687
15688 unsigned FirstMask =
15689 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15690 unsigned SecondMask =
15691 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15692 // Attempt to find a Src vector which contains our SDValue; if so, add our
15693 // perm mask to the existing one. If we are unable to find a match for the
15694 // first SDValue, attempt to find a match for the second.
15695 int FirstGroup = -1;
15696 for (int I = 0; I < 2; I++) {
15697 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15698 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15699 return IterElt.SrcOp == *BPP.first.Src &&
15700 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15701 };
15702
15703 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15704 if (Match != Srcs.end()) {
15705 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15706 FirstGroup = I;
15707 break;
15708 }
15709 }
15710 if (FirstGroup != -1) {
15711 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15712 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15713 return IterElt.SrcOp == *BPP.second.Src &&
15714 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15715 };
15716 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15717 if (Match != Srcs.end()) {
15718 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15719 } else
15720 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15721 return;
15722 }
15723 }
15724
15725 // If we have made it here, then we could not find a match in Src0s or Src1s
15726 // for either Src0 or Src1, so just place them arbitrarily.
15727
15728 unsigned ZeroMask = 0x0c0c0c0c;
15729 unsigned FMask = 0xFF << (8 * (3 - Step));
15730
15731 Src0s.push_back(
15732 {*Src0.Src,
15733 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15734 Src0.SrcOffset / 4});
15735 Src1s.push_back(
15736 {*Src1.Src,
15737 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15738 Src1.SrcOffset / 4});
15739}
15740
15741 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15742 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15743 bool IsAny) {
15744
15745 // If we just have one source, just permute it accordingly.
15746 if (Srcs.size() == 1) {
15747 auto *Elt = Srcs.begin();
15748 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15749
15750 // v_perm will produce the original value
15751 if (Elt->PermMask == 0x3020100)
15752 return EltOp;
15753
15754 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15755 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15756 }
15757
15758 auto *FirstElt = Srcs.begin();
15759 auto *SecondElt = std::next(FirstElt);
15760
15761 SmallVector<SDValue, 2> Perms;
15762
15763 // If we have multiple sources in the chain, combine them via perms (using
15764 // calculated perm mask) and Ors.
15765 while (true) {
15766 auto FirstMask = FirstElt->PermMask;
15767 auto SecondMask = SecondElt->PermMask;
15768
15769 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15770 unsigned FirstPlusFour = FirstMask | 0x04040404;
15771 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
15772 // original 0x0C.
15773 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15774
15775 auto PermMask = addPermMasks(FirstMask, SecondMask);
15776 auto FirstVal =
15777 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15778 auto SecondVal =
15779 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15780
15781 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15782 SecondVal,
15783 DAG.getConstant(PermMask, SL, MVT::i32)));
15784
15785 FirstElt = std::next(SecondElt);
15786 if (FirstElt == Srcs.end())
15787 break;
15788
15789 SecondElt = std::next(FirstElt);
15790 // If we only have a FirstElt, then just combine that into the cumulative
15791 // source node.
15792 if (SecondElt == Srcs.end()) {
15793 auto EltOp =
15794 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15795
15796 Perms.push_back(
15797 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15798 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15799 break;
15800 }
15801 }
15802
15803 assert(Perms.size() == 1 || Perms.size() == 2);
15804 return Perms.size() == 2
15805 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15806 : Perms[0];
15807}
15808
15809static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15810 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15811 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15812 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15813 EntryMask += ZeroMask;
15814 }
15815}
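// E.g. for a chain of length 2, a mask such as 0x01020c0c (two used selectors
// in the high bytes) becomes 0x0c0c0102: the used selectors move to the low
// bytes and the now-unused high bytes are set to 0x0c (zero).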
15816
15817static bool isMul(const SDValue Op) {
15818 auto Opcode = Op.getOpcode();
15819
15820 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15821 Opcode == AMDGPUISD::MUL_I24);
15822}
15823
15824static std::optional<bool>
15825 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15826 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15827 const SDValue &S1Op, const SelectionDAG &DAG) {
15828 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15829 // of the dot4 is irrelevant.
15830 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15831 return false;
15832
15833 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15834 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15835 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15836 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15837 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15838 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15839
15840 assert(!(S0IsUnsigned && S0IsSigned));
15841 assert(!(S1IsUnsigned && S1IsSigned));
15842
15843 // There are 9 possible permutations of
15844 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15845
15846 // In two permutations, the sign bits are known to be the same for both Ops,
15847 // so simply return Signed / Unsigned corresponding to the MSB
15848
15849 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15850 return S0IsSigned;
15851
15852 // In another two permutations, the sign bits are known to be opposite. In
15853 // this case return std::nullopt to indicate a bad match.
15854
15855 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15856 return std::nullopt;
15857
15858 // In the remaining five permutations, we don't know the value of the sign
15859 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15860 // the upper bits must be extension bits. Thus, the only way for the sign
15861 // bit to be unknown is if it was sign extended from an unknown value, or if
15862 // it was any extended. In either case, it is correct to use the signed
15863 // version of the signedness semantics of dot4.
15864
15865 // In two such permutations, we know the sign bit is set for
15866 // one op, and the other is unknown. It is okay to use the signed version of
15867 // dot4.
15868 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15869 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15870 return true;
15871
15872 // In one such permutation, we don't know either of the sign bits. It is okay
15873 // to use the signed version of dot4.
15874 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15875 return true;
15876
15877 // In two such permutations, we know the sign bit is unset for
15878 // one op, and the other is unknown. Return std::nullopt to indicate a
15879 // bad match.
15880 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15881 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15882 return std::nullopt;
15883
15884 llvm_unreachable("Fully covered condition");
15885}
15886
15887SDValue SITargetLowering::performAddCombine(SDNode *N,
15888 DAGCombinerInfo &DCI) const {
15889 SelectionDAG &DAG = DCI.DAG;
15890 EVT VT = N->getValueType(0);
15891 SDLoc SL(N);
15892 SDValue LHS = N->getOperand(0);
15893 SDValue RHS = N->getOperand(1);
15894
15895 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15896 if (Subtarget->hasMad64_32()) {
15897 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15898 return Folded;
15899 }
15900 }
15901
15902 if (SDValue V = reassociateScalarOps(N, DAG)) {
15903 return V;
15904 }
15905
15906 if (VT == MVT::i64) {
15907 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15908 return Folded;
15909 }
15910
15911 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15912 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15913 SDValue TempNode(N, 0);
15914 std::optional<bool> IsSigned;
15915 SmallVector<DotSrc, 4> Src0s;
15916 SmallVector<DotSrc, 4> Src1s;
15917 SmallVector<SDValue, 4> Src2s;
15918
15919 // Match the v_dot4 tree, while collecting src nodes.
15920 int ChainLength = 0;
15921 for (int I = 0; I < 4; I++) {
15922 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15923 if (MulIdx == -1)
15924 break;
15925 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15926 if (!Src0)
15927 break;
15928 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15929 if (!Src1)
15930 break;
15931
15932 auto IterIsSigned = checkDot4MulSignedness(
15933 TempNode->getOperand(MulIdx), *Src0, *Src1,
15934 TempNode->getOperand(MulIdx)->getOperand(0),
15935 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15936 if (!IterIsSigned)
15937 break;
15938 if (!IsSigned)
15939 IsSigned = *IterIsSigned;
15940 if (*IterIsSigned != *IsSigned)
15941 break;
15942 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15943 auto AddIdx = 1 - MulIdx;
15944 // Allow the special case where add (add (mul24, 0), mul24) has already been
15945 // folded into add (mul24, mul24).
15946 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15947 Src2s.push_back(TempNode->getOperand(AddIdx));
15948 auto Src0 =
15949 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15950 if (!Src0)
15951 break;
15952 auto Src1 =
15953 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15954 if (!Src1)
15955 break;
15956 auto IterIsSigned = checkDot4MulSignedness(
15957 TempNode->getOperand(AddIdx), *Src0, *Src1,
15958 TempNode->getOperand(AddIdx)->getOperand(0),
15959 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15960 if (!IterIsSigned)
15961 break;
15962 assert(IsSigned);
15963 if (*IterIsSigned != *IsSigned)
15964 break;
15965 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15966 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15967 ChainLength = I + 2;
15968 break;
15969 }
15970
15971 TempNode = TempNode->getOperand(AddIdx);
15972 Src2s.push_back(TempNode);
15973 ChainLength = I + 1;
15974 if (TempNode->getNumOperands() < 2)
15975 break;
15976 LHS = TempNode->getOperand(0);
15977 RHS = TempNode->getOperand(1);
15978 }
15979
15980 if (ChainLength < 2)
15981 return SDValue();
15982
15983 // Masks were constructed with the assumption that we would find a chain of
15984 // length 4. If not, then we need to zero out the unused bytes (via the perm
15985 // selector 0x0c) so they do not affect the dot calculation.
15986 if (ChainLength < 4) {
15987 fixMasks(Src0s, ChainLength);
15988 fixMasks(Src1s, ChainLength);
15989 }
15990
15991 SDValue Src0, Src1;
15992
15993 // If we are just using a single source for both, and have permuted the
15994 // bytes consistently, we can just use the sources without permuting
15995 // (commutation).
15996 bool UseOriginalSrc = false;
15997 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15998 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15999 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16000 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16001 SmallVector<unsigned, 4> SrcBytes;
16002 auto Src0Mask = Src0s.begin()->PermMask;
16003 SrcBytes.push_back(Src0Mask & 0xFF000000);
16004 bool UniqueEntries = true;
16005 for (auto I = 1; I < 4; I++) {
16006 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16007
16008 if (is_contained(SrcBytes, NextByte)) {
16009 UniqueEntries = false;
16010 break;
16011 }
16012 SrcBytes.push_back(NextByte);
16013 }
16014
16015 if (UniqueEntries) {
16016 UseOriginalSrc = true;
16017
16018 auto *FirstElt = Src0s.begin();
16019 auto FirstEltOp =
16020 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16021
16022 auto *SecondElt = Src1s.begin();
16023 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16024 SecondElt->DWordOffset);
16025
16026 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16027 MVT::getIntegerVT(32));
16028 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16029 MVT::getIntegerVT(32));
16030 }
16031 }
16032
16033 if (!UseOriginalSrc) {
16034 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16035 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16036 }
16037
16038 assert(IsSigned);
16039 SDValue Src2 =
16040 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16041
16042 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16043 : Intrinsic::amdgcn_udot4,
16044 SL, MVT::i64);
16045
16046 assert(!VT.isVector());
16047 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16048 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16049
16050 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16051 }
16052
16053 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16054 return SDValue();
16055
16056 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16057 // add x, sext (setcc) => usubo_carry x, 0, setcc
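// (zext(setcc) is 0 or 1, so the add folds directly into the carry input;
// sext(setcc) is 0 or -1, so adding it is a subtract with borrow, hence
// usubo_carry.)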
16058 unsigned Opc = LHS.getOpcode();
16059 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16060 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16061 std::swap(RHS, LHS);
16062
16063 Opc = RHS.getOpcode();
16064 switch (Opc) {
16065 default:
16066 break;
16067 case ISD::ZERO_EXTEND:
16068 case ISD::SIGN_EXTEND:
16069 case ISD::ANY_EXTEND: {
16070 auto Cond = RHS.getOperand(0);
16071 // If this won't be a real VOPC output, we would still need to insert an
16072 // extra instruction anyway.
16073 if (!isBoolSGPR(Cond))
16074 break;
16075 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16076 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16077 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16078 return DAG.getNode(Opc, SL, VTList, Args);
16079 }
16080 case ISD::UADDO_CARRY: {
16081 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16082 if (!isNullConstant(RHS.getOperand(1)))
16083 break;
16084 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16085 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16086 }
16087 }
16088 return SDValue();
16089}
16090
16091SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16092 DAGCombinerInfo &DCI) const {
16093 SelectionDAG &DAG = DCI.DAG;
16094 SDLoc DL(N);
16095 EVT VT = N->getValueType(0);
16096 SDValue N0 = N->getOperand(0);
16097 SDValue N1 = N->getOperand(1);
16098
16099 // The following folds transform PTRADDs into regular arithmetic in cases
16100 // where the PTRADD wouldn't be folded as an immediate offset into memory
16101 // instructions anyway. They are target-specific in that other targets might
16102 // prefer to not lose information about the pointer arithmetic.
16103
16104 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16105 // Adapted from DAGCombiner::visitADDLikeCommutative.
16106 SDValue V, K;
16107 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16108 SDNodeFlags ShlFlags = N1->getFlags();
16109 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16110 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16111 // preserved.
16112 SDNodeFlags NewShlFlags =
16113 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16114 ? SDNodeFlags::NoSignedWrap
16115 : SDNodeFlags();
16116 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16117 DCI.AddToWorklist(Inner.getNode());
16118 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16119 }
16120
16121 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16122 // performAddCombine.
16123 if (N1.getOpcode() == ISD::MUL) {
16124 if (Subtarget->hasMad64_32()) {
16125 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16126 return Folded;
16127 }
16128 }
16129
16130 // If the 32 low bits of the constant are all zero, there is nothing to fold
16131 // into an immediate offset, so it's better to eliminate the unnecessary
16132 // addition for the lower 32 bits than to preserve the PTRADD.
16133 // Analogous to a fold in performAddCombine.
16134 if (VT == MVT::i64) {
16135 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16136 return Folded;
16137 }
16138
16139 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16140 return SDValue();
16141
16142 SDValue X = N0;
16143 SDValue Y = N1.getOperand(0);
16144 SDValue Z = N1.getOperand(1);
16145 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16146 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16147
16148 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16149 Y->isDivergent() != Z->isDivergent()) {
16150 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16151 // y are uniform and z isn't.
16152 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16153 // z are uniform and y isn't.
16154 // The goal is to push uniform operands up in the computation, so that they
16155 // can be handled with scalar operations. We can't use reassociateScalarOps
16156 // for this since it requires two identical commutative operations to
16157 // reassociate.
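// E.g. if x and y are uniform but z is divergent, the inner x + y can be
// computed with a scalar add and only the final addition of z needs the VALU.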
16158 if (Y->isDivergent())
16159 std::swap(Y, Z);
16160 // If both additions in the original were NUW, reassociation preserves that.
16161 SDNodeFlags ReassocFlags =
16162 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16163 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16164 DCI.AddToWorklist(UniformInner.getNode());
16165 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16166 }
16167
16168 return SDValue();
16169}
16170
16171SDValue SITargetLowering::performSubCombine(SDNode *N,
16172 DAGCombinerInfo &DCI) const {
16173 SelectionDAG &DAG = DCI.DAG;
16174 EVT VT = N->getValueType(0);
16175
16176 if (VT == MVT::i64) {
16177 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16178 return Folded;
16179 }
16180
16181 if (VT != MVT::i32)
16182 return SDValue();
16183
16184 SDLoc SL(N);
16185 SDValue LHS = N->getOperand(0);
16186 SDValue RHS = N->getOperand(1);
16187
16188 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16189 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
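// ((sext i1 %cc) is 0 or -1, so subtracting it is equivalent to adding the
// carry, which is why sext maps to uaddo_carry here.)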
16190 unsigned Opc = RHS.getOpcode();
16191 switch (Opc) {
16192 default:
16193 break;
16194 case ISD::ZERO_EXTEND:
16195 case ISD::SIGN_EXTEND:
16196 case ISD::ANY_EXTEND: {
16197 auto Cond = RHS.getOperand(0);
16198 // If this won't be a real VOPC output, we would still need to insert an
16199 // extra instruction anyway.
16200 if (!isBoolSGPR(Cond))
16201 break;
16202 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16203 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16204 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16205 return DAG.getNode(Opc, SL, VTList, Args);
16206 }
16207 }
16208
16209 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16210 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16211 if (!isNullConstant(LHS.getOperand(1)))
16212 return SDValue();
16213 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16214 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16215 }
16216 return SDValue();
16217}
16218
16219SDValue
16220SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16221 DAGCombinerInfo &DCI) const {
16222
16223 if (N->getValueType(0) != MVT::i32)
16224 return SDValue();
16225
16226 if (!isNullConstant(N->getOperand(1)))
16227 return SDValue();
16228
16229 SelectionDAG &DAG = DCI.DAG;
16230 SDValue LHS = N->getOperand(0);
16231
16232 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16233 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16234 unsigned LHSOpc = LHS.getOpcode();
16235 unsigned Opc = N->getOpcode();
16236 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16237 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16238 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16239 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16240 }
16241 return SDValue();
16242}
16243
16244SDValue SITargetLowering::performFAddCombine(SDNode *N,
16245 DAGCombinerInfo &DCI) const {
16246 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16247 return SDValue();
16248
16249 SelectionDAG &DAG = DCI.DAG;
16250 EVT VT = N->getValueType(0);
16251
16252 SDLoc SL(N);
16253 SDValue LHS = N->getOperand(0);
16254 SDValue RHS = N->getOperand(1);
16255
16256 // These should really be instruction patterns, but writing patterns with
16257 // source modifiers is a pain.
16258
16259 // fadd (fadd (a, a), b) -> mad 2.0, a, b
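// (a + a is rewritten as a * 2.0, so the whole expression fits in a single
// mad/fma; 2.0 is an inline constant and costs no extra register.)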
16260 if (LHS.getOpcode() == ISD::FADD) {
16261 SDValue A = LHS.getOperand(0);
16262 if (A == LHS.getOperand(1)) {
16263 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16264 if (FusedOp != 0) {
16265 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16266 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16267 }
16268 }
16269 }
16270
16271 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16272 if (RHS.getOpcode() == ISD::FADD) {
16273 SDValue A = RHS.getOperand(0);
16274 if (A == RHS.getOperand(1)) {
16275 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16276 if (FusedOp != 0) {
16277 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16278 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16279 }
16280 }
16281 }
16282
16283 return SDValue();
16284}
16285
16286SDValue SITargetLowering::performFSubCombine(SDNode *N,
16287 DAGCombinerInfo &DCI) const {
16288 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16289 return SDValue();
16290
16291 SelectionDAG &DAG = DCI.DAG;
16292 SDLoc SL(N);
16293 EVT VT = N->getValueType(0);
16294 assert(!VT.isVector());
16295
16296 // Try to get the fneg to fold into the source modifier. This undoes generic
16297 // DAG combines and folds them into the mad.
16298 //
16299 // Only do this if we are not trying to support denormals. v_mad_f32 does
16300 // not support denormals ever.
16301 SDValue LHS = N->getOperand(0);
16302 SDValue RHS = N->getOperand(1);
16303 if (LHS.getOpcode() == ISD::FADD) {
16304 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16305 SDValue A = LHS.getOperand(0);
16306 if (A == LHS.getOperand(1)) {
16307 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16308 if (FusedOp != 0) {
16309 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16310 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16311
16312 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16313 }
16314 }
16315 }
16316
16317 if (RHS.getOpcode() == ISD::FADD) {
16318 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16319
16320 SDValue A = RHS.getOperand(0);
16321 if (A == RHS.getOperand(1)) {
16322 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16323 if (FusedOp != 0) {
16324 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16325 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16326 }
16327 }
16328 }
16329
16330 return SDValue();
16331}
16332
16333SDValue SITargetLowering::performFDivCombine(SDNode *N,
16334 DAGCombinerInfo &DCI) const {
16335 SelectionDAG &DAG = DCI.DAG;
16336 SDLoc SL(N);
16337 EVT VT = N->getValueType(0);
16338 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16339 return SDValue();
16340
16341 SDValue LHS = N->getOperand(0);
16342 SDValue RHS = N->getOperand(1);
16343
16344 SDNodeFlags Flags = N->getFlags();
16345 SDNodeFlags RHSFlags = RHS->getFlags();
16346 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16347 !RHS->hasOneUse())
16348 return SDValue();
16349
16350 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16351 bool IsNegative = false;
16352 if (CLHS->isExactlyValue(1.0) ||
16353 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16354 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16355 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16356 if (RHS.getOpcode() == ISD::FSQRT) {
16357 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16358 SDValue Rsq =
16359 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16360 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16361 }
16362 }
16363 }
16364
16365 return SDValue();
16366}
16367
16368SDValue SITargetLowering::performFMulCombine(SDNode *N,
16369 DAGCombinerInfo &DCI) const {
16370 SelectionDAG &DAG = DCI.DAG;
16371 EVT VT = N->getValueType(0);
16372 EVT ScalarVT = VT.getScalarType();
16373 EVT IntVT = VT.changeElementType(MVT::i32);
16374
16375 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16376 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16377 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16378 return SDValue();
16379 }
16380
16381 SDValue LHS = N->getOperand(0);
16382 SDValue RHS = N->getOperand(1);
16383
16384 // It is cheaper to materialize i32 inline constants than f16 or f64
16385 // (or even non-inline f32) values; this is possible via ldexp,
16386 // as shown below:
16387 //
16388 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16389 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16390 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
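// E.g. fmul x, (select y, 8.0, 0.125) -> ldexp(x, (select i32 y, 3, -3)),
// replacing the FP literals with inline i32 exponents.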
16391 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16392 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16393 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16394 if (!TrueNode)
16395 return SDValue();
16396 const ConstantFPSDNode *FalseNode =
16397 isConstOrConstSplatFP(RHS.getOperand(2));
16398 if (!FalseNode)
16399 return SDValue();
16400
16401 if (TrueNode->isNegative() != FalseNode->isNegative())
16402 return SDValue();
16403
16404 // For f32, only non-inline constants should be transformed.
16405 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16406 if (ScalarVT == MVT::f32 &&
16407 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16408 TII->isInlineConstant(FalseNode->getValueAPF()))
16409 return SDValue();
16410
16411 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16412 if (TrueNodeExpVal == INT_MIN)
16413 return SDValue();
16414 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16415 if (FalseNodeExpVal == INT_MIN)
16416 return SDValue();
16417
16418 SDLoc SL(N);
16419 SDValue SelectNode =
16420 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16421 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16422 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16423
16424 LHS = TrueNode->isNegative()
16425 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16426 : LHS;
16427
16428 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16429 }
16430
16431 return SDValue();
16432}
16433
16434SDValue SITargetLowering::performFMACombine(SDNode *N,
16435 DAGCombinerInfo &DCI) const {
16436 SelectionDAG &DAG = DCI.DAG;
16437 EVT VT = N->getValueType(0);
16438 SDLoc SL(N);
16439
16440 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16441 return SDValue();
16442
16443 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16444 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16445 SDValue Op1 = N->getOperand(0);
16446 SDValue Op2 = N->getOperand(1);
16447 SDValue FMA = N->getOperand(2);
16448
16449 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16450 Op2.getOpcode() != ISD::FP_EXTEND)
16451 return SDValue();
16452
16453 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16454 // regardless of the denorm mode setting. Therefore,
16455 // fp-contract is sufficient to allow generating fdot2.
16456 const TargetOptions &Options = DAG.getTarget().Options;
16457 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16458 (N->getFlags().hasAllowContract() &&
16459 FMA->getFlags().hasAllowContract())) {
16460 Op1 = Op1.getOperand(0);
16461 Op2 = Op2.getOperand(0);
16462 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16463 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16464 return SDValue();
16465
16466 SDValue Vec1 = Op1.getOperand(0);
16467 SDValue Idx1 = Op1.getOperand(1);
16468 SDValue Vec2 = Op2.getOperand(0);
16469
16470 SDValue FMAOp1 = FMA.getOperand(0);
16471 SDValue FMAOp2 = FMA.getOperand(1);
16472 SDValue FMAAcc = FMA.getOperand(2);
16473
16474 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16475 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16476 return SDValue();
16477
16478 FMAOp1 = FMAOp1.getOperand(0);
16479 FMAOp2 = FMAOp2.getOperand(0);
16480 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16481 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16482 return SDValue();
16483
16484 SDValue Vec3 = FMAOp1.getOperand(0);
16485 SDValue Vec4 = FMAOp2.getOperand(0);
16486 SDValue Idx2 = FMAOp1.getOperand(1);
16487
16488 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16489 // Idx1 and Idx2 cannot be the same.
16490 Idx1 == Idx2)
16491 return SDValue();
16492
16493 if (Vec1 == Vec2 || Vec3 == Vec4)
16494 return SDValue();
16495
16496 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16497 return SDValue();
16498
16499 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16500 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16501 DAG.getTargetConstant(0, SL, MVT::i1));
16502 }
16503 }
16504 return SDValue();
16505}
16506
16507SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16508 DAGCombinerInfo &DCI) const {
16509 SelectionDAG &DAG = DCI.DAG;
16510 SDLoc SL(N);
16511
16512 SDValue LHS = N->getOperand(0);
16513 SDValue RHS = N->getOperand(1);
16514 EVT VT = LHS.getValueType();
16515 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16516
16517 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16518 if (!CRHS) {
16519 CRHS = dyn_cast<ConstantSDNode>(LHS);
16520 if (CRHS) {
16521 std::swap(LHS, RHS);
16522 CC = getSetCCSwappedOperands(CC);
16523 }
16524 }
16525
16526 if (CRHS) {
16527 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16528 isBoolSGPR(LHS.getOperand(0))) {
16529 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16530 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16531 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16532 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16533 if ((CRHS->isAllOnes() &&
16534 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16535 (CRHS->isZero() &&
16536 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16537 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16538 DAG.getAllOnesConstant(SL, MVT::i1));
16539 if ((CRHS->isAllOnes() &&
16540 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16541 (CRHS->isZero() &&
16542 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16543 return LHS.getOperand(0);
16544 }
16545
16546 const APInt &CRHSVal = CRHS->getAPIntValue();
16547 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16548 LHS.getOpcode() == ISD::SELECT &&
16549 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16550 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16551 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16552 isBoolSGPR(LHS.getOperand(0))) {
16553 // Given CT != FT:
16554 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16555 // setcc (select cc, CT, CF), CF, ne => cc
16556 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16557 // setcc (select cc, CT, CF), CT, eq => cc
16558 const APInt &CT = LHS.getConstantOperandAPInt(1);
16559 const APInt &CF = LHS.getConstantOperandAPInt(2);
16560
16561 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16562 (CT == CRHSVal && CC == ISD::SETNE))
16563 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16564 DAG.getAllOnesConstant(SL, MVT::i1));
16565 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16566 (CT == CRHSVal && CC == ISD::SETEQ))
16567 return LHS.getOperand(0);
16568 }
16569 }
16570
16571 // Eliminate setcc by using carryout from add/sub instruction
16572
16573 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16574 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16575 // similarly for subtraction
16576
16577 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16578 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
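// E.g. a 64-bit overflow check "setcc (add i64 x, y), x, ult" no longer needs
// a separate 64-bit compare: the carry-out of the high-half UADDO_CARRY is
// already the desired i1 result.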
16579
16580 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16581 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
16582 (CC == ISD::SETUGT &&
16583 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
16584 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16585 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16586 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16587
16588 SDValue Op0 = LHS.getOperand(0);
16589 SDValue Op1 = LHS.getOperand(1);
16590
16591 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16592 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16593
16594 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16595 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16596
16597 SDValue NodeLo =
16598 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16599 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16600
16601 SDValue CarryInHi = NodeLo.getValue(1);
16602 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16603 SL, DAG.getVTList(MVT::i32, MVT::i1),
16604 {Op0Hi, Op1Hi, CarryInHi});
16605
16606 SDValue ResultLo = NodeLo.getValue(0);
16607 SDValue ResultHi = NodeHi.getValue(0);
16608
16609 SDValue JoinedResult =
16610 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16611
16612 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16613 SDValue Overflow = NodeHi.getValue(1);
16614 DCI.CombineTo(LHS.getNode(), Result);
16615 return Overflow;
16616 }
16617
16618 if (VT != MVT::f32 && VT != MVT::f64 &&
16619 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16620 return SDValue();
16621
16622 // Match isinf/isfinite pattern
16623 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16624 // (fcmp one (fabs x), inf) -> (fp_class x,
16625 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16626 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16627 LHS.getOpcode() == ISD::FABS) {
16628 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16629 if (!CRHS)
16630 return SDValue();
16631
16632 const APFloat &APF = CRHS->getValueAPF();
16633 if (APF.isInfinity() && !APF.isNegative()) {
16634 const unsigned IsInfMask =
16635 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16636 const unsigned IsFiniteMask =
16637 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16638 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16639 SIInstrFlags::P_SUBNORMAL;
16640 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16641 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16642 DAG.getConstant(Mask, SL, MVT::i32));
16643 }
16644 }
16645
16646 return SDValue();
16647}
16648
16649SDValue
16650SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16651 DAGCombinerInfo &DCI) const {
16652 SelectionDAG &DAG = DCI.DAG;
16653 SDLoc SL(N);
16654 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16655
16656 SDValue Src = N->getOperand(0);
16657 SDValue Shift = N->getOperand(0);
16658
16659 // TODO: Extend type shouldn't matter (assuming legal types).
16660 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16661 Shift = Shift.getOperand(0);
16662
16663 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16664 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16665 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16666 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16667 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16668 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16669 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16670 SDValue Shifted = DAG.getZExtOrTrunc(
16671 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16672
16673 unsigned ShiftOffset = 8 * Offset;
16674 if (Shift.getOpcode() == ISD::SHL)
16675 ShiftOffset -= C->getZExtValue();
16676 else
16677 ShiftOffset += C->getZExtValue();
16678
16679 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16680 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16681 MVT::f32, Shifted);
16682 }
16683 }
16684 }
16685
16686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16687 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16688 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16689 // We simplified Src. If this node is not dead, visit it again so it is
16690 // folded properly.
16691 if (N->getOpcode() != ISD::DELETED_NODE)
16692 DCI.AddToWorklist(N);
16693 return SDValue(N, 0);
16694 }
16695
16696 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16697 if (SDValue DemandedSrc =
16698 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16699 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16700
16701 return SDValue();
16702}
16703
16704SDValue SITargetLowering::performClampCombine(SDNode *N,
16705 DAGCombinerInfo &DCI) const {
16706 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16707 if (!CSrc)
16708 return SDValue();
16709
16710 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16711 const APFloat &F = CSrc->getValueAPF();
16712 APFloat Zero = APFloat::getZero(F.getSemantics());
16713 if (F < Zero ||
16714 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16715 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16716 }
16717
16718 APFloat One(F.getSemantics(), "1.0");
16719 if (F > One)
16720 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16721
16722 return SDValue(CSrc, 0);
16723}
16724
16725SDValue SITargetLowering::performSelectCombine(SDNode *N,
16726 DAGCombinerInfo &DCI) const {
16727
16728 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16729 // integer).
16730 // Detect when CMP and SELECT use the same constant and fold them to avoid
16731 // loading the constant twice. Specifically handles patterns like:
16732 // %cmp = icmp eq i32 %val, 4242
16733 // %sel = select i1 %cmp, i32 4242, i32 %other
16734 // It can be optimized to reuse %val instead of 4242 in select.
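// i.e. the folded form is:
// %sel = select i1 %cmp, i32 %val, i32 %other
// so the 32-bit literal only has to be materialized once, for the compare.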
16735 SDValue Cond = N->getOperand(0);
16736 SDValue TrueVal = N->getOperand(1);
16737 SDValue FalseVal = N->getOperand(2);
16738
16739 // Check if condition is a comparison.
16740 if (Cond.getOpcode() != ISD::SETCC)
16741 return SDValue();
16742
16743 SDValue LHS = Cond.getOperand(0);
16744 SDValue RHS = Cond.getOperand(1);
16745 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16746
16747 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16748 bool isInteger = LHS.getValueType().isInteger();
16749
16750 // Handle simple floating-point and integer types only.
16751 if (!isFloatingPoint && !isInteger)
16752 return SDValue();
16753
16754 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16755 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16756 if (!isEquality && !isNonEquality)
16757 return SDValue();
16758
16759 SDValue ArgVal, ConstVal;
16760 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16761 (isInteger && isa<ConstantSDNode>(RHS))) {
16762 ConstVal = RHS;
16763 ArgVal = LHS;
16764 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16765 (isInteger && isa<ConstantSDNode>(LHS))) {
16766 ConstVal = LHS;
16767 ArgVal = RHS;
16768 } else {
16769 return SDValue();
16770 }
16771
16772 // Skip optimization for inlinable immediates.
16773 if (isFloatingPoint) {
16774 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16775 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16776 return SDValue();
16777 } else {
16778 if (AMDGPU::isInlinableIntLiteral(
16779 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16780 return SDValue();
16781 }
16782
16783 // For equality and non-equality comparisons, patterns:
16784 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16785 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16786 if (!(isEquality && TrueVal == ConstVal) &&
16787 !(isNonEquality && FalseVal == ConstVal))
16788 return SDValue();
16789
16790 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16791 SDValue SelectRHS =
16792 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16793 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16794 SelectLHS, SelectRHS);
16795}
16796
16797 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16798 DAGCombinerInfo &DCI) const {
16799 switch (N->getOpcode()) {
16800 case ISD::ADD:
16801 case ISD::SUB:
16802 case ISD::SHL:
16803 case ISD::SRL:
16804 case ISD::SRA:
16805 case ISD::AND:
16806 case ISD::OR:
16807 case ISD::XOR:
16808 case ISD::MUL:
16809 case ISD::SETCC:
16810 case ISD::SELECT:
16811 case ISD::SMIN:
16812 case ISD::SMAX:
16813 case ISD::UMIN:
16814 case ISD::UMAX:
16815 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16816 return Res;
16817 break;
16818 default:
16819 break;
16820 }
16821
16822 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16823 return SDValue();
16824
16825 switch (N->getOpcode()) {
16826 case ISD::ADD:
16827 return performAddCombine(N, DCI);
16828 case ISD::PTRADD:
16829 return performPtrAddCombine(N, DCI);
16830 case ISD::SUB:
16831 return performSubCombine(N, DCI);
16832 case ISD::UADDO_CARRY:
16833 case ISD::USUBO_CARRY:
16834 return performAddCarrySubCarryCombine(N, DCI);
16835 case ISD::FADD:
16836 return performFAddCombine(N, DCI);
16837 case ISD::FSUB:
16838 return performFSubCombine(N, DCI);
16839 case ISD::FDIV:
16840 return performFDivCombine(N, DCI);
16841 case ISD::FMUL:
16842 return performFMulCombine(N, DCI);
16843 case ISD::SETCC:
16844 return performSetCCCombine(N, DCI);
16845 case ISD::SELECT:
16846 if (auto Res = performSelectCombine(N, DCI))
16847 return Res;
16848 break;
16849 case ISD::FMAXNUM:
16850 case ISD::FMINNUM:
16851 case ISD::FMAXNUM_IEEE:
16852 case ISD::FMINNUM_IEEE:
16853 case ISD::FMAXIMUM:
16854 case ISD::FMINIMUM:
16855 case ISD::FMAXIMUMNUM:
16856 case ISD::FMINIMUMNUM:
16857 case ISD::SMAX:
16858 case ISD::SMIN:
16859 case ISD::UMAX:
16860 case ISD::UMIN:
16861 case AMDGPUISD::FMIN_LEGACY:
16862 case AMDGPUISD::FMAX_LEGACY:
16863 return performMinMaxCombine(N, DCI);
16864 case ISD::FMA:
16865 return performFMACombine(N, DCI);
16866 case ISD::AND:
16867 return performAndCombine(N, DCI);
16868 case ISD::OR:
16869 return performOrCombine(N, DCI);
16870 case ISD::FSHR: {
16871 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16872 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16873 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16874 return matchPERM(N, DCI);
16875 }
16876 break;
16877 }
16878 case ISD::XOR:
16879 return performXorCombine(N, DCI);
16880 case ISD::ZERO_EXTEND:
16881 return performZeroExtendCombine(N, DCI);
16882 case ISD::SIGN_EXTEND_INREG:
16883 return performSignExtendInRegCombine(N, DCI);
16884 case AMDGPUISD::FP_CLASS:
16885 return performClassCombine(N, DCI);
16886 case ISD::FCANONICALIZE:
16887 return performFCanonicalizeCombine(N, DCI);
16888 case AMDGPUISD::RCP:
16889 return performRcpCombine(N, DCI);
16890 case ISD::FLDEXP:
16891 case AMDGPUISD::FRACT:
16892 case AMDGPUISD::RSQ:
16893 case AMDGPUISD::RCP_LEGACY:
16894 case AMDGPUISD::RCP_IFLAG:
16895 case AMDGPUISD::RSQ_CLAMP: {
16896 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16897 SDValue Src = N->getOperand(0);
16898 if (Src.isUndef())
16899 return Src;
16900 break;
16901 }
16902 case ISD::SINT_TO_FP:
16903 case ISD::UINT_TO_FP:
16904 return performUCharToFloatCombine(N, DCI);
16905 case ISD::FCOPYSIGN:
16906 return performFCopySignCombine(N, DCI);
16907 case AMDGPUISD::CVT_F32_UBYTE0:
16908 case AMDGPUISD::CVT_F32_UBYTE1:
16909 case AMDGPUISD::CVT_F32_UBYTE2:
16910 case AMDGPUISD::CVT_F32_UBYTE3:
16911 return performCvtF32UByteNCombine(N, DCI);
16912 case AMDGPUISD::FMED3:
16913 return performFMed3Combine(N, DCI);
16914 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16915 return performCvtPkRTZCombine(N, DCI);
16916 case AMDGPUISD::CLAMP:
16917 return performClampCombine(N, DCI);
16918 case ISD::SCALAR_TO_VECTOR: {
16919 SelectionDAG &DAG = DCI.DAG;
16920 EVT VT = N->getValueType(0);
16921
16922 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16923 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16924 SDLoc SL(N);
16925 SDValue Src = N->getOperand(0);
16926 EVT EltVT = Src.getValueType();
16927 if (EltVT != MVT::i16)
16928 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16929
16930 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16931 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16932 }
16933
16934 break;
16935 }
16936 case ISD::EXTRACT_VECTOR_ELT:
16937 return performExtractVectorEltCombine(N, DCI);
16938 case ISD::INSERT_VECTOR_ELT:
16939 return performInsertVectorEltCombine(N, DCI);
16940 case ISD::FP_ROUND:
16941 return performFPRoundCombine(N, DCI);
16942 case ISD::LOAD: {
16943 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16944 return Widened;
16945 [[fallthrough]];
16946 }
16947 default: {
16948 if (!DCI.isBeforeLegalize()) {
16949 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16950 return performMemSDNodeCombine(MemNode, DCI);
16951 }
16952
16953 break;
16954 }
16955 }
16956
16957 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16958}
16959
16960/// Helper function for adjustWritemask
16961static unsigned SubIdx2Lane(unsigned Idx) {
16962 switch (Idx) {
16963 default:
16964 return ~0u;
16965 case AMDGPU::sub0:
16966 return 0;
16967 case AMDGPU::sub1:
16968 return 1;
16969 case AMDGPU::sub2:
16970 return 2;
16971 case AMDGPU::sub3:
16972 return 3;
16973 case AMDGPU::sub4:
16974 return 4; // Possible with TFE/LWE
16975 }
16976}
16977
16978/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16979SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16980 SelectionDAG &DAG) const {
16981 unsigned Opcode = Node->getMachineOpcode();
16982
16983 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16984 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16985 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16986 return Node; // not implemented for D16
16987
16988 SDNode *Users[5] = {nullptr};
16989 unsigned Lane = 0;
16990 unsigned DmaskIdx =
16991 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16992 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16993 unsigned NewDmask = 0;
16994 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16995 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16996 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16997 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16998 unsigned TFCLane = 0;
16999 bool HasChain = Node->getNumValues() > 1;
17000
17001 if (OldDmask == 0) {
17002 // These are folded out, but on the chance it happens don't assert.
17003 return Node;
17004 }
17005
17006 unsigned OldBitsSet = llvm::popcount(OldDmask);
17007 // Work out which is the TFE/LWE lane if that is enabled.
17008 if (UsesTFC) {
17009 TFCLane = OldBitsSet;
17010 }
17011
17012 // Try to figure out the used register components
17013 for (SDUse &Use : Node->uses()) {
17014
17015 // Don't look at users of the chain.
17016 if (Use.getResNo() != 0)
17017 continue;
17018
17019 SDNode *User = Use.getUser();
17020
17021 // Abort if we can't understand the usage
17022 if (!User->isMachineOpcode() ||
17023 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17024 return Node;
17025
17026 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17027 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17028 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17029 // set, etc.
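// E.g. for OldDmask == 0b1010, Lane==0 corresponds to component Y and
// Lane==1 corresponds to component W.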
17030 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17031 if (Lane == ~0u)
17032 return Node;
17033
17034 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17035 if (UsesTFC && Lane == TFCLane) {
17036 Users[Lane] = User;
17037 } else {
17038 // Set which texture component corresponds to the lane.
17039 unsigned Comp;
17040 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17041 Comp = llvm::countr_zero(Dmask);
17042 Dmask &= ~(1 << Comp);
17043 }
17044
17045 // Abort if we have more than one user per component.
17046 if (Users[Lane])
17047 return Node;
17048
17049 Users[Lane] = User;
17050 NewDmask |= 1 << Comp;
17051 }
17052 }
17053
17054 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17055 bool NoChannels = !NewDmask;
17056 if (NoChannels) {
17057 if (!UsesTFC) {
17058 // No uses of the result and not using TFC. Then do nothing.
17059 return Node;
17060 }
17061 // If the original dmask has one channel - then nothing to do
17062 if (OldBitsSet == 1)
17063 return Node;
17064 // Use an arbitrary dmask - required for the instruction to work
17065 NewDmask = 1;
17066 }
17067 // Abort if there's no change
17068 if (NewDmask == OldDmask)
17069 return Node;
17070
17071 unsigned BitsSet = llvm::popcount(NewDmask);
17072
17073 // Check for TFE or LWE - increase the number of channels by one to account
17074 // for the extra return value
17075 // This will need adjustment for D16 if this is also included in
17076 // adjustWriteMask (this function) but at present D16 are excluded.
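// E.g. two dmask bits plus TFE give NewChannels == 3, which is rounded up to
// a 4-register result type below.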
17077 unsigned NewChannels = BitsSet + UsesTFC;
17078
17079 int NewOpcode =
17080 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17081 assert(NewOpcode != -1 &&
17082 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17083 "failed to find equivalent MIMG op");
17084
17085 // Adjust the writemask in the node
17086 SmallVector<SDValue, 12> Ops;
17087 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17088 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17089 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17090
17091 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17092
17093 MVT ResultVT = NewChannels == 1
17094 ? SVT
17095 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17096 : NewChannels == 5 ? 8
17097 : NewChannels);
17098 SDVTList NewVTList =
17099 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17100
17101 MachineSDNode *NewNode =
17102 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17103
17104 if (HasChain) {
17105 // Update chain.
17106 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17107 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17108 }
17109
17110 if (NewChannels == 1) {
17111 assert(Node->hasNUsesOfValue(1, 0));
17112 SDNode *Copy =
17113 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17114 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17115 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17116 return nullptr;
17117 }
17118
17119 // Update the users of the node with the new indices
17120 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17121 SDNode *User = Users[i];
17122 if (!User) {
17123 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17124 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17125 if (i || !NoChannels)
17126 continue;
17127 } else {
17128 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17129 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17130 if (NewUser != User) {
17131 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17132 DAG.RemoveDeadNode(User);
17133 }
17134 }
17135
17136 switch (Idx) {
17137 default:
17138 break;
17139 case AMDGPU::sub0:
17140 Idx = AMDGPU::sub1;
17141 break;
17142 case AMDGPU::sub1:
17143 Idx = AMDGPU::sub2;
17144 break;
17145 case AMDGPU::sub2:
17146 Idx = AMDGPU::sub3;
17147 break;
17148 case AMDGPU::sub3:
17149 Idx = AMDGPU::sub4;
17150 break;
17151 }
17152 }
17153
17154 DAG.RemoveDeadNode(Node);
17155 return nullptr;
17156}
17157
17158 static bool isFrameIndexOp(SDValue Op) {
17159 if (Op.getOpcode() == ISD::AssertZext)
17160 Op = Op.getOperand(0);
17161
17162 return isa<FrameIndexSDNode>(Op);
17163}
17164
17165/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17166/// with frame index operands.
17167 /// LLVM assumes that inputs to these instructions are registers.
17168SDNode *
17169 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17170 SelectionDAG &DAG) const {
17171 if (Node->getOpcode() == ISD::CopyToReg) {
17172 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17173 SDValue SrcVal = Node->getOperand(2);
17174
17175 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17176 // to try understanding copies to physical registers.
17177 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17178 SDLoc SL(Node);
17179 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17180 SDValue VReg = DAG.getRegister(
17181 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17182
17183 SDNode *Glued = Node->getGluedNode();
17184 SDValue ToVReg = DAG.getCopyToReg(
17185 Node->getOperand(0), SL, VReg, SrcVal,
17186 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17187 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17188 VReg, ToVReg.getValue(1));
17189 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17190 DAG.RemoveDeadNode(Node);
17191 return ToResultReg.getNode();
17192 }
17193 }
17194
17195 SmallVector<SDValue, 8> Ops;
17196 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17197 if (!isFrameIndexOp(Node->getOperand(i))) {
17198 Ops.push_back(Node->getOperand(i));
17199 continue;
17200 }
17201
17202 SDLoc DL(Node);
17203 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17204 Node->getOperand(i).getValueType(),
17205 Node->getOperand(i)),
17206 0));
17207 }
17208
17209 return DAG.UpdateNodeOperands(Node, Ops);
17210}
17211
17212/// Fold the instructions after selecting them.
17213/// Returns null if users were already updated.
17214 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17215 SelectionDAG &DAG) const {
17216 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17217 unsigned Opcode = Node->getMachineOpcode();
17218
17219 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17220 !TII->isGather4(Opcode) &&
17221 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17222 return adjustWritemask(Node, DAG);
17223 }
17224
17225 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17226 legalizeTargetIndependentNode(Node, DAG);
17227 return Node;
17228 }
17229
17230 switch (Opcode) {
17231 case AMDGPU::V_DIV_SCALE_F32_e64:
17232 case AMDGPU::V_DIV_SCALE_F64_e64: {
17233 // Satisfy the operand register constraint when one of the inputs is
17234 // undefined. Ordinarily each undef value will have its own implicit_def of
17235 // a vreg, so force these to use a single register.
17236 SDValue Src0 = Node->getOperand(1);
17237 SDValue Src1 = Node->getOperand(3);
17238 SDValue Src2 = Node->getOperand(5);
17239
17240 if ((Src0.isMachineOpcode() &&
17241 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17242 (Src0 == Src1 || Src0 == Src2))
17243 break;
17244
17245 MVT VT = Src0.getValueType().getSimpleVT();
17246 const TargetRegisterClass *RC =
17247 getRegClassFor(VT, Src0.getNode()->isDivergent());
17248
17249 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17250 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17251
17252 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17253 Src0, SDValue());
17254
17255 // src0 must be the same register as src1 or src2, even if the value is
17256 // undefined, so make sure we don't violate this constraint.
17257 if (Src0.isMachineOpcode() &&
17258 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17259 if (Src1.isMachineOpcode() &&
17260 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17261 Src0 = Src1;
17262 else if (Src2.isMachineOpcode() &&
17263 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17264 Src0 = Src2;
17265 else {
17266 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17267 Src0 = UndefReg;
17268 Src1 = UndefReg;
17269 }
17270 } else
17271 break;
17272
17273 SmallVector<SDValue, 9> Ops(Node->ops());
17274 Ops[1] = Src0;
17275 Ops[3] = Src1;
17276 Ops[5] = Src2;
17277 Ops.push_back(ImpDef.getValue(1));
17278 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17279 }
17280 default:
17281 break;
17282 }
17283
17284 return Node;
17285}
17286
17287// Any MIMG instructions that use tfe or lwe require an initialization of the
17288// result register that will be written in the case of a memory access failure.
17289// The required code is also added to tie this init code to the result of the
17290// img instruction.
17291 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17292 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17293 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17294 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17295 MachineBasicBlock &MBB = *MI.getParent();
17296
17297 int DstIdx =
17298 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17299 unsigned InitIdx = 0;
17300
17301 if (TII->isImage(MI)) {
17302 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17303 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17304 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17305
17306 if (!TFE && !LWE) // intersect_ray
17307 return;
17308
17309 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17310 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17311 unsigned D16Val = D16 ? D16->getImm() : 0;
17312
17313 if (!TFEVal && !LWEVal)
17314 return;
17315
17316 // At least one of TFE or LWE is non-zero.
17317 // We have to insert a suitable initialization of the result value and
17318 // tie this to the dest of the image instruction.
17319
17320 // Calculate which dword we have to initialize to 0.
17321 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17322
17323 // check that dmask operand is found.
17324 assert(MO_Dmask && "Expected dmask operand in instruction");
17325
17326 unsigned dmask = MO_Dmask->getImm();
17327 // Determine the number of active lanes taking into account the
17328 // Gather4 special case
17329 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17330
17331 bool Packed = !Subtarget->hasUnpackedD16VMem();
17332
17333 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17334
17335 // Abandon attempt if the dst size isn't large enough
17336 // - this is in fact an error but this is picked up elsewhere and
17337 // reported correctly.
17338 uint32_t DstSize =
17339 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17340 if (DstSize < InitIdx)
17341 return;
17342 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17343 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17344 } else {
17345 return;
17346 }
17347
17348 const DebugLoc &DL = MI.getDebugLoc();
17349
17350 // Create a register for the initialization value.
17351 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17352 unsigned NewDst = 0; // Final initialized value will be in here
17353
17354 // If PRTStrictNull feature is enabled (the default) then initialize
17355 // all the result registers to 0, otherwise just the error indication
17356 // register (VGPRn+1)
17357 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17358 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17359
17360 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17361 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17362 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17363 // Initialize dword
17364 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17365 // clang-format off
17366 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17367 .addImm(0);
17368 // clang-format on
17369 // Insert into the super-reg
17370 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17371 .addReg(PrevDst)
17372 .addReg(SubReg)
17373 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17374
17375 PrevDst = NewDst;
17376 }
17377
17378 // Add as an implicit operand
17379 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17380
17381 // Tie the just added implicit operand to the dst
17382 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17383}
17384
17385/// Assign the register class depending on the number of
17386/// bits set in the writemask
17387 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17388 SDNode *Node) const {
17389 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17390
17391 MachineFunction *MF = MI.getParent()->getParent();
17392 MachineRegisterInfo &MRI = MF->getRegInfo();
17393
17394 if (TII->isVOP3(MI.getOpcode())) {
17395 // Make sure constant bus requirements are respected.
17396 TII->legalizeOperandsVOP3(MRI, MI);
17397
17398 if (TII->isMAI(MI)) {
17399 // The ordinary src0, src1, src2 were legalized above.
17400 //
17401 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17402 // as a separate instruction.
17403 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17404 AMDGPU::OpName::scale_src0);
17405 if (Src0Idx != -1) {
17406 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17407 AMDGPU::OpName::scale_src1);
17408 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17409 TII->usesConstantBus(MRI, MI, Src1Idx))
17410 TII->legalizeOpWithMove(MI, Src1Idx);
17411 }
17412 }
17413
17414 return;
17415 }
17416
17417 if (TII->isImage(MI))
17418 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17419}
17420
17421 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17422 uint64_t Val) {
17423 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17424 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17425}
17426
17427 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17428 const SDLoc &DL,
17429 SDValue Ptr) const {
17430 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17431
17432 // Build the half of the subregister with the constants before building the
17433 // full 128-bit register. If we are building multiple resource descriptors,
17434 // this will allow CSEing of the 2-component register.
17435 const SDValue Ops0[] = {
17436 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17437 buildSMovImm32(DAG, DL, 0),
17438 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17439 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17440 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17441
17442 SDValue SubRegHi = SDValue(
17443 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17444
17445 // Combine the constants and the pointer.
17446 const SDValue Ops1[] = {
17447 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17448 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17449 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17450
17451 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17452}
17453
17454/// Return a resource descriptor with the 'Add TID' bit enabled
17455/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17456/// of the resource descriptor) to create an offset, which is added to
17457/// the resource pointer.
17458 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17459 SDValue Ptr, uint32_t RsrcDword1,
17460 uint64_t RsrcDword2And3) const {
17461 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17462 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17463 if (RsrcDword1) {
17464 PtrHi =
17465 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17466 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17467 0);
17468 }
17469
17470 SDValue DataLo =
17471 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17472 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17473
17474 const SDValue Ops[] = {
17475 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17476 PtrLo,
17477 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17478 PtrHi,
17479 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17480 DataLo,
17481 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17482 DataHi,
17483 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17484
17485 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17486}
17487
17488//===----------------------------------------------------------------------===//
17489// SI Inline Assembly Support
17490//===----------------------------------------------------------------------===//
17491
17492std::pair<unsigned, const TargetRegisterClass *>
17493 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17494 StringRef Constraint,
17495 MVT VT) const {
17496 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17497
17498 const TargetRegisterClass *RC = nullptr;
17499 if (Constraint.size() == 1) {
17500 // Check if we cannot determine the bit size of the given value type. This
17501 // can happen, for example, in this situation where we have an empty struct
17502 // (size 0): `call void asm "", "v"({} poison)`.
17503 if (VT == MVT::Other)
17504 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17505 const unsigned BitWidth = VT.getSizeInBits();
17506 switch (Constraint[0]) {
17507 default:
17508 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17509 case 's':
17510 case 'r':
17511 switch (BitWidth) {
17512 case 16:
17513 RC = &AMDGPU::SReg_32RegClass;
17514 break;
17515 case 64:
17516 RC = &AMDGPU::SGPR_64RegClass;
17517 break;
17518 default:
17519 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17520 if (!RC)
17521 return std::pair(0U, nullptr);
17522 break;
17523 }
17524 break;
17525 case 'v':
17526 switch (BitWidth) {
17527 case 16:
17528 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17529 : &AMDGPU::VGPR_32_Lo256RegClass;
17530 break;
17531 default:
17532 RC = Subtarget->has1024AddressableVGPRs()
17533 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17534 : TRI->getVGPRClassForBitWidth(BitWidth);
17535 if (!RC)
17536 return std::pair(0U, nullptr);
17537 break;
17538 }
17539 break;
17540 case 'a':
17541 if (!Subtarget->hasMAIInsts())
17542 break;
17543 switch (BitWidth) {
17544 case 16:
17545 RC = &AMDGPU::AGPR_32RegClass;
17546 break;
17547 default:
17548 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17549 if (!RC)
17550 return std::pair(0U, nullptr);
17551 break;
17552 }
17553 break;
17554 }
17555 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17556 const unsigned BitWidth = VT.getSizeInBits();
17557 switch (BitWidth) {
17558 case 16:
17559 RC = &AMDGPU::AV_32RegClass;
17560 break;
17561 default:
17562 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17563 if (!RC)
17564 return std::pair(0U, nullptr);
17565 break;
17566 }
17567 }
17568
17569 // We actually support i128, i16 and f16 as inline parameters
17570 // even if they are not reported as legal
17571 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17572 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17573 return std::pair(0U, RC);
17574
17575 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17576 if (Kind != '\0') {
17577 if (Kind == 'v') {
17578 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17579 } else if (Kind == 's') {
17580 RC = &AMDGPU::SGPR_32RegClass;
17581 } else if (Kind == 'a') {
17582 RC = &AMDGPU::AGPR_32RegClass;
17583 }
17584
17585 if (RC) {
17586 if (NumRegs > 1) {
17587 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17588 return std::pair(0U, nullptr);
17589
17590 uint32_t Width = NumRegs * 32;
17591 // Prohibit constraints for register ranges with a width that does not
17592 // match the required type.
17593 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17594 return std::pair(0U, nullptr);
17595
17596 MCRegister Reg = RC->getRegister(Idx);
17597 if (SIRegisterInfo::isVGPRClass(RC))
17598 RC = TRI->getVGPRClassForBitWidth(Width);
17599 else if (SIRegisterInfo::isSGPRClass(RC))
17600 RC = TRI->getSGPRClassForBitWidth(Width);
17601 else if (SIRegisterInfo::isAGPRClass(RC))
17602 RC = TRI->getAGPRClassForBitWidth(Width);
17603 if (RC) {
17604 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17605 if (!Reg) {
17606 // The register class does not contain the requested register,
17607 // e.g., because it is an SGPR pair that would violate alignment
17608 // requirements.
17609 return std::pair(0U, nullptr);
17610 }
17611 return std::pair(Reg, RC);
17612 }
17613 }
17614
17615 // Check for lossy scalar/vector conversions.
17616 if (VT.isVector() && VT.getSizeInBits() != 32)
17617 return std::pair(0U, nullptr);
17618 if (Idx < RC->getNumRegs())
17619 return std::pair(RC->getRegister(Idx), RC);
17620 return std::pair(0U, nullptr);
17621 }
17622 }
17623
17624 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17625 if (Ret.first)
17626 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17627
17628 return Ret;
17629}
17630
17631static bool isImmConstraint(StringRef Constraint) {
17632 if (Constraint.size() == 1) {
17633 switch (Constraint[0]) {
17634 default:
17635 break;
17636 case 'I':
17637 case 'J':
17638 case 'A':
17639 case 'B':
17640 case 'C':
17641 return true;
17642 }
17643 } else if (Constraint == "DA" || Constraint == "DB") {
17644 return true;
17645 }
17646 return false;
17647}
17648
17649 SITargetLowering::ConstraintType
17650 SITargetLowering::getConstraintType(StringRef Constraint) const {
17651 if (Constraint.size() == 1) {
17652 switch (Constraint[0]) {
17653 default:
17654 break;
17655 case 's':
17656 case 'v':
17657 case 'a':
17658 return C_RegisterClass;
17659 }
17660 } else if (Constraint.size() == 2) {
17661 if (Constraint == "VA")
17662 return C_RegisterClass;
17663 }
17664 if (isImmConstraint(Constraint)) {
17665 return C_Other;
17666 }
17667 return TargetLowering::getConstraintType(Constraint);
17668}
17669
17670static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17671 if (Size < 64) {
17672 Val = Val & maskTrailingOnes<uint64_t>(Size);
17673 }
17674 return Val;
17675}
17676
17678 StringRef Constraint,
17679 std::vector<SDValue> &Ops,
17680 SelectionDAG &DAG) const {
17681 if (isImmConstraint(Constraint)) {
17682 uint64_t Val;
17683 if (getAsmOperandConstVal(Op, Val) &&
17684 checkAsmConstraintVal(Op, Constraint, Val)) {
17685 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17686 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17687 }
17688 } else {
17689 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17690 }
17691}
17692
17693 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17694 unsigned Size = Op.getScalarValueSizeInBits();
17695 if (Size > 64)
17696 return false;
17697
17698 if (Size == 16 && !Subtarget->has16BitInsts())
17699 return false;
17700
17701 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17702 Val = C->getSExtValue();
17703 return true;
17704 }
17705 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17706 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17707 return true;
17708 }
17709 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17710 if (Size != 16 || Op.getNumOperands() != 2)
17711 return false;
17712 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17713 return false;
17714 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17715 Val = C->getSExtValue();
17716 return true;
17717 }
17718 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17719 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17720 return true;
17721 }
17722 }
17723
17724 return false;
17725}
17726
17727 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17728 uint64_t Val) const {
17729 if (Constraint.size() == 1) {
17730 switch (Constraint[0]) {
17731 case 'I':
17732 return AMDGPU::isInlinableIntLiteral(Val);
17733 case 'J':
17734 return isInt<16>(Val);
17735 case 'A':
17736 return checkAsmConstraintValA(Op, Val);
17737 case 'B':
17738 return isInt<32>(Val);
17739 case 'C':
17740 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17741 AMDGPU::isInlinableIntLiteral(Val);
17742 default:
17743 break;
17744 }
17745 } else if (Constraint.size() == 2) {
17746 if (Constraint == "DA") {
17747 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17748 int64_t LoBits = static_cast<int32_t>(Val);
17749 return checkAsmConstraintValA(Op, HiBits, 32) &&
17750 checkAsmConstraintValA(Op, LoBits, 32);
17751 }
17752 if (Constraint == "DB") {
17753 return true;
17754 }
17755 }
17756 llvm_unreachable("Invalid asm constraint");
17757}
17758
17759bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17760 unsigned MaxSize) const {
17761 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17762 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17763 if (Size == 16) {
17764 MVT VT = Op.getSimpleValueType();
17765 switch (VT.SimpleTy) {
17766 default:
17767 return false;
17768 case MVT::i16:
17769 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17770 case MVT::f16:
17771 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17772 case MVT::bf16:
17773 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17774 case MVT::v2i16:
17775 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17776 case MVT::v2f16:
17777 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17778 case MVT::v2bf16:
17779 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17780 }
17781 }
17782 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17783 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17784 return true;
17785 return false;
17786}
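
// A minimal standalone sketch (unused by the lowering) of the integer subset
// of the inline-operand rule applied above: the hardware encodes the integers
// -16..64 directly in the operand field. The floating-point inline constants
// (+-0.0, +-0.5, +-1.0, +-2.0, +-4.0 and, with hasInv2PiInlineImm(), 1/(2*pi))
// are handled by the AMDGPU::isInlinableLiteral* helpers and omitted here.
LLVM_ATTRIBUTE_UNUSED
static bool isInlineImmIntSketch(int64_t V) { return V >= -16 && V <= 64; }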
17787
17788static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17789 switch (UnalignedClassID) {
17790 case AMDGPU::VReg_64RegClassID:
17791 return AMDGPU::VReg_64_Align2RegClassID;
17792 case AMDGPU::VReg_96RegClassID:
17793 return AMDGPU::VReg_96_Align2RegClassID;
17794 case AMDGPU::VReg_128RegClassID:
17795 return AMDGPU::VReg_128_Align2RegClassID;
17796 case AMDGPU::VReg_160RegClassID:
17797 return AMDGPU::VReg_160_Align2RegClassID;
17798 case AMDGPU::VReg_192RegClassID:
17799 return AMDGPU::VReg_192_Align2RegClassID;
17800 case AMDGPU::VReg_224RegClassID:
17801 return AMDGPU::VReg_224_Align2RegClassID;
17802 case AMDGPU::VReg_256RegClassID:
17803 return AMDGPU::VReg_256_Align2RegClassID;
17804 case AMDGPU::VReg_288RegClassID:
17805 return AMDGPU::VReg_288_Align2RegClassID;
17806 case AMDGPU::VReg_320RegClassID:
17807 return AMDGPU::VReg_320_Align2RegClassID;
17808 case AMDGPU::VReg_352RegClassID:
17809 return AMDGPU::VReg_352_Align2RegClassID;
17810 case AMDGPU::VReg_384RegClassID:
17811 return AMDGPU::VReg_384_Align2RegClassID;
17812 case AMDGPU::VReg_512RegClassID:
17813 return AMDGPU::VReg_512_Align2RegClassID;
17814 case AMDGPU::VReg_1024RegClassID:
17815 return AMDGPU::VReg_1024_Align2RegClassID;
17816 case AMDGPU::AReg_64RegClassID:
17817 return AMDGPU::AReg_64_Align2RegClassID;
17818 case AMDGPU::AReg_96RegClassID:
17819 return AMDGPU::AReg_96_Align2RegClassID;
17820 case AMDGPU::AReg_128RegClassID:
17821 return AMDGPU::AReg_128_Align2RegClassID;
17822 case AMDGPU::AReg_160RegClassID:
17823 return AMDGPU::AReg_160_Align2RegClassID;
17824 case AMDGPU::AReg_192RegClassID:
17825 return AMDGPU::AReg_192_Align2RegClassID;
17826 case AMDGPU::AReg_256RegClassID:
17827 return AMDGPU::AReg_256_Align2RegClassID;
17828 case AMDGPU::AReg_512RegClassID:
17829 return AMDGPU::AReg_512_Align2RegClassID;
17830 case AMDGPU::AReg_1024RegClassID:
17831 return AMDGPU::AReg_1024_Align2RegClassID;
17832 default:
17833 return -1;
17834 }
17835}
17836
17837// Figure out which registers should be reserved for stack access. Only after
17838// the function is legalized do we know all of the non-spill stack objects or if
17839// calls are present.
17840void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17841 MachineRegisterInfo &MRI = MF.getRegInfo();
17842 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17843 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17844 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17845 const SIInstrInfo *TII = ST.getInstrInfo();
17846
17847 if (Info->isEntryFunction()) {
17848 // Callable functions have fixed registers used for stack access.
17849 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17850 }
17851
17852 // TODO: Move this logic to getReservedRegs()
17853 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17854 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17855 Register SReg = ST.isWave32()
17856 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17857 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17858 &AMDGPU::SGPR_64RegClass);
17859 Info->setSGPRForEXECCopy(SReg);
17860
17861 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17862 Info->getStackPtrOffsetReg()));
17863 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17864 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17865
17866 // We need to worry about replacing the default register with itself in case
17867 // of MIR testcases missing the MFI.
17868 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17869 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17870
17871 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17872 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17873
17874 Info->limitOccupancy(MF);
17875
17876 if (ST.isWave32() && !MF.empty()) {
17877 for (auto &MBB : MF) {
17878 for (auto &MI : MBB) {
17879 TII->fixImplicitOperands(MI);
17880 }
17881 }
17882 }
17883
17884 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17885 // classes if required. Ideally the register class constraints would differ
17886 // per-subtarget, but there's no easy way to achieve that right now. This is
17887 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17888 // from using them as the register class for legal types.
17889 if (ST.needsAlignedVGPRs()) {
17890 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17891 const Register Reg = Register::index2VirtReg(I);
17892 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17893 if (!RC)
17894 continue;
17895 int NewClassID = getAlignedAGPRClassID(RC->getID());
17896 if (NewClassID != -1)
17897 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17898 }
17899 }
17900
17901 TargetLoweringBase::finalizeLowering(MF);
17902}
17903
17904void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17905 KnownBits &Known,
17906 const APInt &DemandedElts,
17907 const SelectionDAG &DAG,
17908 unsigned Depth) const {
17909 Known.resetAll();
17910 unsigned Opc = Op.getOpcode();
17911 switch (Opc) {
17912 case ISD::INTRINSIC_WO_CHAIN: {
17913 unsigned IID = Op.getConstantOperandVal(0);
17914 switch (IID) {
17915 case Intrinsic::amdgcn_mbcnt_lo:
17916 case Intrinsic::amdgcn_mbcnt_hi: {
17917 const GCNSubtarget &ST =
17918 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17919 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17920 // most 31 + src1.
17921 Known.Zero.setBitsFrom(
17922 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17923 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17924 Known = KnownBits::add(Known, Known2);
17925 return;
17926 }
17927 }
17928 break;
17929 }
17930 }
17931 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17932 Op, Known, DemandedElts, DAG, Depth);
17933}
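
// Worked example (sketch only, unused by the lowering) of the bound derived
// above: on a wave32 target mbcnt_lo(mask, src1) counts the set bits of
// 'mask' strictly below the current lane, so its own contribution is at most
// 31 and has bits [31:5] known zero before src1's known bits are added in.
LLVM_ATTRIBUTE_UNUSED
static KnownBits mbcntKnownBitsSketch(const KnownBits &Src1Known) {
  KnownBits LaneCount(32);
  LaneCount.Zero.setBitsFrom(5); // the count itself is in [0, 31] on wave32
  return KnownBits::add(LaneCount, Src1Known);
}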
17934
17935void SITargetLowering::computeKnownBitsForFrameIndex(
17936 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17937 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17938
17939 // Set the high bits to zero based on the maximum allowed scratch size per
17940 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17941 // calculation won't overflow, so assume the sign bit is never set.
17942 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17943}
17944
17945static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17946 GISelValueTracking &VT, KnownBits &Known,
17947 unsigned Dim) {
17948 unsigned MaxValue =
17949 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17950 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17951}
17952
17953static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17954 KnownBits &Known, const APInt &DemandedElts,
17955 unsigned BFEWidth, bool SExt, unsigned Depth) {
17956 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
17957 const MachineOperand &Src1 = MI.getOperand(2);
17958
17959 unsigned Src1Cst = 0;
17960 if (Src1.isImm()) {
17961 Src1Cst = Src1.getImm();
17962 } else if (Src1.isReg()) {
17963 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17964 if (!Cst)
17965 return;
17966 Src1Cst = Cst->Value.getZExtValue();
17967 } else {
17968 return;
17969 }
17970
17971 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17972 // Width is always [22:16].
17973 const unsigned Offset =
17974 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17975 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17976
17977 if (Width >= BFEWidth) // Ill-formed.
17978 return;
17979
17980 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17981 Depth + 1);
17982
17983 Known = Known.extractBits(Width, Offset);
17984
17985 if (SExt)
17986 Known = Known.sext(BFEWidth);
17987 else
17988 Known = Known.zext(BFEWidth);
17989}
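
// Sketch only (unused by the lowering) of the src1 immediate layout decoded
// above: the field offset lives in the low 5 (32-bit BFE) or 6 (64-bit BFE)
// bits and the field width starts at bit 16; e.g. extracting 4 bits starting
// at bit 8 from a 32-bit source uses an immediate of (4 << 16) | 8.
LLVM_ATTRIBUTE_UNUSED
static std::pair<unsigned, unsigned> decodeSBFEImmSketch(unsigned Src1Cst,
                                                         unsigned BFEWidth) {
  unsigned Offset =
      Src1Cst & maskTrailingOnes<unsigned>(BFEWidth == 32 ? 5 : 6);
  unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
  return {Offset, Width};
}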
17990
17991void SITargetLowering::computeKnownBitsForTargetInstr(
17992 GISelValueTracking &VT, Register R, KnownBits &Known,
17993 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17994 unsigned Depth) const {
17995 Known.resetAll();
17996 const MachineInstr *MI = MRI.getVRegDef(R);
17997 switch (MI->getOpcode()) {
17998 case AMDGPU::S_BFE_I32:
17999 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18000 /*SExt=*/true, Depth);
18001 case AMDGPU::S_BFE_U32:
18002 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18003 /*SExt=*/false, Depth);
18004 case AMDGPU::S_BFE_I64:
18005 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18006 /*SExt=*/true, Depth);
18007 case AMDGPU::S_BFE_U64:
18008 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18009 /*SExt=*/false, Depth);
18010 case AMDGPU::G_INTRINSIC:
18011 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18012 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18013 switch (IID) {
18014 case Intrinsic::amdgcn_workitem_id_x:
18015 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18016 break;
18017 case Intrinsic::amdgcn_workitem_id_y:
18018 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18019 break;
18020 case Intrinsic::amdgcn_workitem_id_z:
18021 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18022 break;
18023 case Intrinsic::amdgcn_mbcnt_lo:
18024 case Intrinsic::amdgcn_mbcnt_hi: {
18025 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18026 // most 31 + src1.
18027 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18028 ? getSubtarget()->getWavefrontSizeLog2()
18029 : 5);
18030 KnownBits Known2;
18031 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18032 Depth + 1);
18033 Known = KnownBits::add(Known, Known2);
18034 break;
18035 }
18036 case Intrinsic::amdgcn_groupstaticsize: {
18037 // We can report everything over the maximum size as 0. We can't report
18038 // based on the actual size because we don't know if it's accurate or not
18039 // at any given point.
18040 Known.Zero.setHighBits(
18041 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18042 break;
18043 }
18044 }
18045 break;
18046 }
18047 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18048 Known.Zero.setHighBits(24);
18049 break;
18050 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18051 Known.Zero.setHighBits(16);
18052 break;
18053 case AMDGPU::G_AMDGPU_SMED3:
18054 case AMDGPU::G_AMDGPU_UMED3: {
18055 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18056
18057 KnownBits Known2;
18058 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18059 if (Known2.isUnknown())
18060 break;
18061
18062 KnownBits Known1;
18063 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18064 if (Known1.isUnknown())
18065 break;
18066
18067 KnownBits Known0;
18068 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18069 if (Known0.isUnknown())
18070 break;
18071
18072 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18073 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18074 Known.One = Known0.One & Known1.One & Known2.One;
18075 break;
18076 }
18077 }
18078}
18079
18080Align SITargetLowering::computeKnownAlignForTargetInstr(
18081 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18082 unsigned Depth) const {
18083 const MachineInstr *MI = MRI.getVRegDef(R);
18084 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18085 // FIXME: Can this move to generic code? What about the case where the call
18086 // site specifies a lower alignment?
18087 Intrinsic::ID IID = GI->getIntrinsicID();
18088 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18089 AttributeList Attrs =
18090 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18091 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18092 return *RetAlign;
18093 }
18094 return Align(1);
18095}
18096
18097Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18098 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18099 const Align CacheLineAlign = Align(64);
18100
18101 // Pre-GFX10 targets did not benefit from loop alignment.
18102 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18103 getSubtarget()->hasInstFwdPrefetchBug())
18104 return PrefAlign;
18105
18106 // On GFX10 I$ is 4 x 64 bytes cache lines.
18107 // By default prefetcher keeps one cache line behind and reads two ahead.
18108 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18109 // behind and one ahead.
18110 // Therefore we can benefit from aligning loop headers if the loop fits in
18111 // 192 bytes. If the loop fits in 64 bytes it never spans more than two
18112 // cache lines and does not need extra alignment.
18113 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
18114 // prefetch mode; if it is at most 192 bytes we need two lines behind.
18115
18116 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18117 const MachineBasicBlock *Header = ML->getHeader();
18118 if (Header->getAlignment() != PrefAlign)
18119 return Header->getAlignment(); // Already processed.
18120
18121 unsigned LoopSize = 0;
18122 for (const MachineBasicBlock *MBB : ML->blocks()) {
18123 // If an inner loop block is aligned, assume on average half of the
18124 // alignment size is added as nops.
18125 if (MBB != Header)
18126 LoopSize += MBB->getAlignment().value() / 2;
18127
18128 for (const MachineInstr &MI : *MBB) {
18129 LoopSize += TII->getInstSizeInBytes(MI);
18130 if (LoopSize > 192)
18131 return PrefAlign;
18132 }
18133 }
18134
18135 if (LoopSize <= 64)
18136 return PrefAlign;
18137
18138 if (LoopSize <= 128)
18139 return CacheLineAlign;
18140
18141 // If any parent loop is already surrounded by prefetch instructions, do not
18142 // insert new ones for the inner loop, which would reset the parent's settings.
18143 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18144 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18145 auto I = Exit->getFirstNonDebugInstr();
18146 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18147 return CacheLineAlign;
18148 }
18149 }
18150
18151 MachineBasicBlock *Pre = ML->getLoopPreheader();
18152 MachineBasicBlock *Exit = ML->getExitBlock();
18153
18154 if (Pre && Exit) {
18155 auto PreTerm = Pre->getFirstTerminator();
18156 if (PreTerm == Pre->begin() ||
18157 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18158 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18159 .addImm(1); // prefetch 2 lines behind PC
18160
18161 auto ExitHead = Exit->getFirstNonDebugInstr();
18162 if (ExitHead == Exit->end() ||
18163 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18164 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18165 .addImm(2); // prefetch 1 line behind PC
18166 }
18167
18168 return CacheLineAlign;
18169}
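
// Condensed form of the policy above (sketch only, unused by the lowering),
// ignoring the S_INST_PREFETCH insertion itself: loops of at most 64 bytes or
// more than 192 bytes keep the generic preferred alignment, everything in
// between gets aligned to a 64-byte I$ line.
LLVM_ATTRIBUTE_UNUSED
static Align prefLoopAlignmentSketch(unsigned LoopSizeInBytes,
                                     Align GenericPrefAlign) {
  if (LoopSizeInBytes <= 64 || LoopSizeInBytes > 192)
    return GenericPrefAlign;
  return Align(64);
}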
18170
18172static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18173 assert(N->getOpcode() == ISD::CopyFromReg);
18174 do {
18175 // Follow the chain until we find an INLINEASM node.
18176 N = N->getOperand(0).getNode();
18177 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18178 return true;
18179 } while (N->getOpcode() == ISD::CopyFromReg);
18180 return false;
18181}
18182
18183bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18184 FunctionLoweringInfo *FLI,
18185 UniformityInfo *UA) const {
18186 switch (N->getOpcode()) {
18187 case ISD::CopyFromReg: {
18188 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18189 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18190 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18191 Register Reg = R->getReg();
18192
18193 // FIXME: Why does this need to consider isLiveIn?
18194 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18195 return !TRI->isSGPRReg(MRI, Reg);
18196
18197 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18198 return UA->isDivergent(V);
18199
18200 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18201 return !TRI->isSGPRReg(MRI, Reg);
18202 }
18203 case ISD::LOAD: {
18204 const LoadSDNode *L = cast<LoadSDNode>(N);
18205 unsigned AS = L->getAddressSpace();
18206 // A flat load may access private memory.
18207 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18208 }
18209 case ISD::CALLSEQ_END:
18210 return true;
18211 case ISD::INTRINSIC_WO_CHAIN:
18212 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18213 case ISD::INTRINSIC_W_CHAIN:
18214 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18233 // Target-specific read-modify-write atomics are sources of divergence.
18234 return true;
18235 default:
18236 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18237 // Generic read-modify-write atomics are sources of divergence.
18238 return A->readMem() && A->writeMem();
18239 }
18240 return false;
18241 }
18242}
18243
18244bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18245 EVT VT) const {
18246 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18247 case MVT::f32:
18248 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18249 case MVT::f64:
18250 case MVT::f16:
18251 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18252 default:
18253 return false;
18254 }
18255}
18256
18257bool SITargetLowering::denormalsEnabledForType(
18258 LLT Ty, const MachineFunction &MF) const {
18259 switch (Ty.getScalarSizeInBits()) {
18260 case 32:
18261 return !denormalModeIsFlushAllF32(MF);
18262 case 64:
18263 case 16:
18264 return !denormalModeIsFlushAllF64F16(MF);
18265 default:
18266 return false;
18267 }
18268}
18269
18270bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18271 const APInt &DemandedElts,
18272 const SelectionDAG &DAG,
18273 bool SNaN,
18274 unsigned Depth) const {
18275 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18276 const MachineFunction &MF = DAG.getMachineFunction();
18277 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18278
18279 if (Info->getMode().DX10Clamp)
18280 return true; // Clamped to 0.
18281 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18282 }
18283
18284 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18285 DAG, SNaN, Depth);
18286}
18287
18288// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18289// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18290static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18291 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18292 return true;
18293
18294 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18295 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18296 if (DenormMode == DenormalMode::getPreserveSign())
18297 return true;
18298
18299 // TODO: Remove this.
18300 return RMW->getFunction()
18301 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18302 .getValueAsBool();
18303}
18304
18305static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18306 LLVMContext &Ctx = RMW->getContext();
18307 StringRef MemScope =
18308 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18309
18310 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18311 << "Hardware instruction generated for atomic "
18312 << RMW->getOperationName(RMW->getOperation())
18313 << " operation at memory scope " << MemScope;
18314}
18315
18316static bool isV2F16OrV2BF16(Type *Ty) {
18317 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18318 Type *EltTy = VT->getElementType();
18319 return VT->getNumElements() == 2 &&
18320 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18321 }
18322
18323 return false;
18324}
18325
18326static bool isV2F16(Type *Ty) {
18327 auto *VT = dyn_cast<FixedVectorType>(Ty);
18328 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18329}
18330
18331static bool isV2BF16(Type *Ty) {
18332 auto *VT = dyn_cast<FixedVectorType>(Ty);
18333 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18334}
18335
18336/// \return true if atomicrmw integer ops work for the type.
18337static bool isAtomicRMWLegalIntTy(Type *Ty) {
18338 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18339 unsigned BW = IT->getBitWidth();
18340 return BW == 32 || BW == 64;
18341 }
18342
18343 return false;
18344}
18345
18346/// \return true if this atomicrmw xchg type can be selected.
18347static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18348 Type *Ty = RMW->getType();
18349 if (isAtomicRMWLegalIntTy(Ty))
18350 return true;
18351
18352 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18353 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18354 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18355 return BW == 32 || BW == 64;
18356 }
18357
18358 if (Ty->isFloatTy() || Ty->isDoubleTy())
18359 return true;
18360
18361 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18362 return VT->getNumElements() == 2 &&
18363 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18364 }
18365
18366 return false;
18367}
18368
18369/// \returns true if it's valid to emit a native instruction for \p RMW, based
18370/// on the properties of the target memory.
18371static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18372 const AtomicRMWInst *RMW,
18373 bool HasSystemScope) {
18374 // The remote/fine-grained access logic is different from the integer
18375 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18376 // fine-grained access does not work, even for a device local allocation.
18377 //
18378 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18379 // allocations work.
18380 if (HasSystemScope) {
18381 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18382 RMW->hasMetadata("amdgpu.no.remote.memory"))
18383 return true;
18384 if (Subtarget.hasEmulatedSystemScopeAtomics())
18385 return true;
18386 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18387 return true;
18388
18389 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18390}
18391
18392/// \return Action to perform on AtomicRMWInsts for integer operations.
18393static TargetLowering::AtomicExpansionKind
18394atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18395 return isAtomicRMWLegalIntTy(RMW->getType())
18396 ? TargetLowering::AtomicExpansionKind::None
18397 : TargetLowering::AtomicExpansionKind::CmpXChg;
18398}
18399
18400/// Return if a flat address space atomicrmw can access private memory.
18401static bool flatInstrMayAccessPrivate(const Instruction *I) {
18402 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18403 return !MD ||
18405}
18406
18414
18415TargetLowering::AtomicExpansionKind
18416SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18417 unsigned AS = RMW->getPointerAddressSpace();
18418 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18420
18421 // 64-bit flat atomics that dynamically reside in private memory will silently
18422 // be dropped.
18423 //
18424 // Note that we will emit a new copy of the original atomic in the expansion,
18425 // which will be incrementally relegalized.
18426 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18427 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18428 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18431
18432 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18434 ORE.emit([=]() {
18435 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18436 });
18437 return Kind;
18438 };
18439
18440 auto SSID = RMW->getSyncScopeID();
18441 bool HasSystemScope =
18442 SSID == SyncScope::System ||
18443 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18444
18445 auto Op = RMW->getOperation();
18446 switch (Op) {
18448 // PCIe supports add and xchg for system atomics.
18449 return isAtomicRMWLegalXChgTy(RMW)
18452 case AtomicRMWInst::Add:
18453 // PCIe supports add and xchg for system atomics.
18455 case AtomicRMWInst::Sub:
18456 case AtomicRMWInst::And:
18457 case AtomicRMWInst::Or:
18458 case AtomicRMWInst::Xor:
18459 case AtomicRMWInst::Max:
18460 case AtomicRMWInst::Min:
18467 if (Subtarget->hasEmulatedSystemScopeAtomics())
18469
18470 // On most subtargets, for atomicrmw operations other than add/xchg,
18471 // whether or not the instructions will behave correctly depends on where
18472 // the address physically resides and what interconnect is used in the
18473 // system configuration. On some targets the instruction will nop,
18474 // and in others synchronization will only occur at degraded device scope.
18475 //
18476 // If the allocation is known local to the device, the instructions should
18477 // work correctly.
18478 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18480
18481 // If fine-grained remote memory works at device scope, we don't need to
18482 // do anything.
18483 if (!HasSystemScope &&
18484 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18486
18487 // If we are targeting a remote allocated address, it depends what kind of
18488 // allocation the address belongs to.
18489 //
18490 // If the allocation is fine-grained (in host memory, or in PCIe peer
18491 // device memory), the operation will fail depending on the target.
18492 //
18493 // Note fine-grained host memory access does work on APUs or if XGMI is
18494 // used, but we do not know if we are targeting an APU or the system
18495 // configuration from the ISA version/target-cpu.
18496 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18498
18501 // Atomic sub/or/xor do not work over PCI express, but atomic add
18502 // does. InstCombine transforms these with 0 to or, so undo that.
18503 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18504 ConstVal && ConstVal->isNullValue())
18506 }
18507
18508 // If the allocation could be in remote, fine-grained memory, the rmw
18509 // instructions may fail. cmpxchg should work, so emit that. On some
18510 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18511 // even work, so you're out of luck anyway.
18512
18513 // In summary:
18514 //
18515 // Cases that may fail:
18516 // - fine-grained pinned host memory
18517 // - fine-grained migratable host memory
18518 // - fine-grained PCIe peer device
18519 //
18520 // Cases that should work, but may be treated overly conservatively.
18521 // - fine-grained host memory on an APU
18522 // - fine-grained XGMI peer device
18524 }
18525
18527 }
18528 case AtomicRMWInst::FAdd: {
18529 Type *Ty = RMW->getType();
18530
18531 // TODO: Handle REGION_ADDRESS
18532 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18533 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18534 // is fixed to round-to-nearest-even.
18535 //
18536 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18537 // round-to-nearest-even.
18538 //
18539 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18540 // suggests it is OK if the floating-point mode may not match the calling
18541 // thread.
18542 if (Ty->isFloatTy()) {
18543 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18545 }
18546
18547 if (Ty->isDoubleTy()) {
18548 // Ignores denormal mode, but we don't consider flushing mandatory.
18549 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18551 }
18552
18553 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18555
18557 }
18558
18559 // LDS atomics respect the denormal mode from the mode register.
18560 //
18561 // Traditionally f32 global/buffer memory atomics would unconditionally
18562 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18563 // flush.
18564 //
18565 // On targets with flat atomic fadd, denormals would flush depending on
18566 // whether the target address resides in LDS or global memory. We consider
18567 // this flat-maybe-flush as will-flush.
18568 if (Ty->isFloatTy() &&
18569 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18572
18573 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18574 // safe. The message phrasing also should be better.
18575 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18576 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18577 // gfx942, gfx12
18578 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18579 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18580 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18581 // gfx90a, gfx942, gfx12
18582 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18583 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18584
18585 // gfx942, gfx12
18586 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18587 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18588 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18589 // gfx90a, gfx942, gfx12
18590 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18591 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18592
18593 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18594 // buffer. gfx12 does have the buffer version.
18595 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18596 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18597 }
18598
18599 // global and flat atomic fadd f64: gfx90a, gfx942.
18600 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18601 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18602
18603 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18604 if (Ty->isFloatTy()) {
18605 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18606 // gfx11+.
18607 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18608 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18609 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18610 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18611 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18612 } else {
18613 // gfx908
18614 if (RMW->use_empty() &&
18615 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18616 isV2F16(Ty))
18617 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18618 }
18619 }
18620
18621 // flat atomic fadd f32: gfx942, gfx11+.
18622 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18623 if (Subtarget->hasFlatAtomicFaddF32Inst())
18624 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18625
18626 // If it is in flat address space, and the type is float, we will try to
18627 // expand it, if the target supports global and lds atomic fadd. The
18628 // reason we need that is, in the expansion, we emit the check of
18629 // address space. If it is in global address space, we emit the global
18630 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18631 // fadd.
18632 if (Subtarget->hasLDSFPAtomicAddF32()) {
18633 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18635 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18637 }
18638 }
18639 }
18640
18642 }
18644 case AtomicRMWInst::FMax: {
18645 Type *Ty = RMW->getType();
18646
18647 // LDS float and double fmin/fmax were always supported.
18648 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18649 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18651 }
18652
18653 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18654 // For flat and global cases:
18655 // float, double in gfx7. Manual claims denormal support.
18656 // Removed in gfx8.
18657 // float, double restored in gfx10.
18658 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18659 //
18660 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18661 // no f32.
18662 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18663 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18664 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18665 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18666 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18667 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18669 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18670 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18671 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18672 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18673 }
18674 }
18675
18677 }
18680 default:
18682 }
18683
18684 llvm_unreachable("covered atomicrmw op switch");
18685}
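
// Example of the metadata this hook keys on, in illustrative IR (not emitted
// by this file). A frontend or runtime that can guarantee the destination is
// neither fine-grained nor remote memory may tag the operation so it is
// selected as a native instruction instead of being expanded:
//
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//            monotonic, !amdgpu.no.fine.grained.memory !0,
//            !amdgpu.no.remote.memory !0
//   ...
//   !0 = !{}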
18686
18693
18700
18701TargetLowering::AtomicExpansionKind
18702SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18703 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18704 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18706
18707 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18708 return AtomicExpansionKind::None;
18709
18710 const DataLayout &DL = CmpX->getDataLayout();
18711
18712 Type *ValTy = CmpX->getNewValOperand()->getType();
18713
18714 // If a 64-bit flat atomic may alias private, we need to avoid using the
18715 // atomic in the private case.
18716 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18718}
18719
18720const TargetRegisterClass *
18721SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18722 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18723 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18724 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18725 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18726 : &AMDGPU::SReg_32RegClass;
18727 if (!TRI->isSGPRClass(RC) && !isDivergent)
18728 return TRI->getEquivalentSGPRClass(RC);
18729 if (TRI->isSGPRClass(RC) && isDivergent)
18730 return TRI->getEquivalentVGPRClass(RC);
18731
18732 return RC;
18733}
18734
18735// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18736// uniform values (as produced by the mask results of control flow intrinsics)
18737// used outside of divergent blocks. The phi users need to also be treated as
18738// always uniform.
18739//
18740// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18741static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18742 unsigned WaveSize) {
18743 // FIXME: We assume we never cast the mask results of a control flow
18744 // intrinsic.
18745 // Early exit if the type won't be consistent as a compile time hack.
18746 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18747 if (!IT || IT->getBitWidth() != WaveSize)
18748 return false;
18749
18750 if (!isa<Instruction>(V))
18751 return false;
18752 if (!Visited.insert(V).second)
18753 return false;
18754 bool Result = false;
18755 for (const auto *U : V->users()) {
18756 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18757 if (V == U->getOperand(1)) {
18758 switch (Intrinsic->getIntrinsicID()) {
18759 default:
18760 Result = false;
18761 break;
18762 case Intrinsic::amdgcn_if_break:
18763 case Intrinsic::amdgcn_if:
18764 case Intrinsic::amdgcn_else:
18765 Result = true;
18766 break;
18767 }
18768 }
18769 if (V == U->getOperand(0)) {
18770 switch (Intrinsic->getIntrinsicID()) {
18771 default:
18772 Result = false;
18773 break;
18774 case Intrinsic::amdgcn_end_cf:
18775 case Intrinsic::amdgcn_loop:
18776 Result = true;
18777 break;
18778 }
18779 }
18780 } else {
18781 Result = hasCFUser(U, Visited, WaveSize);
18782 }
18783 if (Result)
18784 break;
18785 }
18786 return Result;
18787}
18788
18790 const Value *V) const {
18791 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18792 if (CI->isInlineAsm()) {
18793 // FIXME: This cannot give a correct answer. This should only trigger in
18794 // the case where inline asm returns mixed SGPR and VGPR results, used
18795 // outside the defining block. We don't have a specific result to
18796 // consider, so this assumes if any value is SGPR, the overall register
18797 // also needs to be SGPR.
18798 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18800 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18801 for (auto &TC : TargetConstraints) {
18802 if (TC.Type == InlineAsm::isOutput) {
18804 const TargetRegisterClass *RC =
18805 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18806 TC.ConstraintVT)
18807 .second;
18808 if (RC && SIRI->isSGPRClass(RC))
18809 return true;
18810 }
18811 }
18812 }
18813 }
18814 SmallPtrSet<const Value *, 16> Visited;
18815 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18816}
18817
18818bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18819 for (SDUse &Use : N->uses()) {
18820 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18821 if (getBasePtrIndex(M) == Use.getOperandNo())
18822 return true;
18823 }
18824 }
18825 return false;
18826}
18827
18828bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18829 SDValue N1) const {
18830 if (!N0.hasOneUse())
18831 return false;
18832 // Take care of the opportunity to keep N0 uniform
18833 if (N0->isDivergent() || !N1->isDivergent())
18834 return true;
18835 // Check if we have a good chance to form the memory access pattern with the
18836 // base and offset
18837 return (DAG.isBaseWithConstantOffset(N0) &&
18839}
18840
18841bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18842 Register N0, Register N1) const {
18843 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18844}
18845
18846MachineMemOperand::Flags
18847SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
18848 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18849 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
18850 if (I.getMetadata("amdgpu.noclobber"))
18851 Flags |= MONoClobber;
18852 if (I.getMetadata("amdgpu.last.use"))
18853 Flags |= MOLastUse;
18854 return Flags;
18855}
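
// Example of the annotations propagated above, in illustrative IR (attached
// earlier in the pipeline, e.g. by AMDGPUAnnotateUniformValues for the
// no-clobber case):
//
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
//   %w = load i32, ptr addrspace(1) %q, align 4, !amdgpu.last.use !0
//   !0 = !{}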
18856
18858 Instruction *AI) const {
18859 // Given: atomicrmw fadd ptr %addr, float %val ordering
18860 //
18861 // With this expansion we produce the following code:
18862 // [...]
18863 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18864 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18865 //
18866 // atomicrmw.shared:
18867 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18868 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18869 // float %val ordering
18870 // br label %atomicrmw.phi
18871 //
18872 // atomicrmw.check.private:
18873 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18874 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18875 //
18876 // atomicrmw.private:
18877 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18878 // %loaded.private = load float, ptr addrspace(5) %cast.private
18879 // %val.new = fadd float %loaded.private, %val
18880 // store float %val.new, ptr addrspace(5) %cast.private
18881 // br label %atomicrmw.phi
18882 //
18883 // atomicrmw.global:
18884 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18885 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18886 // float %val ordering
18887 // br label %atomicrmw.phi
18888 //
18889 // atomicrmw.phi:
18890 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18891 // [ %loaded.private, %atomicrmw.private ],
18892 // [ %loaded.global, %atomicrmw.global ]
18893 // br label %atomicrmw.end
18894 //
18895 // atomicrmw.end:
18896 // [...]
18897 //
18898 //
18899 // For 64-bit atomics which may reside in private memory, we perform a simpler
18900 // version that only inserts the private check, and uses the flat operation.
18901
18902 IRBuilder<> Builder(AI);
18903 LLVMContext &Ctx = Builder.getContext();
18904
18905 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18906 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18907 : AtomicCmpXchgInst::getPointerOperandIndex();
18908 Value *Addr = AI->getOperand(PtrOpIdx);
18909
18910 /// TODO: Only need to check private, then emit flat-known-not private (no
18911 /// need for shared block, or cast to global).
18912 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18913
18914 Align Alignment;
18915 if (RMW)
18916 Alignment = RMW->getAlign();
18917 else if (CX)
18918 Alignment = CX->getAlign();
18919 else
18920 llvm_unreachable("unhandled atomic operation");
18921
18922 // FullFlatEmulation is true if we need to issue the private, shared, and
18923 // global cases.
18924 //
18925 // If this is false, we are only dealing with the flat-targeting-private case,
18926 // where we only insert a check for private and still use the flat instruction
18927 // for global and shared.
18928
18929 bool FullFlatEmulation =
18930 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18931 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18932 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18933 RMW->getType()->isDoubleTy()));
18934
18935 // If the return value isn't used, do not introduce a false use in the phi.
18936 bool ReturnValueIsUsed = !AI->use_empty();
18937
18938 BasicBlock *BB = Builder.GetInsertBlock();
18939 Function *F = BB->getParent();
18940 BasicBlock *ExitBB =
18941 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18942 BasicBlock *SharedBB = nullptr;
18943
18944 BasicBlock *CheckPrivateBB = BB;
18945 if (FullFlatEmulation) {
18946 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18947 CheckPrivateBB =
18948 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18949 }
18950
18951 BasicBlock *PrivateBB =
18952 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18953 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18954 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18955
18956 std::prev(BB->end())->eraseFromParent();
18957 Builder.SetInsertPoint(BB);
18958
18959 Value *LoadedShared = nullptr;
18960 if (FullFlatEmulation) {
18961 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18962 {Addr}, nullptr, "is.shared");
18963 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18964 Builder.SetInsertPoint(SharedBB);
18965 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18967
18968 Instruction *Clone = AI->clone();
18969 Clone->insertInto(SharedBB, SharedBB->end());
18970 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18971 LoadedShared = Clone;
18972
18973 Builder.CreateBr(PhiBB);
18974 Builder.SetInsertPoint(CheckPrivateBB);
18975 }
18976
18977 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18978 {Addr}, nullptr, "is.private");
18979 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18980
18981 Builder.SetInsertPoint(PrivateBB);
18982
18983 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18985
18986 Value *LoadedPrivate;
18987 if (RMW) {
18988 LoadedPrivate = Builder.CreateAlignedLoad(
18989 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18990
18991 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
18992 LoadedPrivate, RMW->getValOperand());
18993
18994 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18995 } else {
18996 auto [ResultLoad, Equal] =
18997 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
18998 CX->getNewValOperand(), CX->getAlign());
18999
19000 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19001 ResultLoad, 0);
19002 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19003 }
19004
19005 Builder.CreateBr(PhiBB);
19006
19007 Builder.SetInsertPoint(GlobalBB);
19008
19009 // Continue using a flat instruction if we only emitted the check for private.
19010 Instruction *LoadedGlobal = AI;
19011 if (FullFlatEmulation) {
19012 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19014 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19015 }
19016
19017 AI->removeFromParent();
19018 AI->insertInto(GlobalBB, GlobalBB->end());
19019
19020 // The new atomicrmw may go through another round of legalization later.
19021 if (!FullFlatEmulation) {
19022 // We inserted the runtime check already, make sure we do not try to
19023 // re-expand this.
19024 // TODO: Should union with any existing metadata.
19025 MDBuilder MDB(F->getContext());
19026 MDNode *RangeNotPrivate =
19029 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19030 RangeNotPrivate);
19031 }
19032
19033 Builder.CreateBr(PhiBB);
19034
19035 Builder.SetInsertPoint(PhiBB);
19036
19037 if (ReturnValueIsUsed) {
19038 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19039 AI->replaceAllUsesWith(Loaded);
19040 if (FullFlatEmulation)
19041 Loaded->addIncoming(LoadedShared, SharedBB);
19042 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19043 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19044 Loaded->takeName(AI);
19045 }
19046
19047 Builder.CreateBr(ExitBB);
19048}
19049
19050static void convertScratchAtomicToFlatAtomic(Instruction *I,
19051 unsigned PtrOpIdx) {
19052 Value *PtrOp = I->getOperand(PtrOpIdx);
19055
19056 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19057 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19058 I->getIterator());
19059 I->setOperand(PtrOpIdx, ASCast);
19060}
19061
19064
19067
19070 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19071 ConstVal && ConstVal->isNullValue()) {
19072 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19073 AI->setOperation(AtomicRMWInst::Add);
19074
19075 // We may still need the private-alias-flat handling below.
19076
19077 // TODO: Skip this for cases where we cannot access remote memory.
19078 }
19079 }
19080
19081 // The non-flat expansions should only perform the de-canonicalization of
19082 // identity values.
19084 return;
19085
19087}
19088
19095
19099
19101 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19102}
19103
19105 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19106 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19107
19109 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19110}
19111
19112LoadInst *
19113SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19114 IRBuilder<> Builder(AI);
19115 auto Order = AI->getOrdering();
19116
19117 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19118 // cache must be flushed if the atomic ordering had release semantics. This is
19119 // not necessarily a fence; a release fence just happens to perform that flush.
19120 // Avoid replacing an atomicrmw that has release semantics.
19121 if (isReleaseOrStronger(Order))
19122 return nullptr;
19123
19124 LoadInst *LI = Builder.CreateAlignedLoad(
19125 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19126 LI->setAtomic(Order, AI->getSyncScopeID());
19127 LI->copyMetadata(*AI);
19128 LI->takeName(AI);
19129 AI->replaceAllUsesWith(LI);
19130 AI->eraseFromParent();
19131 return LI;
19132}
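
// Example of the rewrite performed above, in illustrative IR, assuming an
// acquire ordering (release or stronger orderings are left untouched):
//
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 syncscope("agent") acquire
//
// becomes
//
//   %old = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire,
//            align 4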
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns whether it is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
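The APFloat factories listed above (getQNaN, getInf, getZero, getLargest) together with bitcastToAPInt are the usual way lowering code materializes specific floating-point bit patterns. A minimal sketch, assuming IEEE single precision; the helper names below are illustrative and not part of this file:
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;
// Bit pattern of an f32 quiet NaN (0x7fc00000).
static uint32_t f32QNaNBits() {
  APFloat QNaN = APFloat::getQNaN(APFloat::IEEEsingle());
  return QNaN.bitcastToAPInt().getZExtValue();
}
// Bit pattern of positive or negative infinity in f32.
static uint32_t f32InfBits(bool Negative) {
  APFloat Inf = APFloat::getInf(APFloat::IEEEsingle(), Negative);
  return Inf.bitcastToAPInt().getZExtValue();
}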
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
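A minimal sketch of the APInt mask utilities listed above (getHighBitsSet, getBitsSet, countr_zero, isSignMask); the widths, values, and helper name are illustrative only:
#include "llvm/ADT/APInt.h"
using namespace llvm;
static void apintMaskDemo() {
  // Top 8 bits of a 32-bit value: 0xFF000000.
  APInt HighMask = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/8);
  unsigned TrailingZeros = HighMask.countr_zero(); // 24
  bool TopBitOnly = HighMask.isSignMask();         // false: more than one bit set
  // Bits [8, 16) set: 0x0000FF00.
  APInt MidMask = APInt::getBitsSet(32, /*loBit=*/8, /*hiBit=*/16);
  (void)TrailingZeros; (void)TopBitOnly; (void)MidMask;
}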
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory operated on by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory operated on by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
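A minimal sketch of dispatching on AtomicRMWInst::getOperation(), in the style a hook such as shouldExpandAtomicRMWInIR might use; the predicate below is illustrative and not this target's actual policy:
#include "llvm/IR/Instructions.h"
using namespace llvm;
// True for the floating-point atomicrmw flavours; everything else is integer.
static bool isFloatingPointRMW(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    return true;
  default:
    return false;
  }
}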
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:370
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
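A minimal sketch of the usual CCState/CCValAssign flow: analyze the formal arguments with a CCAssignFn, then walk the assigned locations. CC_Hypothetical is a placeholder for a real assign function, not an AMDGPU one:
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/IR/Function.h"
using namespace llvm;
static void walkFormalArgs(MachineFunction &MF, CallingConv::ID CC,
                           bool IsVarArg,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
                           CCAssignFn *CC_Hypothetical) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CC_Hypothetical);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();       // argument was assigned to a register
    else
      (void)VA.getLocMemOffset(); // argument was assigned a stack slot
  }
}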
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
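A minimal sketch of querying a function's denormal handling through the Function accessors listed above; whether a caller should key off getPreserveSign() is target policy and only assumed here:
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/Function.h"
using namespace llvm;
// Does this function flush f32 denormals to zero while preserving the sign?
static bool f32DenormalsPreserveSign(const Function &F) {
  DenormalMode Mode = F.getDenormalMode(APFloat::IEEEsingle());
  return Mode == DenormalMode::getPreserveSign();
}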
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2783
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
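A minimal sketch of the LLT constructors listed above; the address space and sizes are arbitrary examples:
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
static void lltDemo() {
  LLT S64 = LLT::scalar(64);                                    // plain 64-bit scalar
  LLT P1  = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64); // 64-bit pointer in AS 1
  (void)S64.getSizeInBits();                                    // TypeSize of 64 bits
  (void)P1.getSizeInBits();
}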
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
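A minimal sketch of the MVT helpers listed above, building v4i32 from its scalar type and querying it:
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;
static void mvtDemo() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  MVT EltVT = V4I32.getScalarType();               // MVT::i32
  bool Pow2 = V4I32.isPow2VectorType();            // true
  (void)NumElts; (void)EltVT; (void)Pow2;
}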
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
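A minimal sketch of chaining the MachineInstrBuilder operand helpers; Desc would normally come from TII->get(...) and the registers are assumed to be virtual:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Emit "DstReg = <Desc> SrcReg, Imm" immediately before iterator I.
static void emitTwoOperandOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             const MCInstrDesc &Desc, Register DstReg,
                             Register SrcReg, int64_t Imm) {
  BuildMI(MBB, I, DL, Desc, DstReg)
      .addReg(SrcReg) // register operand
      .addImm(Imm);   // immediate operand
}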
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
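A minimal sketch of allocating a MachineMemOperand for a 32-bit load via the MachineFunction::getMachineMemOperand overload listed earlier; the alignment and memory type here are assumptions:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
// Describe a naturally aligned 32-bit load at PtrInfo.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                 LLT::scalar(32), Align(4));
}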
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns whether it is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
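getStore, getObjectPtrOffset and getTokenFactor are usually used together when one value is written out as several parts. A minimal sketch (hypothetical helper; the 4-byte offset and alignment are chosen only for illustration):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: store the two i32 halves of a value at Ptr and Ptr+4,
// then merge the independent store chains with a TokenFactor.
static SDValue storePair(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                         SDValue Lo, SDValue Hi, SDValue Ptr) {
  SDValue S0 = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(), Align(4));
  SDValue HiPtr = DAG.getObjectPtrOffset(DL, Ptr, TypeSize::getFixed(4));
  SDValue S1 =
      DAG.getStore(Chain, DL, Hi, HiPtr, MachinePointerInfo(), Align(4));
  SmallVector<SDValue, 2> Chains = {S0, S1};
  return DAG.getTokenFactor(DL, Chains);
}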
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
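A sketch contrasting getSplatBuildVector with getBuildVector (hypothetical helper; it assumes Elts holds exactly four f32 values so that MVT::v4f32 is the correct result type):

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: prefer the splat form when all elements are identical.
static SDValue buildV4F32(SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Elts) {
  if (all_of(Elts, [&](SDValue V) { return V == Elts[0]; }))
    return DAG.getSplatBuildVector(MVT::v4f32, DL, Elts[0]);
  return DAG.getBuildVector(MVT::v4f32, DL, Elts);
}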
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
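computeKnownBits and MaskedValueIsZero answer the same kind of question from two angles. A minimal sketch (hypothetical helper, assuming V is an i32 value):

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Hypothetical helper: is the i32 value V known to fit in its low 16 bits?
static bool fitsInLow16(SelectionDAG &DAG, SDValue V) {
  KnownBits Known = DAG.computeKnownBits(V);
  if (Known.countMinLeadingZeros() >= 16)
    return true;
  // The same fact phrased as a mask query on the high half.
  return DAG.MaskedValueIsZero(V, APInt::getHighBitsSet(32, 16));
}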
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
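SplitScalar plus ISD::BUILD_PAIR is the usual round trip for touching only one half of a 64-bit value. A minimal sketch (hypothetical helper, assuming V has type i64):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: split an i64 into its i32 halves and reassemble it.
static SDValue roundTripI64(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  auto [Lo, Hi] = DAG.SplitScalar(V, DL, MVT::i32, MVT::i32);
  // A caller would normally rewrite Lo or Hi here before rebuilding the pair.
  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
}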
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
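StringSwitch is the usual way short textual keys, such as inline-asm constraint letters, are classified. A sketch (the mapping below is illustrative only and not the constraint table this target actually uses):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Hypothetical classifier for single-letter register constraints.
static int classifyConstraint(StringRef C) {
  return StringSwitch<int>(C)
      .Case("v", 0)   // vector register
      .Case("s", 1)   // scalar register
      .Case("a", 2)   // accumulator register
      .Default(-1);   // unrecognized
}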
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
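These TargetLoweringBase setters normally run inside a target's TargetLowering constructor, as the constructor at the top of this file does. The fragment below is only a schematic example: MyTargetLowering, MySubtarget and MyTarget::GPR32RegClass are invented names, and the specific actions are illustrative rather than taken from this file.

// Fragment of a hypothetical TargetLowering subclass constructor; these
// setters are protected members of TargetLoweringBase, so they must be
// called from within the subclass.
MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const MySubtarget &STI)
    : TargetLowering(TM) {
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);
  setOperationAction(ISD::ADD, MVT::i32, Legal);    // natively supported
  setOperationAction(ISD::SDIV, MVT::i32, Expand);  // expand to a sequence
  setOperationAction(ISD::FADD, MVT::f64, Custom);  // handled in LowerOperation
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);  // no truncating i64->i32 store
  computeRegisterProperties(STI.getRegisterInfo());
}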
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
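expandUnalignedLoad is typically the fallback inside a custom load-lowering hook once allowsMemoryAccessForAlignment has rejected the access. A hedged sketch (MyTargetLowering and the hook name lowerUnalignedLoad are hypothetical):

// Fragment of a hypothetical custom lowering hook for loads.
SDValue MyTargetLowering::lowerUnalignedLoad(LoadSDNode *LD,
                                             SelectionDAG &DAG) const {
  if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                     LD->getMemoryVT(), LD->getAddressSpace(),
                                     LD->getAlign(),
                                     LD->getMemOperand()->getFlags()))
    return SDValue(); // the access is fine as-is; nothing to rewrite
  // Otherwise split into two half-width loads plus their merged chain.
  auto [Value, Chain] = expandUnalignedLoad(LD, DAG);
  return DAG.getMergeValues({Value, Chain}, SDLoc(LD));
}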
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op and, using the DemandedBits/DemandedElts information, try to simplify it; known bit values are reported through Known and any replacement node through TLO.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT using round-to-odd (the low bit is set if the result is inexact), so that a later rounding to an even narrower type does not double-round.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
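Lowering decisions in this file frequently branch on these AMDGPUAS enumerators. A minimal sketch (the helper name is invented; the enumerators come from llvm/Support/AMDGPUAddrSpace.h, which the target's AMDGPU.h pulls in):

#include "llvm/Support/AMDGPUAddrSpace.h"

// Hypothetical predicate: is this an address space accessed with DS
// instructions (LDS or GDS)?
static bool isDSAddressSpace(unsigned AS) {
  return AS == llvm::AMDGPUAS::LOCAL_ADDRESS ||
         AS == llvm::AMDGPUAS::REGION_ADDRESS;
}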
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
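The AMDGPU calling-convention predicates listed above gate much of the argument-lowering logic in this file. A hedged sketch (the helper name is invented, and the target-internal include path is an assumption):

#include "Utils/AMDGPUBaseInfo.h" // target-internal header, path assumed
#include "llvm/IR/Function.h"

// Hypothetical predicate: is F a compute entry point (kernel or compute
// shader) rather than a callable function or graphics shader?
static bool isComputeEntry(const llvm::Function &F) {
  llvm::CallingConv::ID CC = F.getCallingConv();
  return llvm::AMDGPU::isEntryFunctionCC(CC) && !llvm::AMDGPU::isGraphics(CC);
}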
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same as [SU]ADDO, but for subtraction (SSUBO/USUBO).
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same as [SU]ADDO, but for multiplication (SMULO/UMULO).
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
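The integer helpers indexed above (Lo_32, Hi_32, alignTo, isPowerOf2_32 and friends) appear throughout this file when immediates are split or sizes are padded. A self-contained sketch with illustrative values:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Illustrative use of the MathExtras / Alignment helpers.
static void splitAndPad(uint64_t Imm, uint64_t Bytes) {
  uint32_t Lo = llvm::Lo_32(Imm);                           // bits [31:0]
  uint32_t Hi = llvm::Hi_32(Imm);                           // bits [63:32]
  uint64_t Padded = llvm::alignTo(Bytes, llvm::Align(16));  // round up to 16
  bool IsPow2 = llvm::isPowerOf2_32(static_cast<uint32_t>(Bytes));
  (void)Lo; (void)Hi; (void)Padded; (void)IsPow2;
}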
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
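A short sketch of the EVT queries above, in the shape of the vector-splitting decisions made during legalization (the 128-bit threshold and the helper name are illustrative, not this file's actual policy):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical policy: split fixed vectors wider than 128 bits in half.
static bool shouldSplitVector(EVT VT, LLVMContext &Ctx) {
  if (!VT.isVector() || VT.getSizeInBits().getFixedValue() <= 128)
    return false;
  EVT HalfVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);
  return HalfVT.isSimple(); // only split if the half maps to a plain MVT
}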
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
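KnownBits values compose. A minimal sketch of zext followed by KnownBits::add (assuming both inputs are at most 64 bits wide, since zext cannot shrink a value):

#include "llvm/Support/KnownBits.h"

// What is known about (zext A + zext B), computed at 64 bits?
static llvm::KnownBits knownZextAdd(const llvm::KnownBits &A,
                                    const llvm::KnownBits &B) {
  llvm::KnownBits WideA = A.zext(64); // new high bits become known zero
  llvm::KnownBits WideB = B.zext(64);
  return llvm::KnownBits::add(WideA, WideB);
}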
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs