1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
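// Illustrative note (not part of the original file): if the calling-convention
// state has already allocated SGPR0..SGPR3, this helper returns AMDGPU::SGPR4,
// the first SGPR that CCInfo has not yet claimed.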
93
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
 171 // Unless there are also VOP3P operations, no operations are really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
200
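// Illustrative sketch (assumption; the corresponding call is elided from this
// listing): a target usually records this 0/1 convention with something like
//   setBooleanContents(ZeroOrOneBooleanContent);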
201 // We need to custom lower vector stores from local memory
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
226 ISD::SETCC}) {
227 // FIXME: The promoted to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
231
233
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
353
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
443 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
444 Custom);
445
446 // Avoid stack access for these.
447 // TODO: Generalize to more vector types.
449 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
450 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
451 Custom);
452
453 // Deal with vec3 vector operations when widened to vec4.
455 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
456
457 // Deal with vec5/6/7 vector operations when widened to vec8.
459 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
460 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
461 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
462 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
463 Custom);
464
465 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
466 // and output demarshalling
467 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
468
469 // We can't return success/failure, only the old value,
470 // let LLVM add the comparison
472 Expand);
473
474 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
475
476 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
477
478 // FIXME: This should be narrowed to i32, but that only happens if i64 is
479 // illegal.
480 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
481 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
482
 483 // On SI this is s_memtime; on VI it is s_memrealtime.
485
486 if (Subtarget->hasSMemRealTime() ||
490
491 if (Subtarget->has16BitInsts()) {
494 } else {
496 }
497
498 if (Subtarget->hasMadMacF32Insts())
500
501 if (!Subtarget->hasBFI())
502 // fcopysign can be done in a single instruction with BFI.
503 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
504
505 if (!Subtarget->hasBCNT(32))
507
508 if (!Subtarget->hasBCNT(64))
510
511 if (Subtarget->hasFFBH())
513
514 if (Subtarget->hasFFBL())
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
525 if (Subtarget->hasBFE())
527
528 // Clamp modifier on add/sub
529 if (Subtarget->hasIntClamp())
531
532 if (Subtarget->hasAddNoCarry())
533 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
534 Legal);
535
538 {MVT::f32, MVT::f64}, Custom);
539
540 // These are really only legal for ieee_mode functions. We should be avoiding
541 // them for functions that don't have ieee_mode enabled, so just say they are
542 // legal.
544 {MVT::f32, MVT::f64}, Legal);
545
546 if (Subtarget->haveRoundOpsF64())
548 Legal);
549 else
551 MVT::f64, Custom);
552
554 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
555 Legal);
556 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
557
560
561 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563
564 // Custom lower these because we can't specify a rule based on an illegal
565 // source bf16.
568
569 if (Subtarget->has16BitInsts()) {
572 MVT::i16, Legal);
573
574 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
575
577 MVT::i16, Expand);
578
582 ISD::CTPOP},
583 MVT::i16, Promote);
584
586
587 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
588
590 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
592 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
593
597
599
600 // F16 - Constant Actions.
603
604 // F16 - Load/Store Actions.
606 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
608 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
609
610 // BF16 - Load/Store Actions.
612 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
614 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
615
616 // F16 - VOP1 Actions.
619 MVT::f16, Custom);
620
621 // BF16 - VOP1 Actions.
622 if (Subtarget->hasBF16TransInsts())
624
627
628 // F16 - VOP2 Actions.
629 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
630 Expand);
634
635 // F16 - VOP3 Actions.
637 if (STI.hasMadF16())
639
640 for (MVT VT :
641 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
642 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
643 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
644 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
645 switch (Op) {
646 case ISD::LOAD:
647 case ISD::STORE:
649 case ISD::BITCAST:
650 case ISD::UNDEF:
655 case ISD::IS_FPCLASS:
656 break;
660 break;
661 default:
663 break;
664 }
665 }
666 }
667
668 // v_perm_b32 can handle either of these.
669 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
671
672 // XXX - Do these do anything? Vector constants turn into build_vector.
673 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
674
675 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
676 Legal);
677
679 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
681 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
682
684 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
686 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
687
688 setOperationAction(ISD::AND, MVT::v2i16, Promote);
689 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
690 setOperationAction(ISD::OR, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
694
696 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
698 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
699 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
701
703 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
705 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
707 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
708
710 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
712 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
713 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720
722 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
724 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
726 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
730 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
732 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
733 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
734
736 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
738 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
739 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
741
742 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
744 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
746 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
748
750 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
752 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
753 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
755
757 MVT::v2i32, Expand);
759
761 MVT::v4i32, Expand);
762
764 MVT::v8i32, Expand);
765
766 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
767 Subtarget->hasVOP3PInsts() ? Legal : Custom);
768
769 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
770 // This isn't really legal, but this avoids the legalizer unrolling it (and
771 // allows matching fneg (fabs x) patterns)
772 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
773
774 // Can do this in one BFI plus a constant materialize.
776 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
777 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
778 MVT::v32f16, MVT::v32bf16},
779 Custom);
780
783 MVT::f16, Custom);
785
788 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
789 Custom);
790
792 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 Expand);
794
795 for (MVT Vec16 :
796 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
797 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
800 Vec16, Custom);
802 }
803 }
804
805 if (Subtarget->hasVOP3PInsts()) {
809 MVT::v2i16, Legal);
810
813 MVT::v2f16, Legal);
814
816 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
817
819 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
820 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
821 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
822 Custom);
823
824 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
825 // Split vector operations.
830 VT, Custom);
831
832 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
833 // Split vector operations.
835 VT, Custom);
836
839 {MVT::v2f16, MVT::v4f16}, Custom);
840
841 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
842 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
843 Custom);
844
845 if (Subtarget->hasPackedFP32Ops()) {
847 MVT::v2f32, Legal);
849 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
850 Custom);
851 }
852 }
853
855
856 if (Subtarget->has16BitInsts()) {
858 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
860 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
861 } else {
862 // Legalization hack.
863 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
864
866 }
867
869 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
870 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
871 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
872 MVT::v32f16, MVT::v32bf16},
873 Custom);
874
876
877 if (Subtarget->hasVectorMulU64())
879 else if (Subtarget->hasScalarSMulU64())
881
882 if (Subtarget->hasMad64_32())
884
885 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
887
888 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
890 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
891 } else {
892 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
893 if (Subtarget->hasMinimum3Maximum3F32())
895
896 if (Subtarget->hasMinimum3Maximum3PKF16()) {
898
899 // If only the vector form is available, we need to widen to a vector.
900 if (!Subtarget->hasMinimum3Maximum3F16())
902 }
903 }
904
905 if (Subtarget->hasVOP3PInsts()) {
906 // We want to break these into v2f16 pieces, not scalarize.
908 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
909 Custom);
910 }
911
912 if (Subtarget->hasIntMinMax64())
914 Legal);
915
917 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
918 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
919 MVT::i8},
920 Custom);
921
923 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
924 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
925 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
926 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
927 Custom);
928
930 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
931 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
932 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
933 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
934 Custom);
935
941
942 // TODO: Could move this to custom lowering, could benefit from combines on
943 // extract of relevant bits.
945
947
948 if (Subtarget->hasBF16ConversionInsts()) {
949 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
951 }
952
953 if (Subtarget->hasBF16PackedInsts()) {
956 MVT::v2bf16, Legal);
957 }
958
959 if (Subtarget->hasBF16TransInsts()) {
961 }
962
963 if (Subtarget->hasCvtPkF16F32Inst()) {
965 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
966 Custom);
967 }
968
972 ISD::SUB,
974 ISD::MUL,
975 ISD::FADD,
976 ISD::FSUB,
977 ISD::FDIV,
978 ISD::FMUL,
987 ISD::FMA,
988 ISD::SMIN,
989 ISD::SMAX,
990 ISD::UMIN,
991 ISD::UMAX,
994 ISD::SMIN,
995 ISD::SMAX,
996 ISD::UMIN,
997 ISD::UMAX,
998 ISD::AND,
999 ISD::OR,
1000 ISD::XOR,
1001 ISD::SHL,
1002 ISD::SRL,
1003 ISD::SRA,
1004 ISD::FSHR,
1014
1015 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1017
1018 // All memory operations. Some folding on the pointer operand is done to help
1019 // matching the constant offsets in the addressing modes.
1021 ISD::STORE,
1044
1045 // FIXME: In other contexts we pretend this is a per-function property.
1047
1049}
1050
1051const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1052
1054 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1055 return RCRegs;
1056}
1057
1058//===----------------------------------------------------------------------===//
1059// TargetLowering queries
1060//===----------------------------------------------------------------------===//
1061
1062// v_mad_mix* support a conversion from f16 to f32.
1063//
 1064// There is only one special case, when denormals are enabled, where this
 1065// would still be OK to use, but we don't currently handle it.
1066bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1067 EVT DestVT, EVT SrcVT) const {
1068 return DestVT.getScalarType() == MVT::f32 &&
1069 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1070 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1071 SrcVT.getScalarType() == MVT::f16) ||
1072 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1073 SrcVT.getScalarType() == MVT::bf16)) &&
1074 // TODO: This probably only requires no input flushing?
1076}
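// Illustrative note (assumption, not from the original file): the fold this
// hook enables is, roughly, turning
//   (fma (fpext f16 %a), (fpext f16 %b), f32 %c)
// into a single mixed-precision instruction such as v_fma_mix_f32 instead of
// first emitting separate f16-to-f32 conversions.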
1077
1079 LLT DestTy, LLT SrcTy) const {
1080 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1081 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1082 DestTy.getScalarSizeInBits() == 32 &&
1083 SrcTy.getScalarSizeInBits() == 16 &&
1084 // TODO: This probably only requires no input flushing?
1085 denormalModeIsFlushAllF32(*MI.getMF());
1086}
1087
1089 // SI has some legal vector types, but no legal vector operations. Say no
1090 // shuffles are legal in order to prefer scalarizing some vector operations.
1091 return false;
1092}
1093
1095 CallingConv::ID CC,
1096 EVT VT) const {
1099
1100 if (VT.isVector()) {
1101 EVT ScalarVT = VT.getScalarType();
1102 unsigned Size = ScalarVT.getSizeInBits();
1103 if (Size == 16) {
1104 if (Subtarget->has16BitInsts()) {
1105 if (VT.isInteger())
1106 return MVT::v2i16;
1107 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1108 }
1109 return VT.isInteger() ? MVT::i32 : MVT::f32;
1110 }
1111
1112 if (Size < 16)
1113 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1114 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1115 }
1116
1117 if (VT.getSizeInBits() > 32)
1118 return MVT::i32;
1119
1121}
1122
1124 CallingConv::ID CC,
1125 EVT VT) const {
1128
1129 if (VT.isVector()) {
1130 unsigned NumElts = VT.getVectorNumElements();
1131 EVT ScalarVT = VT.getScalarType();
1132 unsigned Size = ScalarVT.getSizeInBits();
1133
1134 // FIXME: Should probably promote 8-bit vectors to i16.
1135 if (Size == 16 && Subtarget->has16BitInsts())
1136 return (NumElts + 1) / 2;
1137
1138 if (Size <= 32)
1139 return NumElts;
1140
1141 if (Size > 32)
1142 return NumElts * ((Size + 31) / 32);
1143 } else if (VT.getSizeInBits() > 32)
1144 return (VT.getSizeInBits() + 31) / 32;
1145
1147}
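// Illustrative examples (not from the original file; they assume a subtarget
// with 16-bit instructions): a v3f16 argument is counted as (3 + 1) / 2 = 2
// registers, while a v2i64 argument needs 2 * ((64 + 31) / 32) = 4 32-bit
// registers.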
1148
1150 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1151 unsigned &NumIntermediates, MVT &RegisterVT) const {
1152 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1157 // support, but unless we can properly handle 3-vectors, it will still be
1158 // inconsistent.
1159 if (Size == 16 && Subtarget->has16BitInsts()) {
1160 if (ScalarVT == MVT::bf16) {
1161 RegisterVT = MVT::i32;
1162 IntermediateVT = MVT::v2bf16;
1163 } else {
1164 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1165 IntermediateVT = RegisterVT;
1166 }
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1169 }
1170
1171 if (Size == 32) {
1172 RegisterVT = ScalarVT.getSimpleVT();
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1176 }
1177
1178 if (Size < 16 && Subtarget->has16BitInsts()) {
1179 // FIXME: Should probably form v2i16 pieces
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size != 16 && Size <= 32) {
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1191 }
1192
1193 if (Size > 32) {
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((Size + 31) / 32);
1197 return NumIntermediates;
1198 }
1199 }
1200
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1203}
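// Illustrative example (assumption: 16-bit instructions are available): a
// v5f16 argument is broken into NumIntermediates = (5 + 1) / 2 = 3 pieces of
// IntermediateVT = v2f16, each carried in a v2f16 register.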
1204
1206 const DataLayout &DL, Type *Ty,
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1209
1210 LLVMContext &Ctx = Ty->getContext();
1211 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1213 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1214 NumElts);
1215 }
1216
1217 return TLI.getValueType(DL, Ty);
1218}
1219
1220// Peek through TFE struct returns to only use the data size.
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
1224 auto *ST = dyn_cast<StructType>(Ty);
1225 if (!ST)
1226 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1227
1228 // TFE intrinsics return an aggregate type.
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1231 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1232}
1233
1234/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1235/// in-memory representation. This return value is a custom type because there
1236/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1237/// could cause issues during codegen, these address space 7 pointers will be
1238/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1239/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1240/// for cost modeling, to work. (This also sets us up decently for doing the
1241/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1243 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1244 return MVT::amdgpuBufferFatPointer;
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1249}
1250/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1251/// v8i32 when padding is added.
1252/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1253/// also v8i32 with padding.
1255 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1256 DL.getPointerSizeInBits(AS) == 160) ||
1258 DL.getPointerSizeInBits(AS) == 192))
1259 return MVT::v8i32;
1261}
1262
1263static unsigned getIntrMemWidth(unsigned IntrID) {
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1267 return 8;
1268 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1269 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1270 return 32;
1271 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1272 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1273 return 64;
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1276 return 128;
1277 default:
1278 llvm_unreachable("Unknown width");
1279 }
1280}
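// Illustrative note (not part of the original file): for
// amdgcn_global_load_async_to_lds_b64 this returns 64, which the caller below
// turns into an i64 memVT for the intrinsic's memory operand.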
1281
1283 const CallInst &CI,
1284 MachineFunction &MF,
1285 unsigned IntrID) const {
1287 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1289 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1291 Info.flags |= getTargetMMOFlags(CI);
1292
1293 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1295 AttributeSet Attr =
1297 MemoryEffects ME = Attr.getMemoryEffects();
1298 if (ME.doesNotAccessMemory())
1299 return false;
1300
1301 // TODO: Should images get their own address space?
1302 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1303
1304 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1305 if (RsrcIntr->IsImage) {
1308 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1309 Info.align.reset();
1310 }
1311
1312 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1313 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1314 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1315 // We conservatively set the memory operand of a buffer intrinsic to the
1316 // base resource pointer, so that we can access alias information about
1317 // those pointers. Cases like "this points at the same value
1318 // but with a different offset" are handled in
1319 // areMemAccessesTriviallyDisjoint.
1320 Info.ptrVal = RsrcArg;
1321 }
1322
1323 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1324 if (!IsSPrefetch) {
1325 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1326 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1328 }
1329
1331 if (ME.onlyReadsMemory()) {
1332 if (RsrcIntr->IsImage) {
1333 unsigned MaxNumLanes = 4;
1334
1335 if (!BaseOpcode->Gather4) {
1336 // If this isn't a gather, we may have excess loaded elements in the
1337 // IR type. Check the dmask for the real number of elements loaded.
1338 unsigned DMask =
1339 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1340 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1341 }
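 // Worked example (illustrative, not from the original file): a dmask of
 // 0b1011 has three bits set, so llvm::popcount(DMask) == 3 and only three
 // lanes are actually loaded even if the IR return type has four elements.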
1342
1343 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1344 CI.getType(), MaxNumLanes);
1345 } else {
1346 Info.memVT =
1348 std::numeric_limits<unsigned>::max());
1349 }
1350
1351 // FIXME: What does alignment mean for an image?
1354 } else if (ME.onlyWritesMemory()) {
1356
1357 Type *DataTy = CI.getArgOperand(0)->getType();
1358 if (RsrcIntr->IsImage) {
1359 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1360 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1361 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1362 DMaskLanes);
1363 } else
1364 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1365
1367 } else {
1368 // Atomic, NoReturn Sampler or prefetch
1371 Info.flags |=
1373
1374 if (!IsSPrefetch)
1376
1377 switch (IntrID) {
1378 default:
1379 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1380 // Fake memory access type for no return sampler intrinsics
1381 Info.memVT = MVT::i32;
1382 } else {
1383 // XXX - Should this be volatile without known ordering?
1385 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1386 }
1387 break;
1388 case Intrinsic::amdgcn_raw_buffer_load_lds:
1389 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1390 case Intrinsic::amdgcn_struct_buffer_load_lds:
1391 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1392 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1393 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1394 Info.ptrVal = CI.getArgOperand(1);
1395 return true;
1396 }
1397 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1398 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1399 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1400 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1401 Info.memVT =
1403 std::numeric_limits<unsigned>::max());
1404 Info.flags &= ~MachineMemOperand::MOStore;
1405 return true;
1406 }
1407 }
1408 }
1409 return true;
1410 }
1411
1412 switch (IntrID) {
1413 case Intrinsic::amdgcn_ds_ordered_add:
1414 case Intrinsic::amdgcn_ds_ordered_swap: {
1416 Info.memVT = MVT::getVT(CI.getType());
1417 Info.ptrVal = CI.getOperand(0);
1418 Info.align.reset();
1420
1421 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1422 if (!Vol->isZero())
1424
1425 return true;
1426 }
1427 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1428 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1430 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1431 Info.ptrVal = nullptr;
1432 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1434 return true;
1435 }
1436 case Intrinsic::amdgcn_ds_append:
1437 case Intrinsic::amdgcn_ds_consume: {
1439 Info.memVT = MVT::getVT(CI.getType());
1440 Info.ptrVal = CI.getOperand(0);
1441 Info.align.reset();
1443
1444 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1445 if (!Vol->isZero())
1447
1448 return true;
1449 }
1450 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1451 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1452 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1455 Info.memVT = MVT::getVT(CI.getType());
1456 Info.ptrVal = CI.getOperand(0);
1457 Info.memVT = MVT::i64;
1458 Info.size = 8;
1459 Info.align.reset();
1461 return true;
1462 }
1463 case Intrinsic::amdgcn_global_atomic_csub: {
1465 Info.memVT = MVT::getVT(CI.getType());
1466 Info.ptrVal = CI.getOperand(0);
1467 Info.align.reset();
1470 return true;
1471 }
1472 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1473 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1474 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1476 Info.memVT =
1477 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1478 ? CI.getType()
1479 : cast<StructType>(CI.getType())
1480 ->getElementType(0)); // XXX: what is correct VT?
1481
1482 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1483 Info.align.reset();
1484 Info.flags |=
1486 return true;
1487 }
1488 case Intrinsic::amdgcn_global_atomic_fmin_num:
1489 case Intrinsic::amdgcn_global_atomic_fmax_num:
1490 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1491 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1492 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1493 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1495 Info.memVT = MVT::getVT(CI.getType());
1496 Info.ptrVal = CI.getOperand(0);
1497 Info.align.reset();
1501 return true;
1502 }
1503 case Intrinsic::amdgcn_flat_load_monitor_b32:
1504 case Intrinsic::amdgcn_flat_load_monitor_b64:
1505 case Intrinsic::amdgcn_flat_load_monitor_b128:
1506 case Intrinsic::amdgcn_global_load_monitor_b32:
1507 case Intrinsic::amdgcn_global_load_monitor_b64:
1508 case Intrinsic::amdgcn_global_load_monitor_b128:
1509 case Intrinsic::amdgcn_ds_load_tr6_b96:
1510 case Intrinsic::amdgcn_ds_load_tr4_b64:
1511 case Intrinsic::amdgcn_ds_load_tr8_b64:
1512 case Intrinsic::amdgcn_ds_load_tr16_b128:
1513 case Intrinsic::amdgcn_global_load_tr6_b96:
1514 case Intrinsic::amdgcn_global_load_tr4_b64:
1515 case Intrinsic::amdgcn_global_load_tr_b64:
1516 case Intrinsic::amdgcn_global_load_tr_b128:
1517 case Intrinsic::amdgcn_ds_read_tr4_b64:
1518 case Intrinsic::amdgcn_ds_read_tr6_b96:
1519 case Intrinsic::amdgcn_ds_read_tr8_b64:
1520 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1522 Info.memVT = MVT::getVT(CI.getType());
1523 Info.ptrVal = CI.getOperand(0);
1524 Info.align.reset();
1526 return true;
1527 }
1528 case Intrinsic::amdgcn_ds_gws_init:
1529 case Intrinsic::amdgcn_ds_gws_barrier:
1530 case Intrinsic::amdgcn_ds_gws_sema_v:
1531 case Intrinsic::amdgcn_ds_gws_sema_br:
1532 case Intrinsic::amdgcn_ds_gws_sema_p:
1533 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1535
1536 const GCNTargetMachine &TM =
1537 static_cast<const GCNTargetMachine &>(getTargetMachine());
1538
1540 Info.ptrVal = MFI->getGWSPSV(TM);
1541
1542 // This is an abstract access, but we need to specify a type and size.
1543 Info.memVT = MVT::i32;
1544 Info.size = 4;
1545 Info.align = Align(4);
1546
1547 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1549 else
1551 return true;
1552 }
1553 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1554 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1555 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1556 case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
1558 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1559 Info.ptrVal = CI.getArgOperand(1);
1561 return true;
1562 }
1563 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1564 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1565 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1566 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1568 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1569 Info.ptrVal = CI.getArgOperand(0);
1571 return true;
1572 }
1573 case Intrinsic::amdgcn_load_to_lds:
1574 case Intrinsic::amdgcn_global_load_lds: {
1576 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1577 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1578 Info.ptrVal = CI.getArgOperand(1);
1580 return true;
1581 }
1582 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1583 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1584 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1585 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1587
1588 const GCNTargetMachine &TM =
1589 static_cast<const GCNTargetMachine &>(getTargetMachine());
1590
1592 Info.ptrVal = MFI->getGWSPSV(TM);
1593
1594 // This is an abstract access, but we need to specify a type and size.
1595 Info.memVT = MVT::i32;
1596 Info.size = 4;
1597 Info.align = Align(4);
1598
1600 return true;
1601 }
1602 case Intrinsic::amdgcn_s_prefetch_data:
1603 case Intrinsic::amdgcn_flat_prefetch:
1604 case Intrinsic::amdgcn_global_prefetch: {
1606 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1607 Info.ptrVal = CI.getArgOperand(0);
1609 return true;
1610 }
1611 default:
1612 return false;
1613 }
1614}
1615
1617 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1618 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1619 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1620 // The DAG's ValueType loses the addrspaces.
1621 // Add them as 2 extra Constant operands "from" and "to".
1622 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1623 unsigned DstAS = I.getType()->getPointerAddressSpace();
1624 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1625 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1626 break;
1627 }
1628 default:
1629 break;
1630 }
1631}
1632
1635 Type *&AccessTy) const {
1636 Value *Ptr = nullptr;
1637 switch (II->getIntrinsicID()) {
1638 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1639 case Intrinsic::amdgcn_ds_append:
1640 case Intrinsic::amdgcn_ds_consume:
1641 case Intrinsic::amdgcn_ds_load_tr8_b64:
1642 case Intrinsic::amdgcn_ds_load_tr16_b128:
1643 case Intrinsic::amdgcn_ds_load_tr4_b64:
1644 case Intrinsic::amdgcn_ds_load_tr6_b96:
1645 case Intrinsic::amdgcn_ds_read_tr4_b64:
1646 case Intrinsic::amdgcn_ds_read_tr6_b96:
1647 case Intrinsic::amdgcn_ds_read_tr8_b64:
1648 case Intrinsic::amdgcn_ds_read_tr16_b64:
1649 case Intrinsic::amdgcn_ds_ordered_add:
1650 case Intrinsic::amdgcn_ds_ordered_swap:
1651 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1652 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1653 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1654 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1655 case Intrinsic::amdgcn_flat_load_monitor_b128:
1656 case Intrinsic::amdgcn_flat_load_monitor_b32:
1657 case Intrinsic::amdgcn_flat_load_monitor_b64:
1658 case Intrinsic::amdgcn_global_atomic_csub:
1659 case Intrinsic::amdgcn_global_atomic_fmax_num:
1660 case Intrinsic::amdgcn_global_atomic_fmin_num:
1661 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1662 case Intrinsic::amdgcn_global_load_monitor_b128:
1663 case Intrinsic::amdgcn_global_load_monitor_b32:
1664 case Intrinsic::amdgcn_global_load_monitor_b64:
1665 case Intrinsic::amdgcn_global_load_tr_b64:
1666 case Intrinsic::amdgcn_global_load_tr_b128:
1667 case Intrinsic::amdgcn_global_load_tr4_b64:
1668 case Intrinsic::amdgcn_global_load_tr6_b96:
1669 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1670 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1671 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1672 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1673 Ptr = II->getArgOperand(0);
1674 break;
1675 case Intrinsic::amdgcn_load_to_lds:
1676 case Intrinsic::amdgcn_global_load_lds:
1677 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1678 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1679 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1681 Ptr = II->getArgOperand(1);
1682 break;
1683 default:
1684 return false;
1685 }
1686 AccessTy = II->getType();
1687 Ops.push_back(Ptr);
1688 return true;
1689}
1690
1692 unsigned AddrSpace) const {
1693 if (!Subtarget->hasFlatInstOffsets()) {
1694 // Flat instructions do not have offsets, and only have the register
1695 // address.
1696 return AM.BaseOffs == 0 && AM.Scale == 0;
1697 }
1698
1699 decltype(SIInstrFlags::FLAT) FlatVariant =
1703
1704 return AM.Scale == 0 &&
1705 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1706 AM.BaseOffs, AddrSpace, FlatVariant));
1707}
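// Illustrative note (assumption): on subtargets without flat instruction
// offsets, the early return above accepts only a bare register address, so an
// addressing mode with BaseOffs == 8 or Scale == 1 is rejected.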
1708
1710 if (Subtarget->hasFlatGlobalInsts())
1712
1713 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
 1714 // Assume we will use FLAT for all global memory accesses
1715 // on VI.
1716 // FIXME: This assumption is currently wrong. On VI we still use
1717 // MUBUF instructions for the r + i addressing mode. As currently
1718 // implemented, the MUBUF instructions only work on buffer < 4GB.
1719 // It may be possible to support > 4GB buffers with MUBUF instructions,
1720 // by setting the stride value in the resource descriptor which would
1721 // increase the size limit to (stride * 4GB). However, this is risky,
1722 // because it has never been validated.
1724 }
1725
1726 return isLegalMUBUFAddressingMode(AM);
1727}
1728
1729bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1730 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1731 // additionally can do r + r + i with addr64. 32-bit has more addressing
1732 // mode options. Depending on the resource constant, it can also do
1733 // (i64 r0) + (i32 r1) * (i14 i).
1734 //
1735 // Private arrays end up using a scratch buffer most of the time, so also
1736 // assume those use MUBUF instructions. Scratch loads / stores are currently
1737 // implemented as mubuf instructions with offen bit set, so slightly
1738 // different than the normal addr64.
1739 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1740 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1741 return false;
1742
1743 // FIXME: Since we can split immediate into soffset and immediate offset,
1744 // would it make sense to allow any immediate?
1745
1746 switch (AM.Scale) {
1747 case 0: // r + i or just i, depending on HasBaseReg.
1748 return true;
1749 case 1:
1750 return true; // We have r + r or r + i.
1751 case 2:
1752 if (AM.HasBaseReg) {
1753 // Reject 2 * r + r.
1754 return false;
1755 }
1756
1757 // Allow 2 * r as r + r
1758 // Or 2 * r + i is allowed as r + r + i.
1759 return true;
1760 default: // Don't allow n * r
1761 return false;
1762 }
1763}
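// Illustrative note (assumption): given the switch above, an addressing mode
// such as {BaseOffs = 16, HasBaseReg = true, Scale = 1} (r + r + 16) is
// accepted once the immediate-offset check passes, while Scale = 3 (3 * r)
// falls into the default case and is rejected.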
1764
1766 const AddrMode &AM, Type *Ty,
1767 unsigned AS,
1768 Instruction *I) const {
1769 // No global is ever allowed as a base.
1770 if (AM.BaseGV)
1771 return false;
1772
1773 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1774 return isLegalGlobalAddressingMode(AM);
1775
1776 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1780 // If the offset isn't a multiple of 4, it probably isn't going to be
1781 // correctly aligned.
1782 // FIXME: Can we get the real alignment here?
1783 if (AM.BaseOffs % 4 != 0)
1784 return isLegalMUBUFAddressingMode(AM);
1785
1786 if (!Subtarget->hasScalarSubwordLoads()) {
1787 // There are no SMRD extloads, so if we have to do a small type access we
1788 // will use a MUBUF load.
1789 // FIXME?: We also need to do this if unaligned, but we don't know the
1790 // alignment here.
1791 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1792 return isLegalGlobalAddressingMode(AM);
1793 }
1794
1796 // SMRD instructions have an 8-bit, dword offset on SI.
1797 if (!isUInt<8>(AM.BaseOffs / 4))
1798 return false;
1799 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1800 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1801 // in 8-bits, it can use a smaller encoding.
1802 if (!isUInt<32>(AM.BaseOffs / 4))
1803 return false;
1804 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1805 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1806 if (!isUInt<20>(AM.BaseOffs))
1807 return false;
1808 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1809 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1810 // for S_BUFFER_* instructions).
1811 if (!isInt<21>(AM.BaseOffs))
1812 return false;
1813 } else {
1814 // On GFX12, all offsets are signed 24-bit in bytes.
1815 if (!isInt<24>(AM.BaseOffs))
1816 return false;
1817 }
1818
1819 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1821 AM.BaseOffs < 0) {
1822 // Scalar (non-buffer) loads can only use a negative offset if
1823 // soffset+offset is non-negative. Since the compiler can only prove that
1824 // in a few special cases, it is safer to claim that negative offsets are
1825 // not supported.
1826 return false;
1827 }
1828
1829 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1830 return true;
1831
1832 if (AM.Scale == 1 && AM.HasBaseReg)
1833 return true;
1834
1835 return false;
1836 }
1837
1838 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1839 return Subtarget->enableFlatScratch()
1841 : isLegalMUBUFAddressingMode(AM);
1842
1843 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1844 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1845 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1846 // field.
1847 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1848 // an 8-bit dword offset but we don't know the alignment here.
1849 if (!isUInt<16>(AM.BaseOffs))
1850 return false;
1851
1852 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1853 return true;
1854
1855 if (AM.Scale == 1 && AM.HasBaseReg)
1856 return true;
1857
1858 return false;
1859 }
1860
1862 // For an unknown address space, this usually means that this is for some
1863 // reason being used for pure arithmetic, and not based on some addressing
1864 // computation. We don't have instructions that compute pointers with any
1865 // addressing modes, so treat them as having no offset like flat
1866 // instructions.
1868 }
1869
1870 // Assume a user alias of global for unknown address spaces.
1871 return isLegalGlobalAddressingMode(AM);
1872}
1873
1875 const MachineFunction &MF) const {
1877 return (MemVT.getSizeInBits() <= 4 * 32);
1878 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1879 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1880 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1881 }
1883 return (MemVT.getSizeInBits() <= 2 * 32);
1884 return true;
1885}
1886
1888 unsigned Size, unsigned AddrSpace, Align Alignment,
1889 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1890 if (IsFast)
1891 *IsFast = 0;
1892
1893 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1894 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1895 // Check if alignment requirements for ds_read/write instructions are
1896 // disabled.
1897 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1898 return false;
1899
1900 Align RequiredAlignment(
1901 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
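 // Worked example (illustrative): for Size == 96, divideCeil(96, 8) == 12 and
 // PowerOf2Ceil(12) == 16, so the "natural" alignment computed here is 16
 // bytes.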
1902 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1903 Alignment < RequiredAlignment)
1904 return false;
1905
 1906 // Either the alignment requirements are "enabled", or there is an
 1907 // unaligned LDS access related hardware bug even though alignment requirements
1908 // are "disabled". In either case, we need to check for proper alignment
1909 // requirements.
1910 //
1911 switch (Size) {
1912 case 64:
1913 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1914 // address is negative, then the instruction is incorrectly treated as
1915 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1916 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1917 // load later in the SILoadStoreOptimizer.
1918 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1919 return false;
1920
 1921 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1922 // can do a 4 byte aligned, 8 byte access in a single operation using
1923 // ds_read2/write2_b32 with adjacent offsets.
1924 RequiredAlignment = Align(4);
1925
1926 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1927 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1928 // ds_write2_b32 depending on the alignment. In either case with either
1929 // alignment there is no faster way of doing this.
1930
 1931 // The numbers returned here and below are not additive; they form a 'speed
 1932 // rank'. They are just meant to be compared to decide if a certain way
 1933 // of lowering an operation is faster than another. For that purpose a
 1934 // naturally aligned operation gets its bitsize to indicate that "it
1935 // operates with a speed comparable to N-bit wide load". With the full
1936 // alignment ds128 is slower than ds96 for example. If underaligned it
1937 // is comparable to a speed of a single dword access, which would then
1938 // mean 32 < 128 and it is faster to issue a wide load regardless.
 1939 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
 1940 // wider load which will not be aligned anymore, the latter is slower.
1941 if (IsFast)
1942 *IsFast = (Alignment >= RequiredAlignment) ? 64
1943 : (Alignment < Align(4)) ? 32
1944 : 1;
1945 return true;
1946 }
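 // Illustrative reading of the ranking above (assumption): a 4-byte aligned
 // 8-byte LDS access reports 64, one aligned below 4 bytes reports 32
 // (comparable to a single dword access), and a value of 1 generally means
 // "slow, don't do it".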
1947
1948 break;
1949 case 96:
1950 if (!Subtarget->hasDS96AndDS128())
1951 return false;
1952
 1953 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1954 // gfx8 and older.
1955
1956 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1957 // Naturally aligned access is fastest. However, also report it is Fast
1958 // if memory is aligned less than DWORD. A narrow load or store will be
 1959 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1960 // be more of them, so overall we will pay less penalty issuing a single
1961 // instruction.
1962
1963 // See comment on the values above.
1964 if (IsFast)
1965 *IsFast = (Alignment >= RequiredAlignment) ? 96
1966 : (Alignment < Align(4)) ? 32
1967 : 1;
1968 return true;
1969 }
1970
1971 break;
1972 case 128:
1973 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1974 return false;
1975
1976 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1977 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
1978 // single operation using ds_read2/write2_b64.
1979 RequiredAlignment = Align(8);
1980
1981 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1982 // Naturally aligned access is fastest. However, also report it as fast
1983 // if memory is aligned to less than a DWORD. A narrow load or store will
1984 // be just as slow as a single ds_read_b128/ds_write_b128, but there
1985 // will be more of them, so overall we pay less of a penalty issuing a
1986 // single instruction.
1987
1988 // See comment on the values above.
1989 if (IsFast)
1990 *IsFast = (Alignment >= RequiredAlignment) ? 128
1991 : (Alignment < Align(4)) ? 32
1992 : 1;
1993 return true;
1994 }
1995
1996 break;
1997 default:
1998 if (Size > 32)
1999 return false;
2000
2001 break;
2002 }
2003
2004 // See comment on the values above.
2005 // Note that we have a single-dword or sub-dword access here, so if it is
2006 // underaligned it is the slowest possible access, hence the returned value is 0.
2007 if (IsFast)
2008 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2009
2010 return Alignment >= RequiredAlignment ||
2011 Subtarget->hasUnalignedDSAccessEnabled();
2012 }
2013
2014 // FIXME: We have to be conservative here and assume that flat operations
2015 // will access scratch. If we had access to the IR function, then we
2016 // could determine if any private memory was used in the function.
2017 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2018 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2019 bool AlignedBy4 = Alignment >= Align(4);
2020 if (IsFast)
2021 *IsFast = AlignedBy4;
2022
2023 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2024 }
2025
2026 // So long as they are correct, wide global memory operations perform better
2027 // than multiple smaller memory ops -- even when misaligned
2028 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2029 if (IsFast)
2030 *IsFast = Size;
2031
2032 return Alignment >= Align(4) ||
2034 }
2035
2036 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2037 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2038 // out-of-bounds behavior, but in the edge case where an access starts
2039 // out-of-bounds and then enters in-bounds, the entire access would be treated
2040 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2041 // natural alignment of buffer accesses.
2042 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2043 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2044 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2045 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2046 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2047 return false;
2048 }
2049
2050 // Values smaller than a dword must be aligned.
2051 if (Size < 32)
2052 return false;
2053
2054 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2055 // byte-address are ignored, thus forcing Dword alignment.
2056 // This applies to private, global, and constant memory.
2057 if (IsFast)
2058 *IsFast = 1;
2059
2060 return Size >= 32 && Alignment >= Align(4);
2061}
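// --- Illustrative sketch (not part of SIISelLowering.cpp) --------------------
// The *IsFast values written above form a relative "speed rank": callers only
// compare them to pick the faster of two candidate lowerings. A minimal,
// self-contained model of that comparison (LoweringCandidate and pickFaster
// are hypothetical names, not LLVM API):
#include <cstdint>

struct LoweringCandidate {
  unsigned SizeInBits; // access width of this candidate lowering
  unsigned SpeedRank;  // value the target would report via *IsFast (0 = slowest)
};

// Prefer the candidate with the higher reported rank; a rank of 0 or 1 means
// "slow, avoid it" relative to anything reporting its full bitwidth.
static inline const LoweringCandidate &
pickFaster(const LoweringCandidate &A, const LoweringCandidate &B) {
  return A.SpeedRank >= B.SpeedRank ? A : B;
}
// Example: an underaligned 128-bit DS access reported as rank 32 loses to a
// naturally aligned 64-bit access reported as rank 64.
// -----------------------------------------------------------------------------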
2062
2064 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2065 unsigned *IsFast) const {
2067 Alignment, Flags, IsFast);
2068}
2069
2071 LLVMContext &Context, const MemOp &Op,
2072 const AttributeList &FuncAttributes) const {
2073 // FIXME: Should account for address space here.
2074
2075 // The default fallback uses the private pointer size as a guess for a type to
2076 // use. Make sure we switch these to 64-bit accesses.
2077
2078 if (Op.size() >= 16 &&
2079 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2080 return MVT::v4i32;
2081
2082 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2083 return MVT::v2i32;
2084
2085 // Use the default.
2086 return MVT::Other;
2087}
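// --- Illustrative sketch (not part of SIISelLowering.cpp) --------------------
// A standalone model of the size/alignment -> type decision made in
// getOptimalMemOpType above. MemOpTy and pickMemOpType are hypothetical names
// used only to keep the sketch self-contained:
#include <cstdint>

enum class MemOpTy { V4I32, V2I32, Default };

static MemOpTy pickMemOpType(uint64_t Size, uint64_t DstAlign) {
  if (Size >= 16 && DstAlign >= 4) // big, dword-aligned copies use 128-bit ops
    return MemOpTy::V4I32;
  if (Size >= 8 && DstAlign >= 4)  // medium, dword-aligned copies use 64-bit ops
    return MemOpTy::V2I32;
  return MemOpTy::Default;         // otherwise fall back to the generic choice
}
// e.g. pickMemOpType(32, 4) == MemOpTy::V4I32, pickMemOpType(8, 2) == Default.
// -----------------------------------------------------------------------------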
2088
2090 const MemSDNode *MemNode = cast<MemSDNode>(N);
2091 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2092}
2093
2095 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2097}
2098
2100 unsigned DestAS) const {
2101 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2102 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2103 Subtarget->hasGloballyAddressableScratch()) {
2104 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2105 return false;
2106 }
2107
2108 // Flat -> private/local is a simple truncate.
2109 // Flat -> global is no-op
2110 return true;
2111 }
2112
2113 const GCNTargetMachine &TM =
2114 static_cast<const GCNTargetMachine &>(getTargetMachine());
2115 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2116}
2117
2120 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2121 VT.getScalarType().bitsLE(MVT::i16))
2124}
2125
2127 Type *Ty) const {
2128 // FIXME: Could be smarter if called for vector constants.
2129 return true;
2130}
2131
2133 unsigned Index) const {
2135 return false;
2136
2137 // TODO: Add more cases that are cheap.
2138 return Index == 0;
2139}
2140
2141bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2142 // TODO: This should be more aggressive, particularly for 16-bit element
2143 // vectors. However, there are some mixed improvements and regressions.
2144 EVT EltTy = VT.getVectorElementType();
2145 return EltTy.getSizeInBits() % 32 == 0;
2146}
2147
2149 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2150 switch (Op) {
2151 case ISD::LOAD:
2152 case ISD::STORE:
2153 return true;
2154 default:
2155 return false;
2156 }
2157 }
2158
2159 // SimplifySetCC uses this function to determine whether or not it should
2160 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2161 if (VT == MVT::i1 && Op == ISD::SETCC)
2162 return false;
2163
2165}
2166
2167SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2168 const SDLoc &SL,
2169 SDValue Chain,
2170 uint64_t Offset) const {
2171 const DataLayout &DL = DAG.getDataLayout();
2175
2176 auto [InputPtrReg, RC, ArgTy] =
2178
2179 // We may not have the kernarg segment argument if we have no kernel
2180 // arguments.
2181 if (!InputPtrReg)
2182 return DAG.getConstant(Offset, SL, PtrVT);
2183
2185 SDValue BasePtr = DAG.getCopyFromReg(
2186 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2187
2188 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2189}
2190
2191SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2192 const SDLoc &SL) const {
2195 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2196}
2197
2198SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2199 const SDLoc &SL) const {
2200
2202 std::optional<uint32_t> KnownSize =
2204 if (KnownSize.has_value())
2205 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2206 return SDValue();
2207}
2208
2209SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2210 const SDLoc &SL, SDValue Val,
2211 bool Signed,
2212 const ISD::InputArg *Arg) const {
2213 // First, if it is a widened vector, narrow it.
2214 if (VT.isVector() &&
2216 EVT NarrowedVT =
2219 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2220 DAG.getConstant(0, SL, MVT::i32));
2221 }
2222
2223 // Then convert the vector elements or scalar value.
2224 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2225 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2226 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2227 }
2228
2229 if (MemVT.isFloatingPoint())
2230 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2231 else if (Signed)
2232 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2233 else
2234 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2235
2236 return Val;
2237}
2238
2239SDValue SITargetLowering::lowerKernargMemParameter(
2240 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2241 uint64_t Offset, Align Alignment, bool Signed,
2242 const ISD::InputArg *Arg) const {
2244
2245 // Try to avoid using an extload by loading earlier than the argument address,
2246 // and extracting the relevant bits. The load should hopefully be merged with
2247 // the load for the previous argument.
2248 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2249 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2250 int64_t AlignDownOffset = alignDown(Offset, 4);
2251 int64_t OffsetDiff = Offset - AlignDownOffset;
2252
2253 EVT IntVT = MemVT.changeTypeToInteger();
2254
2255 // TODO: If we passed in the base kernel offset we could have a better
2256 // alignment than 4, but we don't really need it.
2257 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2258 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2261
2262 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2263 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2264
2265 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2266 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2267 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2268
2269 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2270 }
2271
2272 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2273 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2276
2277 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2278 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2279}
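// --- Illustrative sketch (not part of SIISelLowering.cpp) --------------------
// The sub-dword path above loads the containing dword and shifts the argument
// into place. The same arithmetic as plain C++ (extractSubDwordArg is a
// hypothetical name; little-endian layout is assumed, as on AMDGPU):
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t extractSubDwordArg(const uint8_t *KernargSegment,
                                   uint64_t Offset, unsigned StoreSize) {
  assert(StoreSize > 0 && StoreSize < 4 && "only used for sub-dword arguments");
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;

  // Dword load at the aligned offset (stand-in for the MVT::i32 DAG load).
  uint32_t Dword;
  std::memcpy(&Dword, KernargSegment + AlignDownOffset, 4);

  // SRL by OffsetDiff * 8, then truncate to the argument's store size.
  uint32_t Shifted = Dword >> (OffsetDiff * 8);
  uint32_t Mask = (1u << (StoreSize * 8)) - 1;
  return Shifted & Mask;
}
// Example: a 2-byte argument at Offset = 6 is read via the dword at offset 4
// and shifted right by 16 bits, so it can share a load with its neighbour.
// -----------------------------------------------------------------------------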
2280
2281SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2282 CCValAssign &VA, const SDLoc &SL,
2283 SDValue Chain,
2284 const ISD::InputArg &Arg) const {
2286 MachineFrameInfo &MFI = MF.getFrameInfo();
2287
2288 if (Arg.Flags.isByVal()) {
2289 unsigned Size = Arg.Flags.getByValSize();
2290 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2291 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2292 }
2293
2294 unsigned ArgOffset = VA.getLocMemOffset();
2295 unsigned ArgSize = VA.getValVT().getStoreSize();
2296
2297 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2298
2299 // Create load nodes to retrieve arguments from the stack.
2300 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2301 SDValue ArgValue;
2302
2303 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2305 MVT MemVT = VA.getValVT();
2306
2307 switch (VA.getLocInfo()) {
2308 default:
2309 break;
2310 case CCValAssign::BCvt:
2311 MemVT = VA.getLocVT();
2312 break;
2313 case CCValAssign::SExt:
2314 ExtType = ISD::SEXTLOAD;
2315 break;
2316 case CCValAssign::ZExt:
2317 ExtType = ISD::ZEXTLOAD;
2318 break;
2319 case CCValAssign::AExt:
2320 ExtType = ISD::EXTLOAD;
2321 break;
2322 }
2323
2324 ArgValue = DAG.getExtLoad(
2325 ExtType, SL, VA.getLocVT(), Chain, FIN,
2327 return ArgValue;
2328}
2329
2330SDValue SITargetLowering::getPreloadedValue(
2331 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2333 const ArgDescriptor *Reg = nullptr;
2334 const TargetRegisterClass *RC;
2335 LLT Ty;
2336
2338 const ArgDescriptor WorkGroupIDX =
2339 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2340 // If GridZ is not programmed in an entry function then the hardware will set
2341 // it to all zeros, so there is no need to mask the GridY value in the low
2342 // order bits.
2343 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2344 AMDGPU::TTMP7,
2345 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2346 const ArgDescriptor WorkGroupIDZ =
2347 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2348 if (Subtarget->hasArchitectedSGPRs() &&
2351 switch (PVID) {
2353 Reg = &WorkGroupIDX;
2354 RC = &AMDGPU::SReg_32RegClass;
2355 Ty = LLT::scalar(32);
2356 break;
2358 Reg = &WorkGroupIDY;
2359 RC = &AMDGPU::SReg_32RegClass;
2360 Ty = LLT::scalar(32);
2361 break;
2363 Reg = &WorkGroupIDZ;
2364 RC = &AMDGPU::SReg_32RegClass;
2365 Ty = LLT::scalar(32);
2366 break;
2367 default:
2368 break;
2369 }
2370 }
2371
2372 if (!Reg)
2373 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2374 if (!Reg) {
2376 // It's possible for a kernarg intrinsic call to appear in a kernel with
2377 // no allocated segment, in which case we do not add the user sgpr
2378 // argument, so just return null.
2379 return DAG.getConstant(0, SDLoc(), VT);
2380 }
2381
2382 // It's undefined behavior if a function marked with the amdgpu-no-*
2383 // attributes uses the corresponding intrinsic.
2384 return DAG.getPOISON(VT);
2385 }
2386
2387 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2388}
2389
2391 CallingConv::ID CallConv,
2392 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2393 FunctionType *FType,
2394 SIMachineFunctionInfo *Info) {
2395 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2396 const ISD::InputArg *Arg = &Ins[I];
2397
2398 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2399 "vector type argument should have been split");
2400
2401 // First check if it's a PS input addr.
2402 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2403 PSInputNum <= 15) {
2404 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2405
2406 // Inconveniently only the first part of the split is marked as isSplit,
2407 // so skip to the end. We only want to increment PSInputNum once for the
2408 // entire split argument.
2409 if (Arg->Flags.isSplit()) {
2410 while (!Arg->Flags.isSplitEnd()) {
2411 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2412 "unexpected vector split in ps argument type");
2413 if (!SkipArg)
2414 Splits.push_back(*Arg);
2415 Arg = &Ins[++I];
2416 }
2417 }
2418
2419 if (SkipArg) {
2420 // We can safely skip PS inputs.
2421 Skipped.set(Arg->getOrigArgIndex());
2422 ++PSInputNum;
2423 continue;
2424 }
2425
2426 Info->markPSInputAllocated(PSInputNum);
2427 if (Arg->Used)
2428 Info->markPSInputEnabled(PSInputNum);
2429
2430 ++PSInputNum;
2431 }
2432
2433 Splits.push_back(*Arg);
2434 }
2435}
2436
2437// Allocate special inputs passed in VGPRs.
2439 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2440 SIMachineFunctionInfo &Info) const {
2441 const LLT S32 = LLT::scalar(32);
2443
2444 if (Info.hasWorkItemIDX()) {
2445 Register Reg = AMDGPU::VGPR0;
2446 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2447
2448 CCInfo.AllocateReg(Reg);
2449 unsigned Mask =
2450 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2451 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2452 }
2453
2454 if (Info.hasWorkItemIDY()) {
2455 assert(Info.hasWorkItemIDX());
2456 if (Subtarget->hasPackedTID()) {
2457 Info.setWorkItemIDY(
2458 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2459 } else {
2460 unsigned Reg = AMDGPU::VGPR1;
2461 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2462
2463 CCInfo.AllocateReg(Reg);
2464 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2465 }
2466 }
2467
2468 if (Info.hasWorkItemIDZ()) {
2469 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2470 if (Subtarget->hasPackedTID()) {
2471 Info.setWorkItemIDZ(
2472 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2473 } else {
2474 unsigned Reg = AMDGPU::VGPR2;
2475 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2476
2477 CCInfo.AllocateReg(Reg);
2478 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2479 }
2480 }
2481}
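// --- Illustrative sketch (not part of SIISelLowering.cpp) --------------------
// With packed TIDs all three workitem IDs share one VGPR, selected by the
// masks used above (0x3ff, 0x3ff << 10, 0x3ff << 20). Unpacking them is plain
// shift-and-mask arithmetic (WorkItemId/unpackWorkItemId are hypothetical
// names for the sketch):
#include <cstdint>

struct WorkItemId {
  uint32_t X, Y, Z;
};

static WorkItemId unpackWorkItemId(uint32_t PackedTid) {
  return {PackedTid & 0x3ffu,          // bits [9:0]   -> workitem id X
          (PackedTid >> 10) & 0x3ffu,  // bits [19:10] -> workitem id Y
          (PackedTid >> 20) & 0x3ffu}; // bits [29:20] -> workitem id Z
}
// e.g. PackedTid = (5u << 20) | (3u << 10) | 7u unpacks to {7, 3, 5}.
// -----------------------------------------------------------------------------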
2482
2483 // Try to allocate a VGPR at the end of the argument list, or if no argument
2484 // VGPRs are left, allocate a stack slot.
2485 // If \p Mask is given, it indicates the bitfield position in the register.
2486 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2487static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2488 ArgDescriptor Arg = ArgDescriptor()) {
2489 if (Arg.isSet())
2490 return ArgDescriptor::createArg(Arg, Mask);
2491
2492 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2493 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2494 if (RegIdx == ArgVGPRs.size()) {
2495 // Spill to stack required.
2496 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2497
2498 return ArgDescriptor::createStack(Offset, Mask);
2499 }
2500
2501 unsigned Reg = ArgVGPRs[RegIdx];
2502 Reg = CCInfo.AllocateReg(Reg);
2503 assert(Reg != AMDGPU::NoRegister);
2504
2505 MachineFunction &MF = CCInfo.getMachineFunction();
2506 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2507 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2508 return ArgDescriptor::createRegister(Reg, Mask);
2509}
2510
2512 const TargetRegisterClass *RC,
2513 unsigned NumArgRegs) {
2514 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2515 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2516 if (RegIdx == ArgSGPRs.size())
2517 report_fatal_error("ran out of SGPRs for arguments");
2518
2519 unsigned Reg = ArgSGPRs[RegIdx];
2520 Reg = CCInfo.AllocateReg(Reg);
2521 assert(Reg != AMDGPU::NoRegister);
2522
2523 MachineFunction &MF = CCInfo.getMachineFunction();
2524 MF.addLiveIn(Reg, RC);
2526}
2527
2528// If this has a fixed position, we still should allocate the register in the
2529// CCInfo state. Technically we could get away with this for values passed
2530// outside of the normal argument range.
2532 const TargetRegisterClass *RC,
2533 MCRegister Reg) {
2534 Reg = CCInfo.AllocateReg(Reg);
2535 assert(Reg != AMDGPU::NoRegister);
2536 MachineFunction &MF = CCInfo.getMachineFunction();
2537 MF.addLiveIn(Reg, RC);
2538}
2539
2540static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2541 if (Arg) {
2542 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2543 Arg.getRegister());
2544 } else
2545 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2546}
2547
2548static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2549 if (Arg) {
2550 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2551 Arg.getRegister());
2552 } else
2553 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2554}
2555
2556/// Allocate implicit function VGPR arguments at the end of allocated user
2557/// arguments.
2559 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2560 SIMachineFunctionInfo &Info) const {
2561 const unsigned Mask = 0x3ff;
2562 ArgDescriptor Arg;
2563
2564 if (Info.hasWorkItemIDX()) {
2565 Arg = allocateVGPR32Input(CCInfo, Mask);
2566 Info.setWorkItemIDX(Arg);
2567 }
2568
2569 if (Info.hasWorkItemIDY()) {
2570 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2571 Info.setWorkItemIDY(Arg);
2572 }
2573
2574 if (Info.hasWorkItemIDZ())
2575 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2576}
2577
2578/// Allocate implicit function VGPR arguments in fixed registers.
2580 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2581 SIMachineFunctionInfo &Info) const {
2582 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2583 if (!Reg)
2584 report_fatal_error("failed to allocate VGPR for implicit arguments");
2585
2586 const unsigned Mask = 0x3ff;
2587 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2588 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2589 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2590}
2591
2593 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2594 SIMachineFunctionInfo &Info) const {
2595 auto &ArgInfo = Info.getArgInfo();
2596 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2597
2598 // TODO: Unify handling with private memory pointers.
2599 if (UserSGPRInfo.hasDispatchPtr())
2600 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2601
2602 if (UserSGPRInfo.hasQueuePtr())
2603 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2604
2605 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2606 // constant offset from the kernarg segment.
2607 if (Info.hasImplicitArgPtr())
2608 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2609
2610 if (UserSGPRInfo.hasDispatchID())
2611 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2612
2613 // flat_scratch_init is not applicable for non-kernel functions.
2614
2615 if (Info.hasWorkGroupIDX())
2616 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2617
2618 if (Info.hasWorkGroupIDY())
2619 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2620
2621 if (Info.hasWorkGroupIDZ())
2622 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2623
2624 if (Info.hasLDSKernelId())
2625 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2626}
2627
2628// Allocate special inputs passed in user SGPRs.
2630 MachineFunction &MF,
2631 const SIRegisterInfo &TRI,
2632 SIMachineFunctionInfo &Info) const {
2633 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2634 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2635 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2636 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2637 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2638 }
2639
2640 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2641 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2642 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2643 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2644 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2645 }
2646
2647 if (UserSGPRInfo.hasDispatchPtr()) {
2648 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2649 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2650 CCInfo.AllocateReg(DispatchPtrReg);
2651 }
2652
2653 if (UserSGPRInfo.hasQueuePtr()) {
2654 Register QueuePtrReg = Info.addQueuePtr(TRI);
2655 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2656 CCInfo.AllocateReg(QueuePtrReg);
2657 }
2658
2659 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2661 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2662 CCInfo.AllocateReg(InputPtrReg);
2663
2664 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2665 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2666 }
2667
2668 if (UserSGPRInfo.hasDispatchID()) {
2669 Register DispatchIDReg = Info.addDispatchID(TRI);
2670 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2671 CCInfo.AllocateReg(DispatchIDReg);
2672 }
2673
2674 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2675 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2676 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2677 CCInfo.AllocateReg(FlatScratchInitReg);
2678 }
2679
2680 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2681 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2682 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2683 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2684 }
2685
2686 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2687 // these from the dispatch pointer.
2688}
2689
2690 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2691 // sequential, starting from the first argument.
2693 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2695 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2696 Function &F = MF.getFunction();
2697 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2698 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2699 bool InPreloadSequence = true;
2700 unsigned InIdx = 0;
2701 bool AlignedForImplictArgs = false;
2702 unsigned ImplicitArgOffset = 0;
2703 for (auto &Arg : F.args()) {
2704 if (!InPreloadSequence || !Arg.hasInRegAttr())
2705 break;
2706
2707 unsigned ArgIdx = Arg.getArgNo();
2708 // Don't preload non-original args or parts not in the current preload
2709 // sequence.
2710 if (InIdx < Ins.size() &&
2711 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2712 break;
2713
2714 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2715 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2716 InIdx++) {
2717 assert(ArgLocs[ArgIdx].isMemLoc());
2718 auto &ArgLoc = ArgLocs[InIdx];
2719 const Align KernelArgBaseAlign = Align(16);
2720 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2721 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2722 unsigned NumAllocSGPRs =
2723 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2724
2725 // Fix alignment for hidden arguments.
2726 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2727 if (!AlignedForImplictArgs) {
2728 ImplicitArgOffset =
2729 alignTo(LastExplicitArgOffset,
2730 Subtarget->getAlignmentForImplicitArgPtr()) -
2731 LastExplicitArgOffset;
2732 AlignedForImplictArgs = true;
2733 }
2734 ArgOffset += ImplicitArgOffset;
2735 }
2736
2737 // Arg is preloaded into the previous SGPR.
2738 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2739 assert(InIdx >= 1 && "No previous SGPR");
2740 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2741 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2742 continue;
2743 }
2744
2745 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2746 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2747 // Check for free user SGPRs for preloading.
2748 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2749 InPreloadSequence = false;
2750 break;
2751 }
2752
2753 // Preload this argument.
2754 const TargetRegisterClass *RC =
2755 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2756 SmallVectorImpl<MCRegister> *PreloadRegs =
2757 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2758
2759 if (PreloadRegs->size() > 1)
2760 RC = &AMDGPU::SGPR_32RegClass;
2761 for (auto &Reg : *PreloadRegs) {
2762 assert(Reg);
2763 MF.addLiveIn(Reg, RC);
2764 CCInfo.AllocateReg(Reg);
2765 }
2766
2767 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2768 }
2769 }
2770}
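// --- Illustrative sketch (not part of SIISelLowering.cpp) --------------------
// The bookkeeping above reduces to two pieces of arithmetic per argument:
// padding SGPRs needed to reach its offset, and the SGPRs holding its value.
// A self-contained model (PreloadCost/preloadCost are hypothetical names):
#include <cstdint>

struct PreloadCost {
  unsigned PaddingSGPRs;  // dead SGPRs inserted to reach ArgOffset
  unsigned NumAllocSGPRs; // SGPRs occupied by the argument value
};

static PreloadCost preloadCost(unsigned ArgOffset,
                               unsigned LastExplicitArgOffset,
                               unsigned ArgSizeInBits) {
  unsigned Padding = ArgOffset - LastExplicitArgOffset;
  return {(Padding + 3) / 4,          // alignTo(Padding, 4) / 4
          (ArgSizeInBits + 31) / 32}; // alignTo(Bits, 32) / 32
}
// Example: a 64-bit argument at offset 16 following one that ended at offset 8
// costs 2 padding SGPRs plus 2 value SGPRs; preloading stops once that total
// would exceed the remaining free user-SGPR budget, as in the loop above.
// -----------------------------------------------------------------------------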
2771
2773 const SIRegisterInfo &TRI,
2774 SIMachineFunctionInfo &Info) const {
2775 // Always allocate this last since it is a synthetic preload.
2776 if (Info.hasLDSKernelId()) {
2777 Register Reg = Info.addLDSKernelId();
2778 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2779 CCInfo.AllocateReg(Reg);
2780 }
2781}
2782
2783// Allocate special input registers that are initialized per-wave.
2786 CallingConv::ID CallConv,
2787 bool IsShader) const {
2788 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2789 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2790 // Note: user SGPRs are handled by the front-end for graphics shaders
2791 // Pad up the used user SGPRs with dead inputs.
2792
2793 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2794 // before enabling architected SGPRs for workgroup IDs.
2795 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2796
2797 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2798 // Note that we do not count the PrivateSegmentWaveByteOffset. We do not
2799 // want to rely on it to reach 16, since if we end up having no stack usage
2800 // it will not really be added.
2801 unsigned NumRequiredSystemSGPRs =
2802 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2803 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2804 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2805 Register Reg = Info.addReservedUserSGPR();
2806 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2807 CCInfo.AllocateReg(Reg);
2808 }
2809 }
2810
2811 if (!HasArchitectedSGPRs) {
2812 if (Info.hasWorkGroupIDX()) {
2813 Register Reg = Info.addWorkGroupIDX();
2814 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2815 CCInfo.AllocateReg(Reg);
2816 }
2817
2818 if (Info.hasWorkGroupIDY()) {
2819 Register Reg = Info.addWorkGroupIDY();
2820 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2821 CCInfo.AllocateReg(Reg);
2822 }
2823
2824 if (Info.hasWorkGroupIDZ()) {
2825 Register Reg = Info.addWorkGroupIDZ();
2826 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2827 CCInfo.AllocateReg(Reg);
2828 }
2829 }
2830
2831 if (Info.hasWorkGroupInfo()) {
2832 Register Reg = Info.addWorkGroupInfo();
2833 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2834 CCInfo.AllocateReg(Reg);
2835 }
2836
2837 if (Info.hasPrivateSegmentWaveByteOffset()) {
2838 // Scratch wave offset passed in system SGPR.
2839 unsigned PrivateSegmentWaveByteOffsetReg;
2840
2841 if (IsShader) {
2842 PrivateSegmentWaveByteOffsetReg =
2843 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2844
2845 // This is true if the scratch wave byte offset doesn't have a fixed
2846 // location.
2847 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2848 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2849 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2850 }
2851 } else
2852 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2853
2854 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2855 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2856 }
2857
2858 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2859 Info.getNumPreloadedSGPRs() >= 16);
2860}
2861
2863 MachineFunction &MF,
2864 const SIRegisterInfo &TRI,
2865 SIMachineFunctionInfo &Info) {
2866 // Now that we've figured out where the scratch register inputs are, see if
2867 // we should reserve the arguments and use them directly.
2868 MachineFrameInfo &MFI = MF.getFrameInfo();
2869 bool HasStackObjects = MFI.hasStackObjects();
2870 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2871
2872 // Record that we know we have non-spill stack objects so we don't need to
2873 // check all stack objects later.
2874 if (HasStackObjects)
2875 Info.setHasNonSpillStackObjects(true);
2876
2877 // Everything live out of a block is spilled with fast regalloc, so it's
2878 // almost certain that spilling will be required.
2879 if (TM.getOptLevel() == CodeGenOptLevel::None)
2880 HasStackObjects = true;
2881
2882 // For now, assume stack access is needed in any callee function, so we need
2883 // to pass in the scratch registers.
2884 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2885
2886 if (!ST.enableFlatScratch()) {
2887 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2888 // If we have stack objects, we unquestionably need the private buffer
2889 // resource. For the Code Object V2 ABI, this will be the first 4 user
2890 // SGPR inputs. We can reserve those and use them directly.
2891
2892 Register PrivateSegmentBufferReg =
2894 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2895 } else {
2896 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2897 // We tentatively reserve the last registers (skipping those which may
2898 // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
2899 // replace these with the ones immediately after those which were really
2900 // allocated. In the prologue, copies will be inserted from the argument
2901 // to these reserved registers.
2902
2903 // Without HSA, relocations are used for the scratch pointer and the
2904 // buffer resource setup is always inserted in the prologue. Scratch wave
2905 // offset is still in an input SGPR.
2906 Info.setScratchRSrcReg(ReservedBufferReg);
2907 }
2908 }
2909
2911
2912 // For entry functions we have to set up the stack pointer if we use it,
2913 // whereas non-entry functions get this "for free". This means there is no
2914 // intrinsic advantage to using S32 over S34 in cases where we do not have
2915 // calls but do need a frame pointer (i.e. if we are requested to have one
2916 // because frame pointer elimination is disabled). To keep things simple we
2917 // only ever use S32 as the call ABI stack pointer, and so using it does not
2918 // imply we need a separate frame pointer.
2919 //
2920 // Try to use s32 as the SP, but move it if it would interfere with input
2921 // arguments. This won't work with calls though.
2922 //
2923 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2924 // registers.
2925 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2926 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2927 } else {
2929
2930 if (MFI.hasCalls())
2931 report_fatal_error("call in graphics shader with too many input SGPRs");
2932
2933 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2934 if (!MRI.isLiveIn(Reg)) {
2935 Info.setStackPtrOffsetReg(Reg);
2936 break;
2937 }
2938 }
2939
2940 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2941 report_fatal_error("failed to find register for SP");
2942 }
2943
2944 // hasFP should be accurate for entry functions even before the frame is
2945 // finalized, because it does not rely on the known stack size, only
2946 // properties like whether variable sized objects are present.
2947 if (ST.getFrameLowering()->hasFP(MF)) {
2948 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2949 }
2950}
2951
2954 return !Info->isEntryFunction();
2955}
2956
2958
2960 MachineBasicBlock *Entry,
2961 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2963
2964 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2965 if (!IStart)
2966 return;
2967
2968 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2969 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2970 MachineBasicBlock::iterator MBBI = Entry->begin();
2971 for (const MCPhysReg *I = IStart; *I; ++I) {
2972 const TargetRegisterClass *RC = nullptr;
2973 if (AMDGPU::SReg_64RegClass.contains(*I))
2974 RC = &AMDGPU::SGPR_64RegClass;
2975 else if (AMDGPU::SReg_32RegClass.contains(*I))
2976 RC = &AMDGPU::SGPR_32RegClass;
2977 else
2978 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2979
2980 Register NewVR = MRI->createVirtualRegister(RC);
2981 // Create copy from CSR to a virtual register.
2982 Entry->addLiveIn(*I);
2983 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2984 .addReg(*I);
2985
2986 // Insert the copy-back instructions right before the terminator.
2987 for (auto *Exit : Exits)
2988 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2989 TII->get(TargetOpcode::COPY), *I)
2990 .addReg(NewVR);
2991 }
2992}
2993
2995 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2996 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2997 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2999
3001 const Function &Fn = MF.getFunction();
3004 bool IsError = false;
3005
3006 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3008 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3009 IsError = true;
3010 }
3011
3014 BitVector Skipped(Ins.size());
3015 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3016 *DAG.getContext());
3017
3018 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3019 bool IsKernel = AMDGPU::isKernel(CallConv);
3020 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3021
3022 if (IsGraphics) {
3023 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3024 assert(!UserSGPRInfo.hasDispatchPtr() &&
3025 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3026 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3027 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3028 (void)UserSGPRInfo;
3029 if (!Subtarget->enableFlatScratch())
3030 assert(!UserSGPRInfo.hasFlatScratchInit());
3031 if ((CallConv != CallingConv::AMDGPU_CS &&
3032 CallConv != CallingConv::AMDGPU_Gfx &&
3033 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3034 !Subtarget->hasArchitectedSGPRs())
3035 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3036 !Info->hasWorkGroupIDZ());
3037 }
3038
3039 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3040
3041 if (CallConv == CallingConv::AMDGPU_PS) {
3042 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3043
3044 // At least one interpolation mode must be enabled or else the GPU will
3045 // hang.
3046 //
3047 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3048 // set PSInputAddr, the user wants to enable some bits after compilation
3049 // based on run-time states. Since we can't know what the final PSInputEna
3050 // will look like, we shouldn't do anything here and the user should take
3051 // responsibility for the correct programming.
3052 //
3053 // Otherwise, the following restrictions apply:
3054 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3055 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3056 // enabled too.
3057 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3058 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3059 CCInfo.AllocateReg(AMDGPU::VGPR0);
3060 CCInfo.AllocateReg(AMDGPU::VGPR1);
3061 Info->markPSInputAllocated(0);
3062 Info->markPSInputEnabled(0);
3063 }
3064 if (Subtarget->isAmdPalOS()) {
3065 // For isAmdPalOS, the user does not enable some bits after compilation
3066 // based on run-time states; the register values being generated here are
3067 // the final ones set in hardware. Therefore we need to apply the
3068 // workaround to PSInputAddr and PSInputEnable together. (The case where
3069 // a bit is set in PSInputAddr but not PSInputEnable is where the
3070 // frontend set up an input arg for a particular interpolation mode, but
3071 // nothing uses that input arg. Really we should have an earlier pass
3072 // that removes such an arg.)
3073 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3074 if ((PsInputBits & 0x7F) == 0 ||
3075 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3076 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3077 }
3078 } else if (IsKernel) {
3079 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3080 } else {
3081 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3082 Ins.end());
3083 }
3084
3085 if (IsKernel)
3086 analyzeFormalArgumentsCompute(CCInfo, Ins);
3087
3088 if (IsEntryFunc) {
3089 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3090 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3091 if (IsKernel && Subtarget->hasKernargPreload())
3092 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3093
3094 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3095 } else if (!IsGraphics) {
3096 // For the fixed ABI, pass workitem IDs in the last argument register.
3097 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3098
3099 // FIXME: Sink this into allocateSpecialInputSGPRs
3100 if (!Subtarget->enableFlatScratch())
3101 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3102
3103 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3104 }
3105
3106 if (!IsKernel) {
3107 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3108 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3109
3110 // This assumes the registers are allocated by CCInfo in ascending order
3111 // with no gaps.
3112 Info->setNumWaveDispatchSGPRs(
3113 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3114 Info->setNumWaveDispatchVGPRs(
3115 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3116 } else if (Info->getNumKernargPreloadedSGPRs()) {
3117 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3118 }
3119
3121
3122 if (IsWholeWaveFunc) {
3124 {MVT::i1, MVT::Other}, Chain);
3125 InVals.push_back(Setup.getValue(0));
3126 Chains.push_back(Setup.getValue(1));
3127 }
3128
3129 // FIXME: This is the minimum kernel argument alignment. We should improve
3130 // this to the maximum alignment of the arguments.
3131 //
3132 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3133 // kern arg offset.
3134 const Align KernelArgBaseAlign = Align(16);
3135
3136 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3137 ++i) {
3138 const ISD::InputArg &Arg = Ins[i];
3139 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3140 InVals.push_back(DAG.getPOISON(Arg.VT));
3141 continue;
3142 }
3143
3144 CCValAssign &VA = ArgLocs[ArgIdx++];
3145 MVT VT = VA.getLocVT();
3146
3147 if (IsEntryFunc && VA.isMemLoc()) {
3148 VT = Ins[i].VT;
3149 EVT MemVT = VA.getLocVT();
3150
3151 const uint64_t Offset = VA.getLocMemOffset();
3152 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3153
3154 if (Arg.Flags.isByRef()) {
3155 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3156
3157 const GCNTargetMachine &TM =
3158 static_cast<const GCNTargetMachine &>(getTargetMachine());
3159 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3160 Arg.Flags.getPointerAddrSpace())) {
3163 }
3164
3165 InVals.push_back(Ptr);
3166 continue;
3167 }
3168
3169 SDValue NewArg;
3170 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3171 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3172 // In this case the argument is packed into the previous preload SGPR.
3173 int64_t AlignDownOffset = alignDown(Offset, 4);
3174 int64_t OffsetDiff = Offset - AlignDownOffset;
3175 EVT IntVT = MemVT.changeTypeToInteger();
3176
3180 Register Reg =
3181 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3182
3183 assert(Reg);
3184 Register VReg = MRI.getLiveInVirtReg(Reg);
3185 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3186
3187 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3188 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3189
3190 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3191 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3192 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3193 Ins[i].Flags.isSExt(), &Ins[i]);
3194
3195 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3196 } else {
3200 const SmallVectorImpl<MCRegister> &PreloadRegs =
3201 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3202
3203 SDValue Copy;
3204 if (PreloadRegs.size() == 1) {
3205 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3206 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3207 NewArg = DAG.getCopyFromReg(
3208 Chain, DL, VReg,
3210 TRI->getRegSizeInBits(*RC)));
3211
3212 } else {
3213 // If the kernarg alignment does not match the alignment of the SGPR
3214 // tuple RC that can accommodate this argument, it will be built up
3215 // via copies from the individual SGPRs that the argument was
3216 // preloaded to.
3218 for (auto Reg : PreloadRegs) {
3219 Register VReg = MRI.getLiveInVirtReg(Reg);
3220 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3221 Elts.push_back(Copy);
3222 }
3223 NewArg =
3224 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3225 PreloadRegs.size()),
3226 DL, Elts);
3227 }
3228
3229 // If the argument was preloaded to multiple consecutive 32-bit
3230 // registers because of misalignment between addressable SGPR tuples
3231 // and the argument size, we can still assume that, because of kernarg
3232 // segment alignment restrictions, NewArg's size is the same as MemVT,
3233 // and just do a bitcast. If MemVT is less than 32 bits we add a
3234 // truncate since we cannot preload to less than a single SGPR and the
3235 // MemVT may be smaller.
3236 EVT MemVTInt =
3238 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3239 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3240
3241 NewArg = DAG.getBitcast(MemVT, NewArg);
3242 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3243 Ins[i].Flags.isSExt(), &Ins[i]);
3244 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3245 }
3246 } else {
3247 // Hidden arguments that are in the kernel signature must be preloaded
3248 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3249 // the argument list and is not preloaded.
3250 if (Arg.isOrigArg()) {
3251 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3252 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3254 *OrigArg->getParent(),
3255 "hidden argument in kernel signature was not preloaded",
3256 DL.getDebugLoc()));
3257 }
3258 }
3259
3260 NewArg =
3261 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3262 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3263 }
3264 Chains.push_back(NewArg.getValue(1));
3265
3266 auto *ParamTy =
3267 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3269 ParamTy &&
3270 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3271 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3272 // On SI local pointers are just offsets into LDS, so they are always
3273 // less than 16-bits. On CI and newer they could potentially be
3274 // real pointers, so we can't guarantee their size.
3275 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3276 DAG.getValueType(MVT::i16));
3277 }
3278
3279 InVals.push_back(NewArg);
3280 continue;
3281 }
3282 if (!IsEntryFunc && VA.isMemLoc()) {
3283 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3284 InVals.push_back(Val);
3285 if (!Arg.Flags.isByVal())
3286 Chains.push_back(Val.getValue(1));
3287 continue;
3288 }
3289
3290 assert(VA.isRegLoc() && "Parameter must be in a register!");
3291
3292 Register Reg = VA.getLocReg();
3293 const TargetRegisterClass *RC = nullptr;
3294 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3295 RC = &AMDGPU::VGPR_32RegClass;
3296 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3297 RC = &AMDGPU::SGPR_32RegClass;
3298 else
3299 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3300 EVT ValVT = VA.getValVT();
3301
3302 Reg = MF.addLiveIn(Reg, RC);
3303 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3304
3305 if (Arg.Flags.isSRet()) {
3306 // The return object should be reasonably addressable.
3307
3308 // FIXME: This helps when the return is a real sret. If it is an
3309 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3310 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3311 unsigned NumBits =
3313 Val = DAG.getNode(
3314 ISD::AssertZext, DL, VT, Val,
3315 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3316 }
3317
3318 // If this is an 8- or 16-bit value, it is really passed promoted
3319 // to 32 bits. Insert an assert[sz]ext to capture this, then
3320 // truncate to the right size.
3321 switch (VA.getLocInfo()) {
3322 case CCValAssign::Full:
3323 break;
3324 case CCValAssign::BCvt:
3325 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3326 break;
3327 case CCValAssign::SExt:
3328 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3329 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3330 break;
3331 case CCValAssign::ZExt:
3332 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3333 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3334 break;
3335 case CCValAssign::AExt:
3336 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3337 break;
3338 default:
3339 llvm_unreachable("Unknown loc info!");
3340 }
3341
3342 InVals.push_back(Val);
3343 }
3344
3345 // Start adding system SGPRs.
3346 if (IsEntryFunc)
3347 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3348
3349 // DAG.getPass() returns nullptr when using new pass manager.
3350 // TODO: Use DAG.getMFAM() to access analysis result.
3351 if (DAG.getPass()) {
3352 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3353 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3354 }
3355
3356 unsigned StackArgSize = CCInfo.getStackSize();
3357 Info->setBytesInStackArgArea(StackArgSize);
3358
3359 return Chains.empty() ? Chain
3360 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3361}
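// --- Illustrative sketch (not part of SIISelLowering.cpp) --------------------
// The PS-input workaround in LowerFormalArguments keys off a few bit ranges of
// PSInputAddr: PERSP_* in bits [3:0] (0xF), LINEAR_* in bits [6:4] (0x70), and
// POS_W_FLOAT at input 11. A standalone predicate mirroring that check
// (needsDummyInterpInput is a hypothetical name):
#include <cstdint>

static bool needsDummyInterpInput(uint32_t PSInputAddr, bool PSInput11Allocated) {
  bool NoPerspOrLinear = (PSInputAddr & 0x7F) == 0;
  bool PosWWithoutPersp = (PSInputAddr & 0xF) == 0 && PSInput11Allocated;
  return NoPerspOrLinear || PosWWithoutPersp;
}
// When this holds, the lowering above force-enables input 0 and reserves
// VGPR0/VGPR1 so the hardware always has at least one interpolation mode.
// -----------------------------------------------------------------------------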
3362
3363// TODO: If return values can't fit in registers, we should return as many as
3364 // possible in registers before passing the rest on the stack.
3366 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3367 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3368 const Type *RetTy) const {
3369 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3370 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3371 // for shaders. Vector types should be explicitly handled by CC.
3372 if (AMDGPU::isEntryFunctionCC(CallConv))
3373 return true;
3374
3376 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3377 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3378 return false;
3379
3380 // We must use the stack if return would require unavailable registers.
3381 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3382 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3383 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3384 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3385 return false;
3386
3387 return true;
3388}
3389
3390SDValue
3392 bool isVarArg,
3394 const SmallVectorImpl<SDValue> &OutVals,
3395 const SDLoc &DL, SelectionDAG &DAG) const {
3399
3400 if (AMDGPU::isKernel(CallConv)) {
3401 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3402 OutVals, DL, DAG);
3403 }
3404
3405 bool IsShader = AMDGPU::isShader(CallConv);
3406
3407 Info->setIfReturnsVoid(Outs.empty());
3408 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3409
3410 // CCValAssign - represents the assignment of the return value to a location.
3412
3413 // CCState - Info about the registers and stack slots.
3414 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3415 *DAG.getContext());
3416
3417 // Analyze outgoing return values.
3418 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3419
3420 SDValue Glue;
3422 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3423
3424 SDValue ReadFirstLane =
3425 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3426 // Copy the result values into the output registers.
3427 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3428 ++I, ++RealRVLocIdx) {
3429 CCValAssign &VA = RVLocs[I];
3430 assert(VA.isRegLoc() && "Can only return in registers!");
3431 // TODO: Partially return in registers if return values don't fit.
3432 SDValue Arg = OutVals[RealRVLocIdx];
3433
3434 // Copied from other backends.
3435 switch (VA.getLocInfo()) {
3436 case CCValAssign::Full:
3437 break;
3438 case CCValAssign::BCvt:
3439 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3440 break;
3441 case CCValAssign::SExt:
3442 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3443 break;
3444 case CCValAssign::ZExt:
3445 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3446 break;
3447 case CCValAssign::AExt:
3448 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3449 break;
3450 default:
3451 llvm_unreachable("Unknown loc info!");
3452 }
3453 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3455 ReadFirstLane, Arg);
3456 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3457 Glue = Chain.getValue(1);
3458 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3459 }
3460
3461 // FIXME: Does sret work properly?
3462 if (!Info->isEntryFunction()) {
3463 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3464 const MCPhysReg *I =
3465 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3466 if (I) {
3467 for (; *I; ++I) {
3468 if (AMDGPU::SReg_64RegClass.contains(*I))
3469 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3470 else if (AMDGPU::SReg_32RegClass.contains(*I))
3471 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3472 else
3473 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3474 }
3475 }
3476 }
3477
3478 // Update chain and glue.
3479 RetOps[0] = Chain;
3480 if (Glue.getNode())
3481 RetOps.push_back(Glue);
3482
3483 unsigned Opc = AMDGPUISD::ENDPGM;
3484 if (!IsWaveEnd)
3485 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3486 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3488 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3489}
3490
3492 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3493 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3494 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3495 SDValue ThisVal) const {
3496 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3497
3498 // Assign locations to each value returned by this call.
3500 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3501 *DAG.getContext());
3502 CCInfo.AnalyzeCallResult(Ins, RetCC);
3503
3504 // Copy all of the result registers out of their specified physreg.
3505 for (CCValAssign VA : RVLocs) {
3506 SDValue Val;
3507
3508 if (VA.isRegLoc()) {
3509 Val =
3510 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3511 Chain = Val.getValue(1);
3512 InGlue = Val.getValue(2);
3513 } else if (VA.isMemLoc()) {
3514 report_fatal_error("TODO: return values in memory");
3515 } else
3516 llvm_unreachable("unknown argument location type");
3517
3518 switch (VA.getLocInfo()) {
3519 case CCValAssign::Full:
3520 break;
3521 case CCValAssign::BCvt:
3522 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3523 break;
3524 case CCValAssign::ZExt:
3525 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3526 DAG.getValueType(VA.getValVT()));
3527 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3528 break;
3529 case CCValAssign::SExt:
3530 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3531 DAG.getValueType(VA.getValVT()));
3532 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3533 break;
3534 case CCValAssign::AExt:
3535 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3536 break;
3537 default:
3538 llvm_unreachable("Unknown loc info!");
3539 }
3540
3541 InVals.push_back(Val);
3542 }
3543
3544 return Chain;
3545}
3546
3547 // Add code to pass the special inputs required by the features in use,
3548 // separate from the explicit user arguments present in the IR.
3550 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3551 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3552 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3553 // If we don't have a call site, this was a call inserted by
3554 // legalization. These can never use special inputs.
3555 if (!CLI.CB)
3556 return;
3557
3558 SelectionDAG &DAG = CLI.DAG;
3559 const SDLoc &DL = CLI.DL;
3560 const Function &F = DAG.getMachineFunction().getFunction();
3561
3562 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3563 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3564
3565 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3567 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3568 // DAG.getPass() returns nullptr when using new pass manager.
3569 // TODO: Use DAG.getMFAM() to access analysis result.
3570 if (DAG.getPass()) {
3571 auto &ArgUsageInfo =
3573 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3574 }
3575 }
3576
3577 // TODO: Unify with private memory register handling. This is complicated by
3578 // the fact that at least in kernels, the input argument is not necessarily
3579 // in the same location as the input.
3580 // clang-format off
3581 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3583 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3584 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3585 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3586 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3587 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3588 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3589 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3590 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3591 };
3592 // clang-format on
3593
3594 for (auto [InputID, Attr] : ImplicitAttrs) {
3595 // If the callee does not use the attribute value, skip copying the value.
3596 if (CLI.CB->hasFnAttr(Attr))
3597 continue;
3598
3599 const auto [OutgoingArg, ArgRC, ArgTy] =
3600 CalleeArgInfo->getPreloadedValue(InputID);
3601 if (!OutgoingArg)
3602 continue;
3603
3604 const auto [IncomingArg, IncomingArgRC, Ty] =
3605 CallerArgInfo.getPreloadedValue(InputID);
3606 assert(IncomingArgRC == ArgRC);
3607
3608 // All special arguments are ints for now.
3609 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3610 SDValue InputReg;
3611
3612 if (IncomingArg) {
3613 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3614 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3615 // The implicit arg ptr is special because it doesn't have a corresponding
3616 // input for kernels, and is computed from the kernarg segment pointer.
3617 InputReg = getImplicitArgPtr(DAG, DL);
3618 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3619 std::optional<uint32_t> Id =
3621 if (Id.has_value()) {
3622 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3623 } else {
3624 InputReg = DAG.getPOISON(ArgVT);
3625 }
3626 } else {
3627 // We may have proven the input wasn't needed, even though the ABI still
3628 // requires it. We just need to allocate the register appropriately.
3629 InputReg = DAG.getPOISON(ArgVT);
3630 }
3631
3632 if (OutgoingArg->isRegister()) {
3633 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3634 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3635 report_fatal_error("failed to allocate implicit input argument");
3636 } else {
3637 unsigned SpecialArgOffset =
3638 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3639 SDValue ArgStore =
3640 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3641 MemOpChains.push_back(ArgStore);
3642 }
3643 }
3644
3645 // Pack the workitem IDs into a single register, or pass them as-is if
3646 // already packed.
3647
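// As an illustration (a sketch, not additional lowering): with each ID at
// most 10 bits wide, the packed value built below is laid out as
//
//   packed = X | (Y << 10) | (Z << 20);  // bits [9:0]=X, [19:10]=Y, [29:20]=Z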
3648 auto [OutgoingArg, ArgRC, Ty] =
3650 if (!OutgoingArg)
3651 std::tie(OutgoingArg, ArgRC, Ty) =
3653 if (!OutgoingArg)
3654 std::tie(OutgoingArg, ArgRC, Ty) =
3656 if (!OutgoingArg)
3657 return;
3658
3659 const ArgDescriptor *IncomingArgX = std::get<0>(
3661 const ArgDescriptor *IncomingArgY = std::get<0>(
3663 const ArgDescriptor *IncomingArgZ = std::get<0>(
3665
3666 SDValue InputReg;
3667 SDLoc SL;
3668
3669 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3670 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3671 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3672
3673 // If the incoming IDs are not packed, we need to pack them.
3674 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3675 NeedWorkItemIDX) {
3676 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3677 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3678 } else {
3679 InputReg = DAG.getConstant(0, DL, MVT::i32);
3680 }
3681 }
3682
3683 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3684 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3685 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3686 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3687 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3688 InputReg = InputReg.getNode()
3689 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3690 : Y;
3691 }
3692
3693 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3694 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3695 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3696 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3697 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3698 InputReg = InputReg.getNode()
3699 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3700 : Z;
3701 }
3702
3703 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3704 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3705 // We're in a situation where the outgoing function requires the workitem
3706 // ID, but the calling function does not have it (e.g. a graphics function
3707 // calling a C calling convention function). This is illegal, but we need
3708 // to produce something.
3709 InputReg = DAG.getPOISON(MVT::i32);
3710 } else {
3711 // Workitem IDs are already packed; any of the present incoming arguments
3712 // will carry all the required fields.
3713 ArgDescriptor IncomingArg =
3714 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3715 : IncomingArgY ? *IncomingArgY
3716 : *IncomingArgZ,
3717 ~0u);
3718 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3719 }
3720 }
3721
3722 if (OutgoingArg->isRegister()) {
3723 if (InputReg)
3724 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3725
3726 CCInfo.AllocateReg(OutgoingArg->getRegister());
3727 } else {
3728 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3729 if (InputReg) {
3730 SDValue ArgStore =
3731 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3732 MemOpChains.push_back(ArgStore);
3733 }
3734 }
3735}
3736
3738 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3740 const SmallVectorImpl<SDValue> &OutVals,
3741 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3742 if (AMDGPU::isChainCC(CalleeCC))
3743 return true;
3744
3745 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3746 return false;
3747
3748 // For a divergent call target, we need to do a waterfall loop over the
3749 // possible callees, which precludes us from using a simple jump.
3750 if (Callee->isDivergent())
3751 return false;
3752
3754 const Function &CallerF = MF.getFunction();
3755 CallingConv::ID CallerCC = CallerF.getCallingConv();
3757 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3758
3759 // Kernels aren't callable and don't have a live-in return address, so it
3760 // doesn't make sense to do a tail call with entry functions.
3761 if (!CallerPreserved)
3762 return false;
3763
3764 bool CCMatch = CallerCC == CalleeCC;
3765
3767 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3768 return true;
3769 return false;
3770 }
3771
3772 // TODO: Can we handle var args?
3773 if (IsVarArg)
3774 return false;
3775
3776 for (const Argument &Arg : CallerF.args()) {
3777 if (Arg.hasByValAttr())
3778 return false;
3779 }
3780
3781 LLVMContext &Ctx = *DAG.getContext();
3782
3783 // Check that the call results are passed in the same way.
3784 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3785 CCAssignFnForCall(CalleeCC, IsVarArg),
3786 CCAssignFnForCall(CallerCC, IsVarArg)))
3787 return false;
3788
3789 // The callee has to preserve all registers the caller needs to preserve.
3790 if (!CCMatch) {
3791 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3792 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3793 return false;
3794 }
3795
3796 // Nothing more to check if the callee is taking no arguments.
3797 if (Outs.empty())
3798 return true;
3799
3801 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3802
3803 // FIXME: We are not allocating special input registers, so we will be
3804 // deciding based on incorrect register assignments.
3805 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3806
3807 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3808 // If the stack arguments for this call do not fit into our own save area,
3809 // then the call cannot be made a tail call.
3810 // TODO: Is this really necessary?
3811 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3812 return false;
3813
3814 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3815 // FIXME: What about inreg arguments that end up passed in memory?
3816 if (!CCVA.isRegLoc())
3817 continue;
3818
3819 // If we are passing an argument in an SGPR, and the value is divergent,
3820 // this call requires a waterfall loop.
3821 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3822 LLVM_DEBUG(
3823 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3824 << printReg(CCVA.getLocReg(), TRI) << '\n');
3825 return false;
3826 }
3827 }
3828
3829 const MachineRegisterInfo &MRI = MF.getRegInfo();
3830 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3831}
3832
3834 if (!CI->isTailCall())
3835 return false;
3836
3837 const Function *ParentFn = CI->getParent()->getParent();
3839 return false;
3840 return true;
3841}
3842
3843namespace {
3844// Chain calls have special arguments that we need to handle. These are
3845// appended at the end of the argument list(s), after the SGPR and VGPR
3846// arguments (indices 0 and 1 respectively).
3847enum ChainCallArgIdx {
3848 Exec = 2,
3849 Flags,
3850 NumVGPRs,
3851 FallbackExec,
3852 FallbackCallee
3853};
3854} // anonymous namespace
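// Illustrative layout of CLI.Args for a chain call, matching the indices
// above (a sketch; the aggregates of "real" SGPR and VGPR arguments occupy
// indices 0 and 1):
//
//   Args[0] = SGPR arguments     Args[3] = Flags
//   Args[1] = VGPR arguments     Args[4] = NumVGPRs       (only if flags bit 0 is set)
//   Args[2] = EXEC mask          Args[5] = FallbackExec   (only if flags bit 0 is set)
//                                Args[6] = FallbackCallee (only if flags bit 0 is set)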
3855
3856// The wave scratch offset register is used as the global base pointer.
3858 SmallVectorImpl<SDValue> &InVals) const {
3859 CallingConv::ID CallConv = CLI.CallConv;
3860 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3861
3862 SelectionDAG &DAG = CLI.DAG;
3863
3864 const SDLoc &DL = CLI.DL;
3865 SDValue Chain = CLI.Chain;
3866 SDValue Callee = CLI.Callee;
3867
3868 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3869 bool UsesDynamicVGPRs = false;
3870 if (IsChainCallConv) {
3871 // The last arguments should be the value that we need to put in EXEC,
3872 // followed by the flags and any other arguments with special meanings.
3873 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3874 // we don't treat them like the "real" arguments.
3875 auto RequestedExecIt =
3876 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3877 return Arg.OrigArgIndex == 2;
3878 });
3879 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3880
3881 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3882 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3883 CLI.OutVals.end());
3884 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3885
3886 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3887 "Haven't popped all the special args");
3888
3889 TargetLowering::ArgListEntry RequestedExecArg =
3890 CLI.Args[ChainCallArgIdx::Exec];
3891 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3892 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3893
3894 // Convert constants into TargetConstants, so they become immediate operands
3895 // instead of being selected into S_MOV.
3896 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3897 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3898 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3899 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3900 } else
3901 ChainCallSpecialArgs.push_back(Arg.Node);
3902 };
3903
3904 PushNodeOrTargetConstant(RequestedExecArg);
3905
3906 // Process any other special arguments depending on the value of the flags.
3907 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3908
3909 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3910 if (FlagsValue.isZero()) {
3911 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3912 return lowerUnhandledCall(CLI, InVals,
3913 "no additional args allowed if flags == 0");
3914 } else if (FlagsValue.isOneBitSet(0)) {
3915 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3916 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3917 }
3918
3919 if (!Subtarget->isWave32()) {
3920 return lowerUnhandledCall(
3921 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3922 }
3923
3924 UsesDynamicVGPRs = true;
3925 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3926 CLI.Args.end(), PushNodeOrTargetConstant);
3927 }
3928 }
3929
3931 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3933 bool &IsTailCall = CLI.IsTailCall;
3934 bool IsVarArg = CLI.IsVarArg;
3935 bool IsSibCall = false;
3937
3938 if (Callee.isUndef() || isNullConstant(Callee)) {
3939 if (!CLI.IsTailCall) {
3940 for (ISD::InputArg &Arg : CLI.Ins)
3941 InVals.push_back(DAG.getPOISON(Arg.VT));
3942 }
3943
3944 return Chain;
3945 }
3946
3947 if (IsVarArg) {
3948 return lowerUnhandledCall(CLI, InVals,
3949 "unsupported call to variadic function ");
3950 }
3951
3952 if (!CLI.CB)
3953 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
3954
3955 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3956 return lowerUnhandledCall(CLI, InVals,
3957 "unsupported required tail call to function ");
3958 }
3959
3960 if (IsTailCall) {
3961 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3962 Outs, OutVals, Ins, DAG);
3963 if (!IsTailCall &&
3964 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3965 report_fatal_error("failed to perform tail call elimination on a call "
3966 "site marked musttail or on llvm.amdgcn.cs.chain");
3967 }
3968
3969 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3970
3971 // A sibling call is one where we're under the usual C ABI and not planning
3972 // to change that but can still do a tail call:
3973 if (!TailCallOpt && IsTailCall)
3974 IsSibCall = true;
3975
3976 if (IsTailCall)
3977 ++NumTailCalls;
3978 }
3979
3982 SmallVector<SDValue, 8> MemOpChains;
3983
3984 // Analyze operands of the call, assigning locations to each operand.
3986 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3987 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3988
3989 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
3991 // With a fixed ABI, allocate fixed registers before user arguments.
3992 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3993 }
3994
3995 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3996
3997 // Get a count of how many bytes are to be pushed on the stack.
3998 unsigned NumBytes = CCInfo.getStackSize();
3999
4000 if (IsSibCall) {
4001 // Since we're not changing the ABI to make this a tail call, the memory
4002 // operands are already available in the caller's incoming argument space.
4003 NumBytes = 0;
4004 }
4005
4006 // FPDiff is the byte offset of the call's argument area from the callee's.
4007 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4008 // by this amount for a tail call. In a sibling call it must be 0 because the
4009 // caller will deallocate the entire stack and the callee still expects its
4010 // arguments to begin at SP+0. Completely unused for non-tail calls.
4011 int32_t FPDiff = 0;
4012 MachineFrameInfo &MFI = MF.getFrameInfo();
4013 auto *TRI = Subtarget->getRegisterInfo();
4014
4015 // Adjust the stack pointer for the new arguments...
4016 // These operations are automatically eliminated by the prolog/epilog pass
4017 if (!IsSibCall)
4018 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4019
4020 if (!IsSibCall || IsChainCallConv) {
4021 if (!Subtarget->enableFlatScratch()) {
4022 SmallVector<SDValue, 4> CopyFromChains;
4023
4024 // In the HSA case, this should be an identity copy.
4025 SDValue ScratchRSrcReg =
4026 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4027 RegsToPass.emplace_back(IsChainCallConv
4028 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4029 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4030 ScratchRSrcReg);
4031 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4032 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4033 }
4034 }
4035
4036 const unsigned NumSpecialInputs = RegsToPass.size();
4037
4038 MVT PtrVT = MVT::i32;
4039
4040 // Walk the register/memloc assignments, inserting copies/loads.
4041 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4042 CCValAssign &VA = ArgLocs[i];
4043 SDValue Arg = OutVals[i];
4044
4045 // Promote the value if needed.
4046 switch (VA.getLocInfo()) {
4047 case CCValAssign::Full:
4048 break;
4049 case CCValAssign::BCvt:
4050 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4051 break;
4052 case CCValAssign::ZExt:
4053 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4054 break;
4055 case CCValAssign::SExt:
4056 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4057 break;
4058 case CCValAssign::AExt:
4059 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4060 break;
4061 case CCValAssign::FPExt:
4062 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4063 break;
4064 default:
4065 llvm_unreachable("Unknown loc info!");
4066 }
4067
4068 if (VA.isRegLoc()) {
4069 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4070 } else {
4071 assert(VA.isMemLoc());
4072
4073 SDValue DstAddr;
4074 MachinePointerInfo DstInfo;
4075
4076 unsigned LocMemOffset = VA.getLocMemOffset();
4077 int32_t Offset = LocMemOffset;
4078
4079 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4080 MaybeAlign Alignment;
4081
4082 if (IsTailCall) {
4083 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4084 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4085 : VA.getValVT().getStoreSize();
4086
4087 // FIXME: We could do better than the minimum required byval alignment.
4088 Alignment =
4089 Flags.isByVal()
4090 ? Flags.getNonZeroByValAlign()
4091 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4092
4093 Offset = Offset + FPDiff;
4094 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4095
4096 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4097 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4098
4099 // Make sure any stack arguments overlapping with where we're storing
4100 // are loaded before this eventual operation. Otherwise they'll be
4101 // clobbered.
4102
4103 // FIXME: Why is this really necessary? This seems to just result in a
4104 // lot of code to copy the stack values and write them back to the same
4105 // locations, which are supposed to be immutable?
4106 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4107 } else {
4108 // Stores to the argument stack area are relative to the stack pointer.
4109 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4110 MVT::i32);
4111 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4112 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4113 Alignment =
4114 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4115 }
4116
4117 if (Outs[i].Flags.isByVal()) {
4118 SDValue SizeNode =
4119 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4120 SDValue Cpy =
4121 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4122 Outs[i].Flags.getNonZeroByValAlign(),
4123 /*isVol = */ false, /*AlwaysInline = */ true,
4124 /*CI=*/nullptr, std::nullopt, DstInfo,
4126
4127 MemOpChains.push_back(Cpy);
4128 } else {
4129 SDValue Store =
4130 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4131 MemOpChains.push_back(Store);
4132 }
4133 }
4134 }
4135
4136 if (!MemOpChains.empty())
4137 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4138
4139 SDValue ReadFirstLaneID =
4140 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4141
4142 SDValue TokenGlue;
4143 if (CLI.ConvergenceControlToken) {
4144 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4146 }
4147
4148 // Build a sequence of copy-to-reg nodes chained together with token chain
4149 // and flag operands which copy the outgoing args into the appropriate regs.
4150 SDValue InGlue;
4151
4152 unsigned ArgIdx = 0;
4153 for (auto [Reg, Val] : RegsToPass) {
4154 if (ArgIdx++ >= NumSpecialInputs &&
4155 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4156 // For chain calls, the inreg arguments are required to be
4157 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4158 // they are uniform.
4159 //
4160 // For other calls, if an inreg argument is known to be uniform,
4161 // speculatively insert a readfirstlane in case it is in a VGPR.
4162 //
4163 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4164 // value, so let that continue to produce invalid code.
4165
4166 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4167 if (TokenGlue)
4168 ReadfirstlaneArgs.push_back(TokenGlue);
4170 ReadfirstlaneArgs);
4171 }
4172
4173 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4174 InGlue = Chain.getValue(1);
4175 }
4176
4177 // We don't usually want to end the call-sequence here because we would tidy
4178 // the frame up *after* the call. However, in the ABI-changing tail-call case
4179 // we've carefully laid out the parameters so that when sp is reset they'll be
4180 // in the correct location.
4181 if (IsTailCall && !IsSibCall) {
4182 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4183 InGlue = Chain.getValue(1);
4184 }
4185
4186 std::vector<SDValue> Ops({Chain});
4187
4188 // Add a redundant copy of the callee global which will not be legalized, as
4189 // we need direct access to the callee later.
4190 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4191 const GlobalValue *GV = GSD->getGlobal();
4192 Ops.push_back(Callee);
4193 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4194 } else {
4195 if (IsTailCall) {
4196 // isEligibleForTailCallOptimization considered whether the call target is
4197 // divergent, but we may still end up with a uniform value in a VGPR.
4198 // Insert a readfirstlane just in case.
4199 SDValue ReadFirstLaneID =
4200 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4201
4202 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4203 if (TokenGlue)
4204 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4205 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4206 ReadfirstlaneArgs);
4207 }
4208
4209 Ops.push_back(Callee);
4210 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4211 }
4212
4213 if (IsTailCall) {
4214 // Each tail call may have to adjust the stack by a different amount, so
4215 // this information must travel along with the operation for eventual
4216 // consumption by emitEpilogue.
4217 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4218 }
4219
4220 if (IsChainCallConv)
4221 llvm::append_range(Ops, ChainCallSpecialArgs);
4222
4223 // Add argument registers to the end of the list so that they are known live
4224 // into the call.
4225 for (auto &[Reg, Val] : RegsToPass)
4226 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4227
4228 // Add a register mask operand representing the call-preserved registers.
4229 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4230 assert(Mask && "Missing call preserved mask for calling convention");
4231 Ops.push_back(DAG.getRegisterMask(Mask));
4232
4233 if (SDValue Token = CLI.ConvergenceControlToken) {
4235 GlueOps.push_back(Token);
4236 if (InGlue)
4237 GlueOps.push_back(InGlue);
4238
4239 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4240 MVT::Glue, GlueOps),
4241 0);
4242 }
4243
4244 if (InGlue)
4245 Ops.push_back(InGlue);
4246
4247 // If we're doing a tail call, use a TC_RETURN here rather than an
4248 // actual call instruction.
4249 if (IsTailCall) {
4250 MFI.setHasTailCall();
4251 unsigned OPC = AMDGPUISD::TC_RETURN;
4252 switch (CallConv) {
4255 break;
4258 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4260 break;
4261 }
4262
4263 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4264 }
4265
4266 // Returns a chain and a flag for retval copy to use.
4267 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4268 Chain = Call.getValue(0);
4269 InGlue = Call.getValue(1);
4270
4271 uint64_t CalleePopBytes = NumBytes;
4272 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4273 if (!Ins.empty())
4274 InGlue = Chain.getValue(1);
4275
4276 // Handle result values, copying them out of physregs into vregs that we
4277 // return.
4278 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4279 InVals, /*IsThisReturn=*/false, SDValue());
4280}
4281
4282// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4283// except for:
4284// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4285// 2. Scaled size, where scaled-size = wave-reduction(alloca-size) * wave-size
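// As a rough worked example (a sketch): on a wave64 target, an alloca of
// 16 bytes per lane is scaled to 16 << 6 = 1024 bytes, so the wave's stack
// pointer advances by 1024 and each lane gets a 16-byte slot within the
// swizzled scratch region starting at the returned BaseAddr.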
4287 SelectionDAG &DAG) const {
4288 const MachineFunction &MF = DAG.getMachineFunction();
4290
4291 SDLoc dl(Op);
4292 EVT VT = Op.getValueType();
4293 SDValue Chain = Op.getOperand(0);
4294 Register SPReg = Info->getStackPtrOffsetReg();
4295
4296 // Chain the dynamic stack allocation so that it doesn't modify the stack
4297 // pointer when other instructions are using the stack.
4298 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4299
4300 SDValue Size = Op.getOperand(1);
4301 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4302 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4303
4304 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4306 "Stack grows upwards for AMDGPU");
4307
4308 Chain = BaseAddr.getValue(1);
4309 Align StackAlign = TFL->getStackAlign();
4310 if (Alignment > StackAlign) {
4311 uint64_t ScaledAlignment = Alignment.value()
4312 << Subtarget->getWavefrontSizeLog2();
4313 uint64_t StackAlignMask = ScaledAlignment - 1;
4314 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4315 DAG.getConstant(StackAlignMask, dl, VT));
4316 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4317 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4318 }
4319
4320 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4321 SDValue NewSP;
4322 if (isa<ConstantSDNode>(Size)) {
4323 // For a constant-sized alloca, scale the alloca size by the wave size.
4324 SDValue ScaledSize = DAG.getNode(
4325 ISD::SHL, dl, VT, Size,
4326 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4327 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4328 } else {
4329 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4330 // max of the (divergent) alloca size, and then scale it by the wave size.
4331 SDValue WaveReduction =
4332 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4333 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4334 Size, DAG.getConstant(0, dl, MVT::i32));
4335 SDValue ScaledSize = DAG.getNode(
4336 ISD::SHL, dl, VT, Size,
4337 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4338 NewSP =
4339 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4340 SDValue ReadFirstLaneID =
4341 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4342 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4343 NewSP);
4344 }
4345
4346 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4347 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4348
4349 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4350}
4351
4353 if (Op.getValueType() != MVT::i32)
4354 return Op; // Defer to cannot select error.
4355
4357 SDLoc SL(Op);
4358
4359 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4360
4361 // Convert from wave uniform to swizzled vector address. This should protect
4362 // from any edge cases where the stacksave result isn't directly used with
4363 // stackrestore.
4364 SDValue VectorAddress =
4365 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4366 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4367}
4368
4370 SelectionDAG &DAG) const {
4371 SDLoc SL(Op);
4372 assert(Op.getValueType() == MVT::i32);
4373
4374 uint32_t BothRoundHwReg =
4376 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4377
4378 SDValue IntrinID =
4379 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4380 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4381 Op.getOperand(0), IntrinID, GetRoundBothImm);
4382
4383 // There are two rounding modes, one for f32 and one for f64/f16. We only
4384 // report in the standard value range if both are the same.
4385 //
4386 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4387 // ties away from zero is not supported, and the other values are rotated by
4388 // 1.
4389 //
4390 // If the two rounding modes are not the same, report a target defined value.
4391
4392 // Mode register rounding mode fields:
4393 //
4394 // [1:0] Single-precision round mode.
4395 // [3:2] Double/Half-precision round mode.
4396 //
4397 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4398 //
4399 // Rounding mode     Hardware   Spec
4400 // Toward-0              3        0
4401 // Nearest Even          0        1
4402 // +Inf                  1        2
4403 // -Inf                  2        3
4404 // NearestAway0         N/A       4
4405 //
4406 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4407 // table we can index by the raw hardware mode.
4408 //
4409 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
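// For example (a sketch, following the Spec column above): if both fields of
// MODE.fp_round are 0 (nearest even), the 4-bit entry at index 0 is 1, the
// standard FLT_ROUNDS "to nearest" value. Mixed modes produce entries >= 4,
// which the code below offsets by 4 into the target-defined range.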
4410
4411 SDValue BitTable =
4413
4414 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4415 SDValue RoundModeTimesNumBits =
4416 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4417
4418 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4419 // knew only one mode was demanded.
4420 SDValue TableValue =
4421 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4422 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4423
4424 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4425 SDValue TableEntry =
4426 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4427
4428 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4429 // if it's an extended value.
4430 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4431 SDValue IsStandardValue =
4432 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4433 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4434 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4435 TableEntry, EnumOffset);
4436
4437 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4438}
4439
4441 SelectionDAG &DAG) const {
4442 SDLoc SL(Op);
4443
4444 SDValue NewMode = Op.getOperand(1);
4445 assert(NewMode.getValueType() == MVT::i32);
4446
4447 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4448 // hardware MODE.fp_round values.
4449 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4450 uint32_t ClampedVal = std::min(
4451 static_cast<uint32_t>(ConstMode->getZExtValue()),
4453 NewMode = DAG.getConstant(
4454 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4455 } else {
4456 // If we know the input can only be one of the supported standard modes in
4457 // the range 0-3, we can use a simplified mapping to hardware values.
4458 KnownBits KB = DAG.computeKnownBits(NewMode);
4459 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4460 // The supported standard values are 0-3. The extended values start at 8. We
4461 // need to offset by 4 if the value is in the extended range.
4462
4463 if (UseReducedTable) {
4464 // Only the entries for the standard modes 0-3 are needed; use the low
4464 // bits of the table as a 32-bit constant.
4465 SDValue BitTable = DAG.getConstant(
4466 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4467
4468 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4469 SDValue RoundModeTimesNumBits =
4470 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4471
4472 NewMode =
4473 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4474
4475 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4476 // the bits extracted from the table into inline immediates.
4477 } else {
4478 // table_index = umin(value, value - 4)
4479 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
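// Worked example (a sketch): a standard value such as 3 (toward -inf) gives
// umin(3, 3 - 4) = 3, since the subtraction wraps to a large unsigned value,
// while an extended value such as 8 gives umin(8, 4) = 4, so the extended
// entries follow the standard ones in the 64-bit table.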
4480 SDValue BitTable =
4482
4483 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4484 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4485 SDValue IndexVal =
4486 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4487
4488 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4489 SDValue RoundModeTimesNumBits =
4490 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4491
4492 SDValue TableValue =
4493 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4494 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4495
4496 // No need to mask out the high bits since the setreg will ignore them
4497 // anyway.
4498 NewMode = TruncTable;
4499 }
4500
4501 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4502 // earlier and keep more operations scalar, but that interferes with
4503 // combining the source.
4504 SDValue ReadFirstLaneID =
4505 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4506 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4507 ReadFirstLaneID, NewMode);
4508 }
4509
4510 // N.B. The setreg will be later folded into s_round_mode on supported
4511 // targets.
4512 SDValue IntrinID =
4513 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4514 uint32_t BothRoundHwReg =
4516 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4517
4518 SDValue SetReg =
4519 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4520 IntrinID, RoundBothImm, NewMode);
4521
4522 return SetReg;
4523}
4524
4526 if (Op->isDivergent() &&
4527 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4528 // Cannot do I$ prefetch with divergent pointer.
4529 return SDValue();
4530
4531 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4535 break;
4537 if (Subtarget->hasSafeSmemPrefetch())
4538 break;
4539 [[fallthrough]];
4540 default:
4541 return SDValue();
4542 }
4543
4544 // I$ prefetch
4545 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4546 return SDValue();
4547
4548 return Op;
4549}
4550
4551// Work around DAG legality rules that are based only on the result type.
4553 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4554 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4555 EVT SrcVT = Src.getValueType();
4556
4557 if (SrcVT.getScalarType() != MVT::bf16)
4558 return Op;
4559
4560 SDLoc SL(Op);
4561 SDValue BitCast =
4562 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4563
4564 EVT DstVT = Op.getValueType();
4565 if (IsStrict)
4566 llvm_unreachable("Need STRICT_BF16_TO_FP");
4567
4568 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4569}
4570
4572 SDLoc SL(Op);
4573 if (Op.getValueType() != MVT::i64)
4574 return Op;
4575
4576 uint32_t ModeHwReg =
4578 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4579 uint32_t TrapHwReg =
4581 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4582
4583 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4584 SDValue IntrinID =
4585 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4586 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4587 Op.getOperand(0), IntrinID, ModeHwRegImm);
4588 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4589 Op.getOperand(0), IntrinID, TrapHwRegImm);
4590 SDValue TokenReg =
4591 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4592 GetTrapReg.getValue(1));
4593
4594 SDValue CvtPtr =
4595 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4596 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
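// Illustrative layout of the returned i64 (assuming the little-endian
// bitcast of the v2i32 above): bits [31:0] hold the MODE register value and
// bits [63:32] hold the TRAPSTS value. The SET_FPENV lowering below unpacks
// the value the same way.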
4597
4598 return DAG.getMergeValues({Result, TokenReg}, SL);
4599}
4600
4602 SDLoc SL(Op);
4603 if (Op.getOperand(1).getValueType() != MVT::i64)
4604 return Op;
4605
4606 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4607 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4608 DAG.getConstant(0, SL, MVT::i32));
4609 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4610 DAG.getConstant(1, SL, MVT::i32));
4611
4612 SDValue ReadFirstLaneID =
4613 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4614 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4615 ReadFirstLaneID, NewModeReg);
4616 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4617 ReadFirstLaneID, NewTrapReg);
4618
4619 unsigned ModeHwReg =
4621 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4622 unsigned TrapHwReg =
4624 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4625
4626 SDValue IntrinID =
4627 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4628 SDValue SetModeReg =
4629 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4630 IntrinID, ModeHwRegImm, NewModeReg);
4631 SDValue SetTrapReg =
4632 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4633 IntrinID, TrapHwRegImm, NewTrapReg);
4634 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4635}
4636
4638 const MachineFunction &MF) const {
4639 const Function &Fn = MF.getFunction();
4640
4642 .Case("m0", AMDGPU::M0)
4643 .Case("exec", AMDGPU::EXEC)
4644 .Case("exec_lo", AMDGPU::EXEC_LO)
4645 .Case("exec_hi", AMDGPU::EXEC_HI)
4646 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4647 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4648 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4649 .Default(Register());
4650 if (!Reg)
4651 return Reg;
4652
4653 if (!Subtarget->hasFlatScrRegister() &&
4654 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4655 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4656 "\" for subtarget."));
4657 }
4658
4659 switch (Reg) {
4660 case AMDGPU::M0:
4661 case AMDGPU::EXEC_LO:
4662 case AMDGPU::EXEC_HI:
4663 case AMDGPU::FLAT_SCR_LO:
4664 case AMDGPU::FLAT_SCR_HI:
4665 if (VT.getSizeInBits() == 32)
4666 return Reg;
4667 break;
4668 case AMDGPU::EXEC:
4669 case AMDGPU::FLAT_SCR:
4670 if (VT.getSizeInBits() == 64)
4671 return Reg;
4672 break;
4673 default:
4674 llvm_unreachable("missing register type checking");
4675 }
4676
4678 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4679}
4680
4681// If kill is not the last instruction, split the block so kill is always a
4682// proper terminator.
4685 MachineBasicBlock *BB) const {
4686 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4688 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4689 return SplitBB;
4690}
4691
4692// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4693// \p MI will be the only instruction in the loop body block. Otherwise, it will
4694// be the first instruction in the remainder block.
4695//
4696/// \returns { LoopBody, Remainder }
4697static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4701
4702 // To insert the loop we need to split the block. Move everything after this
4703 // point to a new block, and insert a new empty block between the two.
4705 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4707 ++MBBI;
4708
4709 MF->insert(MBBI, LoopBB);
4710 MF->insert(MBBI, RemainderBB);
4711
4712 LoopBB->addSuccessor(LoopBB);
4713 LoopBB->addSuccessor(RemainderBB);
4714
4715 // Move the rest of the block into a new block.
4716 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4717
4718 if (InstInLoop) {
4719 auto Next = std::next(I);
4720
4721 // Move instruction to loop body.
4722 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4723
4724 // Move the rest of the block.
4725 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4726 } else {
4727 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4728 }
4729
4730 MBB.addSuccessor(LoopBB);
4731
4732 return std::pair(LoopBB, RemainderBB);
4733}
4734
4735/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4737 MachineBasicBlock *MBB = MI.getParent();
4739 auto I = MI.getIterator();
4740 auto E = std::next(I);
4741
4742 // clang-format off
4743 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4744 .addImm(0);
4745 // clang-format on
4746
4747 MIBundleBuilder Bundler(*MBB, I, E);
4748 finalizeBundle(*MBB, Bundler.begin());
4749}
4750
4753 MachineBasicBlock *BB) const {
4754 const DebugLoc &DL = MI.getDebugLoc();
4755
4757
4759
4760 // Apparently kill flags are only valid if the def is in the same block?
4761 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4762 Src->setIsKill(false);
4763
4764 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4765
4766 MachineBasicBlock::iterator I = LoopBB->end();
4767
4768 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4770
4771 // Clear TRAP_STS.MEM_VIOL
4772 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4773 .addImm(0)
4774 .addImm(EncodedReg);
4775
4777
4778 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4779
4780 // Load and check TRAP_STS.MEM_VIOL
4781 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4782 .addImm(EncodedReg);
4783
4784 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4785 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4786 .addReg(Reg, RegState::Kill)
4787 .addImm(0);
4788 // clang-format off
4789 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4790 .addMBB(LoopBB);
4791 // clang-format on
4792
4793 return RemainderBB;
4794}
4795
4796// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4797// wavefront. If the value is uniform and just happens to be in a VGPR, this
4798// will only do one iteration. In the worst case, this will loop 64 times.
4799//
4800// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
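//
// Roughly, the emitted loop behaves like this C-like sketch (not the actual
// MIR; readfirstlane/ballot stand in for the instructions built below):
//
//   while (EXEC != 0) {
//     Idx  = readfirstlane(VIdx);       // one active lane's index value
//     Mask = ballot(VIdx == Idx);       // active lanes sharing that index
//     // with EXEC restricted to Mask:
//     //   ... indexed access using Idx (via M0 or GPR-index mode) ...
//     EXEC &= ~Mask;                    // retire the lanes just handled
//   }
//   // The original EXEC is saved before the loop and restored afterwards.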
4803 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4804 const DebugLoc &DL, const MachineOperand &Idx,
4805 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4806 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4807 Register &SGPRIdxReg) {
4808
4809 MachineFunction *MF = OrigBB.getParent();
4810 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4811 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4813
4814 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4815 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4816 Register NewExec = MRI.createVirtualRegister(BoolRC);
4817 Register CurrentIdxReg =
4818 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4819 Register CondReg = MRI.createVirtualRegister(BoolRC);
4820
4821 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4822 .addReg(InitReg)
4823 .addMBB(&OrigBB)
4824 .addReg(ResultReg)
4825 .addMBB(&LoopBB);
4826
4827 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4828 .addReg(InitSaveExecReg)
4829 .addMBB(&OrigBB)
4830 .addReg(NewExec)
4831 .addMBB(&LoopBB);
4832
4833 // Read the next variant <- also loop target.
4834 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4835 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4836
4837 // Compare the just read M0 value to all possible Idx values.
4838 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4839 .addReg(CurrentIdxReg)
4840 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4841
4842 // Update EXEC, save the original EXEC value to VCC.
4843 BuildMI(LoopBB, I, DL,
4844 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4845 : AMDGPU::S_AND_SAVEEXEC_B64),
4846 NewExec)
4847 .addReg(CondReg, RegState::Kill);
4848
4849 MRI.setSimpleHint(NewExec, CondReg);
4850
4851 if (UseGPRIdxMode) {
4852 if (Offset == 0) {
4853 SGPRIdxReg = CurrentIdxReg;
4854 } else {
4855 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4856 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4857 .addReg(CurrentIdxReg, RegState::Kill)
4858 .addImm(Offset);
4859 }
4860 } else {
4861 // Move the index from the readfirstlane result into M0.
4862 if (Offset == 0) {
4863 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4864 .addReg(CurrentIdxReg, RegState::Kill);
4865 } else {
4866 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4867 .addReg(CurrentIdxReg, RegState::Kill)
4868 .addImm(Offset);
4869 }
4870 }
4871
4872 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4873 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4874 MachineInstr *InsertPt =
4875 BuildMI(LoopBB, I, DL,
4876 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4877 : AMDGPU::S_XOR_B64_term),
4878 Exec)
4879 .addReg(Exec)
4880 .addReg(NewExec);
4881
4882 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4883 // s_cbranch_scc0?
4884
4885 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4886 // clang-format off
4887 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4888 .addMBB(&LoopBB);
4889 // clang-format on
4890
4891 return InsertPt->getIterator();
4892}
4893
4894// This has slightly sub-optimal regalloc when the source vector is killed by
4895// the read. The register allocator does not understand that the kill is
4896// per-workitem, so the source is kept alive for the whole loop and we end up
4897// not re-using a subregister from it, using one more VGPR than necessary. This
4898// VGPR was saved back when this was expanded after register allocation.
4901 unsigned InitResultReg, unsigned PhiReg, int Offset,
4902 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4904 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4905 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4907 const DebugLoc &DL = MI.getDebugLoc();
4909
4910 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4911 Register DstReg = MI.getOperand(0).getReg();
4912 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4913 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4914 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4915 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4916
4917 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4918
4919 // Save the EXEC mask
4920 // clang-format off
4921 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4922 .addReg(Exec);
4923 // clang-format on
4924
4925 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4926
4927 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4928
4929 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4930 InitResultReg, DstReg, PhiReg, TmpExec,
4931 Offset, UseGPRIdxMode, SGPRIdxReg);
4932
4933 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4935 ++MBBI;
4936 MF->insert(MBBI, LandingPad);
4937 LoopBB->removeSuccessor(RemainderBB);
4938 LandingPad->addSuccessor(RemainderBB);
4939 LoopBB->addSuccessor(LandingPad);
4940 MachineBasicBlock::iterator First = LandingPad->begin();
4941 // clang-format off
4942 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4943 .addReg(SaveExec);
4944 // clang-format on
4945
4946 return InsPt;
4947}
4948
4949// Returns subreg index, offset
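// For example (a sketch): with a 128-bit super-register class (four 32-bit
// channels), Offset = 2 yields {sub2, 0}, while an out-of-range Offset = 7
// yields {sub0, 7}, keeping the offset folded into the dynamic index instead
// of naming a nonexistent subregister.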
4950static std::pair<unsigned, int>
4952 const TargetRegisterClass *SuperRC, unsigned VecReg,
4953 int Offset) {
4954 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4955
4956 // Skip out of bounds offsets, or else we would end up using an undefined
4957 // register.
4958 if (Offset >= NumElts || Offset < 0)
4959 return std::pair(AMDGPU::sub0, Offset);
4960
4961 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4962}
4963
4966 int Offset) {
4967 MachineBasicBlock *MBB = MI.getParent();
4968 const DebugLoc &DL = MI.getDebugLoc();
4970
4971 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4972
4973 assert(Idx->getReg() != AMDGPU::NoRegister);
4974
4975 if (Offset == 0) {
4976 // clang-format off
4977 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4978 .add(*Idx);
4979 // clang-format on
4980 } else {
4981 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4982 .add(*Idx)
4983 .addImm(Offset);
4984 }
4985}
4986
4989 int Offset) {
4990 MachineBasicBlock *MBB = MI.getParent();
4991 const DebugLoc &DL = MI.getDebugLoc();
4993
4994 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4995
4996 if (Offset == 0)
4997 return Idx->getReg();
4998
4999 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5000 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5001 .add(*Idx)
5002 .addImm(Offset);
5003 return Tmp;
5004}
5005
5008 const GCNSubtarget &ST) {
5009 const SIInstrInfo *TII = ST.getInstrInfo();
5010 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5013
5014 Register Dst = MI.getOperand(0).getReg();
5015 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5016 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5017 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5018
5019 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5020 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5021
5022 unsigned SubReg;
5023 std::tie(SubReg, Offset) =
5024 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5025
5026 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5027
5028 // Check for a SGPR index.
5029 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5031 const DebugLoc &DL = MI.getDebugLoc();
5032
5033 if (UseGPRIdxMode) {
5034 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5035 // to avoid interfering with other uses, so probably requires a new
5036 // optimization pass.
5038
5039 const MCInstrDesc &GPRIDXDesc =
5040 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5041 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5042 .addReg(SrcReg)
5043 .addReg(Idx)
5044 .addImm(SubReg);
5045 } else {
5047
5048 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5049 .addReg(SrcReg, 0, SubReg)
5050 .addReg(SrcReg, RegState::Implicit);
5051 }
5052
5053 MI.eraseFromParent();
5054
5055 return &MBB;
5056 }
5057
5058 // Control flow needs to be inserted if indexing with a VGPR.
5059 const DebugLoc &DL = MI.getDebugLoc();
5061
5062 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5063 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5064
5065 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5066
5067 Register SGPRIdxReg;
5068 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5069 UseGPRIdxMode, SGPRIdxReg);
5070
5071 MachineBasicBlock *LoopBB = InsPt->getParent();
5072
5073 if (UseGPRIdxMode) {
5074 const MCInstrDesc &GPRIDXDesc =
5075 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5076
5077 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5078 .addReg(SrcReg)
5079 .addReg(SGPRIdxReg)
5080 .addImm(SubReg);
5081 } else {
5082 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5083 .addReg(SrcReg, 0, SubReg)
5084 .addReg(SrcReg, RegState::Implicit);
5085 }
5086
5087 MI.eraseFromParent();
5088
5089 return LoopBB;
5090}
5091
5094 const GCNSubtarget &ST) {
5095 const SIInstrInfo *TII = ST.getInstrInfo();
5096 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5099
5100 Register Dst = MI.getOperand(0).getReg();
5101 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5102 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5103 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5104 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5105 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5106 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5107
5108 // This can be an immediate, but will be folded later.
5109 assert(Val->getReg());
5110
5111 unsigned SubReg;
5112 std::tie(SubReg, Offset) =
5113 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5114 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5115
5116 if (Idx->getReg() == AMDGPU::NoRegister) {
5118 const DebugLoc &DL = MI.getDebugLoc();
5119
5120 assert(Offset == 0);
5121
5122 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5123 .add(*SrcVec)
5124 .add(*Val)
5125 .addImm(SubReg);
5126
5127 MI.eraseFromParent();
5128 return &MBB;
5129 }
5130
5131 // Check for a SGPR index.
5132 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5134 const DebugLoc &DL = MI.getDebugLoc();
5135
5136 if (UseGPRIdxMode) {
5138
5139 const MCInstrDesc &GPRIDXDesc =
5140 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5141 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5142 .addReg(SrcVec->getReg())
5143 .add(*Val)
5144 .addReg(Idx)
5145 .addImm(SubReg);
5146 } else {
5148
5149 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5150 TRI.getRegSizeInBits(*VecRC), 32, false);
5151 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5152 .addReg(SrcVec->getReg())
5153 .add(*Val)
5154 .addImm(SubReg);
5155 }
5156 MI.eraseFromParent();
5157 return &MBB;
5158 }
5159
5160 // Control flow needs to be inserted if indexing with a VGPR.
5161 if (Val->isReg())
5162 MRI.clearKillFlags(Val->getReg());
5163
5164 const DebugLoc &DL = MI.getDebugLoc();
5165
5166 Register PhiReg = MRI.createVirtualRegister(VecRC);
5167
5168 Register SGPRIdxReg;
5169 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5170 UseGPRIdxMode, SGPRIdxReg);
5171 MachineBasicBlock *LoopBB = InsPt->getParent();
5172
5173 if (UseGPRIdxMode) {
5174 const MCInstrDesc &GPRIDXDesc =
5175 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5176
5177 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5178 .addReg(PhiReg)
5179 .add(*Val)
5180 .addReg(SGPRIdxReg)
5181 .addImm(SubReg);
5182 } else {
5183 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5184 TRI.getRegSizeInBits(*VecRC), 32, false);
5185 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5186 .addReg(PhiReg)
5187 .add(*Val)
5188 .addImm(SubReg);
5189 }
5190
5191 MI.eraseFromParent();
5192 return LoopBB;
5193}
5194
5196 switch (Opc) {
5197 case AMDGPU::S_MIN_U32:
5198 return std::numeric_limits<uint32_t>::max();
5199 case AMDGPU::S_MIN_I32:
5200 return std::numeric_limits<int32_t>::max();
5201 case AMDGPU::S_MAX_U32:
5202 return std::numeric_limits<uint32_t>::min();
5203 case AMDGPU::S_MAX_I32:
5204 return std::numeric_limits<int32_t>::min();
5205 case AMDGPU::S_ADD_I32:
5206 case AMDGPU::S_SUB_I32:
5207 case AMDGPU::S_OR_B32:
5208 case AMDGPU::S_XOR_B32:
5209 return std::numeric_limits<uint32_t>::min();
5210 case AMDGPU::S_AND_B32:
5211 return std::numeric_limits<uint32_t>::max();
5212 default:
5213 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5214 }
5215}
5216
5219 const GCNSubtarget &ST,
5220 unsigned Opc) {
5222 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5223 const DebugLoc &DL = MI.getDebugLoc();
5224 const SIInstrInfo *TII = ST.getInstrInfo();
5225
5226 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5227 Register SrcReg = MI.getOperand(1).getReg();
5228 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5229 Register DstReg = MI.getOperand(0).getReg();
5230 MachineBasicBlock *RetBB = nullptr;
5231 if (isSGPR) {
5232 switch (Opc) {
5233 case AMDGPU::S_MIN_U32:
5234 case AMDGPU::S_MIN_I32:
5235 case AMDGPU::S_MAX_U32:
5236 case AMDGPU::S_MAX_I32:
5237 case AMDGPU::S_AND_B32:
5238 case AMDGPU::S_OR_B32: {
5239 // Idempotent operations.
5240 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5241 RetBB = &BB;
5242 break;
5243 }
5244 case AMDGPU::S_XOR_B32:
5245 case AMDGPU::S_ADD_I32:
5246 case AMDGPU::S_SUB_I32: {
5247 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5248 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5249 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5250 Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5251
5252 bool IsWave32 = ST.isWave32();
5253 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5254 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5255 unsigned CountReg =
5256 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5257
5258 auto Exec =
5259 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5260
5261 auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5262 .addReg(Exec->getOperand(0).getReg());
5263
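// For a uniform (SGPR) input, the reduction over N active lanes collapses to
// scalar math (a sketch): add -> Src * N, sub -> -Src * N, and
// xor -> Src * (N & 1). For example, with 5 active lanes an xor reduction
// yields Src, while with 4 active lanes it yields 0.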
5264 switch (Opc) {
5265 case AMDGPU::S_XOR_B32: {
5266 // Performing an XOR operation on a uniform value
5267 // depends on the parity of the number of active lanes.
5268 // For even parity the result will be 0; for odd
5269 // parity the result will be the same as the input value.
5270 Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5271
5272 auto ParityReg =
5273 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5274 .addReg(NewAccumulator->getOperand(0).getReg())
5275 .addImm(1);
5276 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5277 .addReg(SrcReg)
5278 .addReg(ParityReg->getOperand(0).getReg());
5279 break;
5280 }
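      // Rough sketch of the fold above: with N active lanes all holding the
      // same value X, xor-reduce == (N & 1) ? X : 0, which is exactly what the
      // S_AND_B32 (parity of the popcount) followed by S_MUL_I32 computes.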
5281 case AMDGPU::S_SUB_I32: {
5282 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5283
5284 // Take the negation of the source operand.
5285 auto InvertedValReg =
5286 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5287 .addImm(-1)
5288 .addReg(SrcReg);
5289 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5290 .addReg(InvertedValReg->getOperand(0).getReg())
5291 .addReg(NewAccumulator->getOperand(0).getReg());
5292 break;
5293 }
5294 case AMDGPU::S_ADD_I32: {
5295 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5296 .addReg(SrcReg)
5297 .addReg(NewAccumulator->getOperand(0).getReg());
5298 break;
5299 }
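      // Likewise, add/sub of a uniform X over N active lanes is just N*X
      // (or -N*X), so an S_BCNT1 of EXEC plus one S_MUL_I32 replaces the loop.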
5300 }
5301 RetBB = &BB;
5302 }
5303 }
5304 } else {
5305    // TODO: Implement the DPP strategy and switch based on the immediate
5306    // strategy operand. For now, for all the cases (default, Iterative and
5307    // DPP) we use the iterative approach by default.
5308
5309    // To reduce the VGPR using the iterative approach, we need to iterate
5310    // over all the active lanes. Lowering consists of ComputeLoop,
5311    // which iterates over only the active lanes. We use a copy of the EXEC
5312    // register as the induction variable, and each iteration clears the
5313    // current lane's bit with bitset0 to obtain the next active lane.
5315 Register SrcReg = MI.getOperand(1).getReg();
5316
5317    // Create control flow for the loop:
5318    // split MI's machine basic block to form the loop.
5319 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5320
5321 // Create virtual registers required for lowering.
5322 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5323 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5324 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5325 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5326
5327 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5328 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5329 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5330
5331 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5332 Register LaneValueReg =
5333 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5334
5335 bool IsWave32 = ST.isWave32();
5336 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5337 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5338
5339    // Create the initial values of the induction variable (from EXEC) and
5340    // the accumulator, and insert a branch to the newly created ComputeLoop.
5341    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5342 auto TmpSReg =
5343 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5344 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5345 .addImm(InitalValue);
5346 // clang-format off
5347 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5348 .addMBB(ComputeLoop);
5349 // clang-format on
5350
5351 // Start constructing ComputeLoop
5352 I = ComputeLoop->end();
5353 auto Accumulator =
5354 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5355 .addReg(InitalValReg)
5356 .addMBB(&BB);
5357 auto ActiveBits =
5358 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5359 .addReg(TmpSReg->getOperand(0).getReg())
5360 .addMBB(&BB);
5361
5362 // Perform the computations
5363 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5364 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5365 .addReg(ActiveBits->getOperand(0).getReg());
5366 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5367 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5368 .addReg(SrcReg)
5369 .addReg(FF1->getOperand(0).getReg());
5370 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5371 .addReg(Accumulator->getOperand(0).getReg())
5372 .addReg(LaneValue->getOperand(0).getReg());
5373
5374 // Manipulate the iterator to get the next active lane
5375 unsigned BITSETOpc =
5376 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5377 auto NewActiveBits =
5378 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5379 .addReg(FF1->getOperand(0).getReg())
5380 .addReg(ActiveBits->getOperand(0).getReg());
5381
5382 // Add phi nodes
5383 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5384 .addMBB(ComputeLoop);
5385 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5386 .addMBB(ComputeLoop);
5387
5388    // Create the loop back-edge branch.
5389 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5390 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5391 .addReg(NewActiveBits->getOperand(0).getReg())
5392 .addImm(0);
5393 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5394 .addMBB(ComputeLoop);
5395
5396 RetBB = ComputeEnd;
5397 }
5398 MI.eraseFromParent();
5399 return RetBB;
5400}
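// A minimal C-like sketch of the iterative (divergent/VGPR) lowering above;
// names are illustrative only:
//
//   uint64_t Active = EXEC;
//   uint32_t Acc = identity(Opc);
//   do {
//     unsigned Lane = find_first_set(Active);    // S_FF1_I32_B32/_B64
//     Acc = op(Acc, readlane(Src, Lane));        // V_READLANE_B32 + Opc
//     Active = clear_bit(Active, Lane);          // S_BITSET0_B32/_B64
//   } while (Active != 0);                       // S_CMP_LG + S_CBRANCH_SCC1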
5401
5402MachineBasicBlock *
5403SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5404                                              MachineBasicBlock *BB) const {
5405
5406  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5407  MachineFunction *MF = BB->getParent();
5408  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5409
5410 switch (MI.getOpcode()) {
5411 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5412 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5413 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5414 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5415 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5416 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5417 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5418 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5419 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5420 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5421 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5422 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5423 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5424 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5425 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5426 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5427 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5428 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5429 case AMDGPU::S_UADDO_PSEUDO:
5430 case AMDGPU::S_USUBO_PSEUDO: {
5431 const DebugLoc &DL = MI.getDebugLoc();
5432 MachineOperand &Dest0 = MI.getOperand(0);
5433 MachineOperand &Dest1 = MI.getOperand(1);
5434 MachineOperand &Src0 = MI.getOperand(2);
5435 MachineOperand &Src1 = MI.getOperand(3);
5436
5437 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5438 ? AMDGPU::S_ADD_I32
5439 : AMDGPU::S_SUB_I32;
5440 // clang-format off
5441 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5442 .add(Src0)
5443 .add(Src1);
5444 // clang-format on
5445
5446 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5447 .addImm(1)
5448 .addImm(0);
5449
5450 MI.eraseFromParent();
5451 return BB;
5452 }
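  // The scalar add/sub above writes its carry/overflow indication into SCC,
  // and the S_CSELECT_B64 then materializes that bit as 1 or 0 into the
  // second result register.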
5453 case AMDGPU::S_ADD_U64_PSEUDO:
5454 case AMDGPU::S_SUB_U64_PSEUDO: {
5455 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5456 // For GFX12, we emit s_add_u64 and s_sub_u64.
5457 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5459 const DebugLoc &DL = MI.getDebugLoc();
5460 MachineOperand &Dest = MI.getOperand(0);
5461 MachineOperand &Src0 = MI.getOperand(1);
5462 MachineOperand &Src1 = MI.getOperand(2);
5463 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5464 if (Subtarget->hasScalarAddSub64()) {
5465 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5466 // clang-format off
5467 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5468 .add(Src0)
5469 .add(Src1);
5470 // clang-format on
5471 } else {
5472 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5473 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5474
5475 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5476 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5477
5478 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5479 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5480 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5481 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5482
5483 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5484 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5485 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5486 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5487
5488 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5489 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5490 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5491 .add(Src0Sub0)
5492 .add(Src1Sub0);
5493 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5494 .add(Src0Sub1)
5495 .add(Src1Sub1);
5496 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5497 .addReg(DestSub0)
5498 .addImm(AMDGPU::sub0)
5499 .addReg(DestSub1)
5500 .addImm(AMDGPU::sub1);
5501 }
5502 MI.eraseFromParent();
5503 return BB;
5504 }
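  // Sketch of the pre-GFX12 expansion emitted above: a 64-bit add becomes
  //   lo = s_add_u32(a.lo, b.lo)   // sets SCC to the carry-out
  //   hi = s_addc_u32(a.hi, b.hi)  // consumes SCC as the carry-in
  // with the halves rejoined by REG_SEQUENCE (s_sub_u32/s_subb_u32 for sub).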
5505 case AMDGPU::V_ADD_U64_PSEUDO:
5506 case AMDGPU::V_SUB_U64_PSEUDO: {
5508 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5509 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5510 const DebugLoc &DL = MI.getDebugLoc();
5511
5512 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5513
5514 MachineOperand &Dest = MI.getOperand(0);
5515 MachineOperand &Src0 = MI.getOperand(1);
5516 MachineOperand &Src1 = MI.getOperand(2);
5517
5518 if (ST.hasAddSubU64Insts()) {
5519 auto I = BuildMI(*BB, MI, DL,
5520 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5521 : AMDGPU::V_SUB_U64_e64),
5522 Dest.getReg())
5523 .add(Src0)
5524 .add(Src1)
5525 .addImm(0); // clamp
5526 TII->legalizeOperands(*I);
5527 MI.eraseFromParent();
5528 return BB;
5529 }
5530
5531 if (IsAdd && ST.hasLshlAddU64Inst()) {
5532 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5533 Dest.getReg())
5534 .add(Src0)
5535 .addImm(0)
5536 .add(Src1);
5537 TII->legalizeOperands(*Add);
5538 MI.eraseFromParent();
5539 return BB;
5540 }
5541
5542 const auto *CarryRC = TRI->getWaveMaskRegClass();
5543
5544 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5545 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5546
5547 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5548 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5549
5550 const TargetRegisterClass *Src0RC = Src0.isReg()
5551 ? MRI.getRegClass(Src0.getReg())
5552 : &AMDGPU::VReg_64RegClass;
5553 const TargetRegisterClass *Src1RC = Src1.isReg()
5554 ? MRI.getRegClass(Src1.getReg())
5555 : &AMDGPU::VReg_64RegClass;
5556
5557 const TargetRegisterClass *Src0SubRC =
5558 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5559 const TargetRegisterClass *Src1SubRC =
5560 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5561
5562 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5563 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5564 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5565 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5566
5567 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5568 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5569 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5570 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5571
5572 unsigned LoOpc =
5573 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5574 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5575 .addReg(CarryReg, RegState::Define)
5576 .add(SrcReg0Sub0)
5577 .add(SrcReg1Sub0)
5578 .addImm(0); // clamp bit
5579
5580 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5581 MachineInstr *HiHalf =
5582 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5583 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5584 .add(SrcReg0Sub1)
5585 .add(SrcReg1Sub1)
5586 .addReg(CarryReg, RegState::Kill)
5587 .addImm(0); // clamp bit
5588
5589 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5590 .addReg(DestSub0)
5591 .addImm(AMDGPU::sub0)
5592 .addReg(DestSub1)
5593 .addImm(AMDGPU::sub1);
5594 TII->legalizeOperands(*LoHalf);
5595 TII->legalizeOperands(*HiHalf);
5596 MI.eraseFromParent();
5597 return BB;
5598 }
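  // The VALU expansion mirrors the scalar split, except the carry flows
  // per-lane through a lane-mask register (VCC or an SGPR lane mask) rather
  // than the single SCC bit.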
5599 case AMDGPU::S_ADD_CO_PSEUDO:
5600 case AMDGPU::S_SUB_CO_PSEUDO: {
5601    // This pseudo has a chance to be selected
5602    // only from a uniform add/subcarry node. All the VGPR operands
5603    // are therefore assumed to be splat vectors.
5605 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5606 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5608 const DebugLoc &DL = MI.getDebugLoc();
5609 MachineOperand &Dest = MI.getOperand(0);
5610 MachineOperand &CarryDest = MI.getOperand(1);
5611 MachineOperand &Src0 = MI.getOperand(2);
5612 MachineOperand &Src1 = MI.getOperand(3);
5613 MachineOperand &Src2 = MI.getOperand(4);
5614 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5615 ? AMDGPU::S_ADDC_U32
5616 : AMDGPU::S_SUBB_U32;
5617 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5618 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5619 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5620 .addReg(Src0.getReg());
5621 Src0.setReg(RegOp0);
5622 }
5623 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5624 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5625 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5626 .addReg(Src1.getReg());
5627 Src1.setReg(RegOp1);
5628 }
5629 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5630 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5631 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5632 .addReg(Src2.getReg());
5633 Src2.setReg(RegOp2);
5634 }
5635
5636 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5637 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5638 assert(WaveSize == 64 || WaveSize == 32);
5639
5640 if (WaveSize == 64) {
5641 if (ST.hasScalarCompareEq64()) {
5642 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5643 .addReg(Src2.getReg())
5644 .addImm(0);
5645 } else {
5646 const TargetRegisterClass *SubRC =
5647 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5648 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5649 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5650 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5651 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5652 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653
5654 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5655 .add(Src2Sub0)
5656 .add(Src2Sub1);
5657
5658 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5659 .addReg(Src2_32, RegState::Kill)
5660 .addImm(0);
5661 }
5662 } else {
5663 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5664 .addReg(Src2.getReg())
5665 .addImm(0);
5666 }
5667
5668 // clang-format off
5669 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5670 .add(Src0)
5671 .add(Src1);
5672 // clang-format on
5673
5674 unsigned SelOpc =
5675 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5676
5677 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5678 .addImm(-1)
5679 .addImm(0);
5680
5681 MI.eraseFromParent();
5682 return BB;
5683 }
5684 case AMDGPU::SI_INIT_M0: {
5685 MachineOperand &M0Init = MI.getOperand(0);
5686 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5687 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5688 AMDGPU::M0)
5689 .add(M0Init);
5690 MI.eraseFromParent();
5691 return BB;
5692 }
5693 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5694 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
5695 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5696 TII->get(AMDGPU::S_CMP_EQ_U32))
5697 .addImm(0)
5698 .addImm(0);
5699 return BB;
5700 }
5701 case AMDGPU::GET_GROUPSTATICSIZE: {
5702 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5703 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5704 DebugLoc DL = MI.getDebugLoc();
5705 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5706 .add(MI.getOperand(0))
5707 .addImm(MFI->getLDSSize());
5708 MI.eraseFromParent();
5709 return BB;
5710 }
5711 case AMDGPU::GET_SHADERCYCLESHILO: {
5714 const DebugLoc &DL = MI.getDebugLoc();
5715 // The algorithm is:
5716 //
5717 // hi1 = getreg(SHADER_CYCLES_HI)
5718 // lo1 = getreg(SHADER_CYCLES_LO)
5719 // hi2 = getreg(SHADER_CYCLES_HI)
5720 //
5721 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5722 // Otherwise there was overflow and the result is hi2:0. In both cases the
5723 // result should represent the actual time at some point during the sequence
5724 // of three getregs.
5725 using namespace AMDGPU::Hwreg;
5726 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5727 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5728 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5729 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5730 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5731 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5732 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5733 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5734 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5735 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5736 .addReg(RegHi1)
5737 .addReg(RegHi2);
5738 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5739 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5740 .addReg(RegLo1)
5741 .addImm(0);
5742 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5743 .add(MI.getOperand(0))
5744 .addReg(RegLo)
5745 .addImm(AMDGPU::sub0)
5746 .addReg(RegHi2)
5747 .addImm(AMDGPU::sub1);
5748 MI.eraseFromParent();
5749 return BB;
5750 }
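  // Equivalent pseudocode for the getreg sequence emitted above:
  //   hi1 = getreg(SHADER_CYCLES_HI); lo = getreg(SHADER_CYCLES_LO);
  //   hi2 = getreg(SHADER_CYCLES_HI);
  //   result = ((uint64_t)hi2 << 32) | (hi1 == hi2 ? lo : 0);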
5751 case AMDGPU::SI_INDIRECT_SRC_V1:
5752 case AMDGPU::SI_INDIRECT_SRC_V2:
5753 case AMDGPU::SI_INDIRECT_SRC_V4:
5754 case AMDGPU::SI_INDIRECT_SRC_V8:
5755 case AMDGPU::SI_INDIRECT_SRC_V9:
5756 case AMDGPU::SI_INDIRECT_SRC_V10:
5757 case AMDGPU::SI_INDIRECT_SRC_V11:
5758 case AMDGPU::SI_INDIRECT_SRC_V12:
5759 case AMDGPU::SI_INDIRECT_SRC_V16:
5760 case AMDGPU::SI_INDIRECT_SRC_V32:
5761 return emitIndirectSrc(MI, *BB, *getSubtarget());
5762 case AMDGPU::SI_INDIRECT_DST_V1:
5763 case AMDGPU::SI_INDIRECT_DST_V2:
5764 case AMDGPU::SI_INDIRECT_DST_V4:
5765 case AMDGPU::SI_INDIRECT_DST_V8:
5766 case AMDGPU::SI_INDIRECT_DST_V9:
5767 case AMDGPU::SI_INDIRECT_DST_V10:
5768 case AMDGPU::SI_INDIRECT_DST_V11:
5769 case AMDGPU::SI_INDIRECT_DST_V12:
5770 case AMDGPU::SI_INDIRECT_DST_V16:
5771 case AMDGPU::SI_INDIRECT_DST_V32:
5772 return emitIndirectDst(MI, *BB, *getSubtarget());
5773 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5774 case AMDGPU::SI_KILL_I1_PSEUDO:
5775 return splitKillBlock(MI, BB);
5776 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5778 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5779 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5780
5781 Register Dst = MI.getOperand(0).getReg();
5782 const MachineOperand &Src0 = MI.getOperand(1);
5783 const MachineOperand &Src1 = MI.getOperand(2);
5784 const DebugLoc &DL = MI.getDebugLoc();
5785 Register SrcCond = MI.getOperand(3).getReg();
5786
5787 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5788 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5789 const auto *CondRC = TRI->getWaveMaskRegClass();
5790 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5791
5792 const TargetRegisterClass *Src0RC = Src0.isReg()
5793 ? MRI.getRegClass(Src0.getReg())
5794 : &AMDGPU::VReg_64RegClass;
5795 const TargetRegisterClass *Src1RC = Src1.isReg()
5796 ? MRI.getRegClass(Src1.getReg())
5797 : &AMDGPU::VReg_64RegClass;
5798
5799 const TargetRegisterClass *Src0SubRC =
5800 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5801 const TargetRegisterClass *Src1SubRC =
5802 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5803
5804 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5805 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5806 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5807 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5808
5809 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5810 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5811 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5812 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5813
5814 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5815 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5816 .addImm(0)
5817 .add(Src0Sub0)
5818 .addImm(0)
5819 .add(Src1Sub0)
5820 .addReg(SrcCondCopy);
5821 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5822 .addImm(0)
5823 .add(Src0Sub1)
5824 .addImm(0)
5825 .add(Src1Sub1)
5826 .addReg(SrcCondCopy);
5827
5828 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5829 .addReg(DstLo)
5830 .addImm(AMDGPU::sub0)
5831 .addReg(DstHi)
5832 .addImm(AMDGPU::sub1);
5833 MI.eraseFromParent();
5834 return BB;
5835 }
5836 case AMDGPU::SI_BR_UNDEF: {
5838 const DebugLoc &DL = MI.getDebugLoc();
5839 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5840 .add(MI.getOperand(0));
5841 Br->getOperand(1).setIsUndef(); // read undef SCC
5842 MI.eraseFromParent();
5843 return BB;
5844 }
5845 case AMDGPU::ADJCALLSTACKUP:
5846 case AMDGPU::ADJCALLSTACKDOWN: {
5848 MachineInstrBuilder MIB(*MF, &MI);
5849 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5850 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5851 return BB;
5852 }
5853 case AMDGPU::SI_CALL_ISEL: {
5855 const DebugLoc &DL = MI.getDebugLoc();
5856
5857 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5858
5860 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5861
5862 for (const MachineOperand &MO : MI.operands())
5863 MIB.add(MO);
5864
5865 MIB.cloneMemRefs(MI);
5866 MI.eraseFromParent();
5867 return BB;
5868 }
5869 case AMDGPU::V_ADD_CO_U32_e32:
5870 case AMDGPU::V_SUB_CO_U32_e32:
5871 case AMDGPU::V_SUBREV_CO_U32_e32: {
5872 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5873 const DebugLoc &DL = MI.getDebugLoc();
5874 unsigned Opc = MI.getOpcode();
5875
5876 bool NeedClampOperand = false;
5877 if (TII->pseudoToMCOpcode(Opc) == -1) {
5878      Opc = AMDGPU::getVOPe64(Opc);
5879      NeedClampOperand = true;
5880 }
5881
5882 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5883 if (TII->isVOP3(*I)) {
5884 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5885 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5886 I.addReg(TRI->getVCC(), RegState::Define);
5887 }
5888 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5889 if (NeedClampOperand)
5890 I.addImm(0); // clamp bit for e64 encoding
5891
5892 TII->legalizeOperands(*I);
5893
5894 MI.eraseFromParent();
5895 return BB;
5896 }
5897 case AMDGPU::V_ADDC_U32_e32:
5898 case AMDGPU::V_SUBB_U32_e32:
5899 case AMDGPU::V_SUBBREV_U32_e32:
5900 // These instructions have an implicit use of vcc which counts towards the
5901 // constant bus limit.
5902 TII->legalizeOperands(MI);
5903 return BB;
5904 case AMDGPU::DS_GWS_INIT:
5905 case AMDGPU::DS_GWS_SEMA_BR:
5906 case AMDGPU::DS_GWS_BARRIER:
5907 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5908 [[fallthrough]];
5909 case AMDGPU::DS_GWS_SEMA_V:
5910 case AMDGPU::DS_GWS_SEMA_P:
5911 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5912    // An s_waitcnt 0 is required to be the instruction immediately following.
5913 if (getSubtarget()->hasGWSAutoReplay()) {
5914      bundleInstWithWaitcnt(MI);
5915      return BB;
5916 }
5917
5918 return emitGWSMemViolTestLoop(MI, BB);
5919 case AMDGPU::S_SETREG_B32: {
5920 // Try to optimize cases that only set the denormal mode or rounding mode.
5921 //
5922 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5923 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5924 // instead.
5925 //
5926    // FIXME: This could be predicated on the immediate, but tablegen doesn't
5927    // allow a no-side-effect instruction in the output of a
5928    // side-effecting pattern.
5929 auto [ID, Offset, Width] =
5930 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5931    if (ID != AMDGPU::Hwreg::ID_MODE)
5932      return BB;
5933
5934 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5935 const unsigned SetMask = WidthMask << Offset;
5936
5937 if (getSubtarget()->hasDenormModeInst()) {
5938 unsigned SetDenormOp = 0;
5939 unsigned SetRoundOp = 0;
5940
5941 // The dedicated instructions can only set the whole denorm or round mode
5942 // at once, not a subset of bits in either.
5943      if (SetMask ==
5944          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5945 // If this fully sets both the round and denorm mode, emit the two
5946 // dedicated instructions for these.
5947 SetRoundOp = AMDGPU::S_ROUND_MODE;
5948 SetDenormOp = AMDGPU::S_DENORM_MODE;
5949 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5950 SetRoundOp = AMDGPU::S_ROUND_MODE;
5951 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5952 SetDenormOp = AMDGPU::S_DENORM_MODE;
5953 }
5954
5955 if (SetRoundOp || SetDenormOp) {
5957 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5958 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5959 unsigned ImmVal = Def->getOperand(1).getImm();
5960 if (SetRoundOp) {
5961 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5962 .addImm(ImmVal & 0xf);
5963
5964 // If we also have the denorm mode, get just the denorm mode bits.
5965 ImmVal >>= 4;
5966 }
5967
5968 if (SetDenormOp) {
5969 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5970 .addImm(ImmVal & 0xf);
5971 }
5972
5973 MI.eraseFromParent();
5974 return BB;
5975 }
5976 }
5977 }
5978
5979    // If only FP bits are touched, use the no-side-effects pseudo.
5980 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5981 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5982 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5983
5984 return BB;
5985 }
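  // Illustrative encoding assumed by the immediate splitting above: the FP
  // MODE register keeps the rounding mode in its low 4 bits and the denormal
  // mode in the next 4, so a constant covering exactly those fields becomes
  //   s_round_mode  (Imm & 0xf)
  //   s_denorm_mode ((Imm >> 4) & 0xf)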
5986 case AMDGPU::S_INVERSE_BALLOT_U32:
5987 case AMDGPU::S_INVERSE_BALLOT_U64:
5988 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5989 // necessary. After that they are equivalent to a COPY.
5990 MI.setDesc(TII->get(AMDGPU::COPY));
5991 return BB;
5992 case AMDGPU::ENDPGM_TRAP: {
5993 const DebugLoc &DL = MI.getDebugLoc();
5994 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5995 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5996 MI.addOperand(MachineOperand::CreateImm(0));
5997 return BB;
5998 }
5999
6000 // We need a block split to make the real endpgm a terminator. We also don't
6001 // want to break phis in successor blocks, so we can't just delete to the
6002 // end of the block.
6003
6004 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6006 MF->push_back(TrapBB);
6007 // clang-format off
6008 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6009 .addImm(0);
6010 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6011 .addMBB(TrapBB);
6012 // clang-format on
6013
6014 BB->addSuccessor(TrapBB);
6015 MI.eraseFromParent();
6016 return SplitBB;
6017 }
6018 case AMDGPU::SIMULATED_TRAP: {
6019 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6021 MachineBasicBlock *SplitBB =
6022 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6023 MI.eraseFromParent();
6024 return SplitBB;
6025 }
6026 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6028
6029 // During ISel, it's difficult to propagate the original EXEC mask to use as
6030 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6031 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6032    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6033    Register OriginalExec = Setup->getOperand(0).getReg();
6034 MF->getRegInfo().clearKillFlags(OriginalExec);
6035 MI.getOperand(0).setReg(OriginalExec);
6036 return BB;
6037 }
6038 default:
6039 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6040 if (!MI.mayStore())
6042 return BB;
6043 }
6044    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6045  }
6046}
6047
6048bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6049  // This currently forces unfolding various combinations of fsub into fma with
6050 // free fneg'd operands. As long as we have fast FMA (controlled by
6051 // isFMAFasterThanFMulAndFAdd), we should perform these.
6052
6053 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6054 // most of these combines appear to be cycle neutral but save on instruction
6055 // count / code size.
6056 return true;
6057}
6058
6060
6061EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6062                                         EVT VT) const {
6063 if (!VT.isVector()) {
6064 return MVT::i1;
6065 }
6066 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6067}
6068
6070 // TODO: Should i16 be used always if legal? For now it would force VALU
6071 // shifts.
6072 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6073}
6074
6076 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6077 ? Ty.changeElementSize(16)
6078 : Ty.changeElementSize(32);
6079}
6080
6081// Answering this is somewhat tricky and depends on the specific device, since
6082// different devices have different rates for fma and for all f64 operations.
6083//
6084// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6085// regardless of which device (although the number of cycles differs between
6086// devices), so it is always profitable for f64.
6087//
6088// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6089// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6090// which we can always do even without fused FP ops since it returns the same
6091// result as the separate operations and since it is always full
6092// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6093// however does not support denormals, so we do report fma as faster if we have
6094// a fast fma device and require denormals.
6095//
6097 EVT VT) const {
6098 VT = VT.getScalarType();
6099
6100 switch (VT.getSimpleVT().SimpleTy) {
6101 case MVT::f32: {
6102 // If mad is not available this depends only on if f32 fma is full rate.
6103 if (!Subtarget->hasMadMacF32Insts())
6104 return Subtarget->hasFastFMAF32();
6105
6106 // Otherwise f32 mad is always full rate and returns the same result as
6107    // the separate operations, so it should be preferred over fma.
6108    // However, it does not support denormals.
6109    if (!denormalModeIsFlushAllF32(MF))
6110 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6111
6112 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6113 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6114 }
6115 case MVT::f64:
6116 return true;
6117 case MVT::f16:
6118 case MVT::bf16:
6119 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6120 default:
6121 break;
6122 }
6123
6124 return false;
6125}
6126
6128 LLT Ty) const {
6129 switch (Ty.getScalarSizeInBits()) {
6130 case 16:
6131 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6132 case 32:
6133 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6134 case 64:
6135 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6136 default:
6137 break;
6138 }
6139
6140 return false;
6141}
6142
6144 if (!Ty.isScalar())
6145 return false;
6146
6147 if (Ty.getScalarSizeInBits() == 16)
6148 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6149 if (Ty.getScalarSizeInBits() == 32)
6150 return Subtarget->hasMadMacF32Insts() &&
6151 denormalModeIsFlushAllF32(*MI.getMF());
6152
6153 return false;
6154}
6155
6157 const SDNode *N) const {
6158 // TODO: Check future ftz flag
6159 // v_mad_f32/v_mac_f32 do not support denormals.
6160 EVT VT = N->getValueType(0);
6161 if (VT == MVT::f32)
6162 return Subtarget->hasMadMacF32Insts() &&
6164 if (VT == MVT::f16) {
6165 return Subtarget->hasMadF16() &&
6167 }
6168
6169 return false;
6170}
6171
6172//===----------------------------------------------------------------------===//
6173// Custom DAG Lowering Operations
6174//===----------------------------------------------------------------------===//
6175
6176// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6177// wider vector type is legal.
6179 SelectionDAG &DAG) const {
6180 unsigned Opc = Op.getOpcode();
6181 EVT VT = Op.getValueType();
6182 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6183 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6184 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6185 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6186
6187 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6188
6189 SDLoc SL(Op);
6190 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6191 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6192
6193 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6194}
6195
6196// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6197// wider vector type is legal.
6199 SelectionDAG &DAG) const {
6200 unsigned Opc = Op.getOpcode();
6201 EVT VT = Op.getValueType();
6202 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6203 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6204 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6205 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6206 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6207 VT == MVT::v32bf16);
6208
6209 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6210 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6211
6212 SDLoc SL(Op);
6213
6214 SDValue OpLo =
6215 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6216 SDValue OpHi =
6217 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6218
6219 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6220}
6221
6223 SelectionDAG &DAG) const {
6224 unsigned Opc = Op.getOpcode();
6225 EVT VT = Op.getValueType();
6226 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6227 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6228 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6229 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6230 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6231 VT == MVT::v32bf16);
6232
6233 SDValue Op0 = Op.getOperand(0);
6234 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6235 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6236 : std::pair(Op0, Op0);
6237
6238 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6239 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6240
6241 SDLoc SL(Op);
6242 auto ResVT = DAG.GetSplitDestVTs(VT);
6243
6244 SDValue OpLo =
6245 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6246 SDValue OpHi =
6247 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6248
6249 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6250}
6251
6253 switch (Op.getOpcode()) {
6254 default:
6256 case ISD::BRCOND:
6257 return LowerBRCOND(Op, DAG);
6258 case ISD::RETURNADDR:
6259 return LowerRETURNADDR(Op, DAG);
6260 case ISD::LOAD: {
6261 SDValue Result = LowerLOAD(Op, DAG);
6262 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6263 "Load should return a value and a chain");
6264 return Result;
6265 }
6266 case ISD::FSQRT: {
6267 EVT VT = Op.getValueType();
6268 if (VT == MVT::f32)
6269 return lowerFSQRTF32(Op, DAG);
6270 if (VT == MVT::f64)
6271 return lowerFSQRTF64(Op, DAG);
6272 return SDValue();
6273 }
6274 case ISD::FSIN:
6275 case ISD::FCOS:
6276 return LowerTrig(Op, DAG);
6277 case ISD::SELECT:
6278 return LowerSELECT(Op, DAG);
6279 case ISD::FDIV:
6280 return LowerFDIV(Op, DAG);
6281 case ISD::FFREXP:
6282 return LowerFFREXP(Op, DAG);
6284 return LowerATOMIC_CMP_SWAP(Op, DAG);
6285 case ISD::STORE:
6286 return LowerSTORE(Op, DAG);
6287 case ISD::GlobalAddress: {
6290 return LowerGlobalAddress(MFI, Op, DAG);
6291 }
6293 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6295 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6297 return LowerINTRINSIC_VOID(Op, DAG);
6298 case ISD::ADDRSPACECAST:
6299 return lowerADDRSPACECAST(Op, DAG);
6301 return lowerINSERT_SUBVECTOR(Op, DAG);
6303 return lowerINSERT_VECTOR_ELT(Op, DAG);
6305 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6307 return lowerVECTOR_SHUFFLE(Op, DAG);
6309 return lowerSCALAR_TO_VECTOR(Op, DAG);
6310 case ISD::BUILD_VECTOR:
6311 return lowerBUILD_VECTOR(Op, DAG);
6312 case ISD::FP_ROUND:
6314 return lowerFP_ROUND(Op, DAG);
6315 case ISD::TRAP:
6316 return lowerTRAP(Op, DAG);
6317 case ISD::DEBUGTRAP:
6318 return lowerDEBUGTRAP(Op, DAG);
6319 case ISD::ABS:
6320 case ISD::FABS:
6321 case ISD::FNEG:
6322 case ISD::FCANONICALIZE:
6323 case ISD::BSWAP:
6324 return splitUnaryVectorOp(Op, DAG);
6325 case ISD::FMINNUM:
6326 case ISD::FMAXNUM:
6327 return lowerFMINNUM_FMAXNUM(Op, DAG);
6328 case ISD::FMINIMUMNUM:
6329 case ISD::FMAXIMUMNUM:
6330 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6331 case ISD::FMINIMUM:
6332 case ISD::FMAXIMUM:
6333 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6334 case ISD::FLDEXP:
6335 case ISD::STRICT_FLDEXP:
6336 return lowerFLDEXP(Op, DAG);
6337 case ISD::FMA:
6338 return splitTernaryVectorOp(Op, DAG);
6339 case ISD::FP_TO_SINT:
6340 case ISD::FP_TO_UINT:
6341 return LowerFP_TO_INT(Op, DAG);
6342 case ISD::SHL:
6343 case ISD::SRA:
6344 case ISD::SRL:
6345 case ISD::ADD:
6346 case ISD::SUB:
6347 case ISD::SMIN:
6348 case ISD::SMAX:
6349 case ISD::UMIN:
6350 case ISD::UMAX:
6351 case ISD::FADD:
6352 case ISD::FMUL:
6353 case ISD::FMINNUM_IEEE:
6354 case ISD::FMAXNUM_IEEE:
6355 case ISD::UADDSAT:
6356 case ISD::USUBSAT:
6357 case ISD::SADDSAT:
6358 case ISD::SSUBSAT:
6359 return splitBinaryVectorOp(Op, DAG);
6360 case ISD::FCOPYSIGN:
6361 return lowerFCOPYSIGN(Op, DAG);
6362 case ISD::MUL:
6363 return lowerMUL(Op, DAG);
6364 case ISD::SMULO:
6365 case ISD::UMULO:
6366 return lowerXMULO(Op, DAG);
6367 case ISD::SMUL_LOHI:
6368 case ISD::UMUL_LOHI:
6369 return lowerXMUL_LOHI(Op, DAG);
6371 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6372 case ISD::STACKSAVE:
6373 return LowerSTACKSAVE(Op, DAG);
6374 case ISD::GET_ROUNDING:
6375 return lowerGET_ROUNDING(Op, DAG);
6376 case ISD::SET_ROUNDING:
6377 return lowerSET_ROUNDING(Op, DAG);
6378 case ISD::PREFETCH:
6379 return lowerPREFETCH(Op, DAG);
6380 case ISD::FP_EXTEND:
6382 return lowerFP_EXTEND(Op, DAG);
6383 case ISD::GET_FPENV:
6384 return lowerGET_FPENV(Op, DAG);
6385 case ISD::SET_FPENV:
6386 return lowerSET_FPENV(Op, DAG);
6387 }
6388 return SDValue();
6389}
6390
6391// Used for D16: Casts the result of an instruction into the right vector,
6392// packs values if loads return unpacked values.
6394 const SDLoc &DL, SelectionDAG &DAG,
6395 bool Unpacked) {
6396 if (!LoadVT.isVector())
6397 return Result;
6398
6399  // Cast back to the original packed type or to a larger type that is a
6400  // multiple of 32 bits for D16. Widening the return type is required for
6401  // legalization.
6402 EVT FittingLoadVT = LoadVT;
6403 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6404 FittingLoadVT =
6406 LoadVT.getVectorNumElements() + 1);
6407 }
6408
6409 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6410 // Truncate to v2i16/v4i16.
6411 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6412
6413    // Work around the legalizer not scalarizing truncate after vector op
6414    // legalization by not creating an intermediate vector trunc.
6416 DAG.ExtractVectorElements(Result, Elts);
6417 for (SDValue &Elt : Elts)
6418 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6419
6420    // Pad illegal v1i16/v3f16 to v4i16
6421 if ((LoadVT.getVectorNumElements() % 2) == 1)
6422 Elts.push_back(DAG.getPOISON(MVT::i16));
6423
6424 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6425
6426 // Bitcast to original type (v2f16/v4f16).
6427 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6428 }
6429
6430 // Cast back to the original packed type.
6431 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6432}
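// Example of the D16 adjustment above on an unpacked-D16 target: a v3f16 load
// comes back as v3i32 with each half value in the low 16 bits of a lane; the
// elements are truncated to i16, padded to v4i16, and bitcast back to the
// (widened) v4f16 result type.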
6433
6434SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6435 SelectionDAG &DAG,
6437 bool IsIntrinsic) const {
6438 SDLoc DL(M);
6439
6440 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6441 EVT LoadVT = M->getValueType(0);
6442
6443 EVT EquivLoadVT = LoadVT;
6444 if (LoadVT.isVector()) {
6445 if (Unpacked) {
6446 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6447 LoadVT.getVectorNumElements());
6448 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6449 // Widen v3f16 to legal type
6450 EquivLoadVT =
6452 LoadVT.getVectorNumElements() + 1);
6453 }
6454 }
6455
6456 // Change from v4f16/v2f16 to EquivLoadVT.
6457 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6458
6460 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6461 M->getMemoryVT(), M->getMemOperand());
6462
6463 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6464
6465 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6466}
6467
6468SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6469 SelectionDAG &DAG,
6470 ArrayRef<SDValue> Ops) const {
6471 SDLoc DL(M);
6472 EVT LoadVT = M->getValueType(0);
6473 EVT EltType = LoadVT.getScalarType();
6474 EVT IntVT = LoadVT.changeTypeToInteger();
6475
6476 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6477
6478 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6479 bool IsTFE = M->getNumValues() == 3;
6480
6481 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6485
6486 if (IsD16) {
6487 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6488 }
6489
6490 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6491 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6492 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6493 IsTFE);
6494
6495 if (isTypeLegal(LoadVT)) {
6496 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6497 M->getMemOperand(), DAG);
6498 }
6499
6500 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6501 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6502 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6503 M->getMemOperand(), DAG);
6504 return DAG.getMergeValues(
6505 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6506 DL);
6507}
6508
6510 SelectionDAG &DAG) {
6511 EVT VT = N->getValueType(0);
6512 unsigned CondCode = N->getConstantOperandVal(3);
6513 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6514 return DAG.getPOISON(VT);
6515
6516 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6517
6518 SDValue LHS = N->getOperand(1);
6519 SDValue RHS = N->getOperand(2);
6520
6521 SDLoc DL(N);
6522
6523 EVT CmpVT = LHS.getValueType();
6524 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6525 unsigned PromoteOp =
6527 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6528 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6529 }
6530
6531 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6532
6533 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6534 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6535
6536 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6537 DAG.getCondCode(CCOpcode));
6538 if (VT.bitsEq(CCVT))
6539 return SetCC;
6540 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6541}
6542
6544 SelectionDAG &DAG) {
6545 EVT VT = N->getValueType(0);
6546
6547 unsigned CondCode = N->getConstantOperandVal(3);
6548 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6549 return DAG.getPOISON(VT);
6550
6551 SDValue Src0 = N->getOperand(1);
6552 SDValue Src1 = N->getOperand(2);
6553 EVT CmpVT = Src0.getValueType();
6554 SDLoc SL(N);
6555
6556 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6557 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6558 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6559 }
6560
6561 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6562 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6563 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6564 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6565 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6566 DAG.getCondCode(CCOpcode));
6567 if (VT.bitsEq(CCVT))
6568 return SetCC;
6569 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6570}
6571
6573 SelectionDAG &DAG) {
6574 EVT VT = N->getValueType(0);
6575 SDValue Src = N->getOperand(1);
6576 SDLoc SL(N);
6577
6578 if (Src.getOpcode() == ISD::SETCC) {
6579 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6580 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6581 Src.getOperand(1), Src.getOperand(2));
6582 }
6583 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6584 // (ballot 0) -> 0
6585 if (Arg->isZero())
6586 return DAG.getConstant(0, SL, VT);
6587
6588 // (ballot 1) -> EXEC/EXEC_LO
6589 if (Arg->isOne()) {
6590 Register Exec;
6591 if (VT.getScalarSizeInBits() == 32)
6592 Exec = AMDGPU::EXEC_LO;
6593 else if (VT.getScalarSizeInBits() == 64)
6594 Exec = AMDGPU::EXEC;
6595 else
6596 return SDValue();
6597
6598 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6599 }
6600 }
6601
6602 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6603 // ISD::SETNE)
6604 return DAG.getNode(
6605 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6606 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6607}
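// Example folds performed above: ballot(i1 true) becomes a copy of EXEC
// (EXEC_LO for a 32-bit result), ballot(i1 false) folds to 0, and
// ballot(setcc a, b, cc) becomes a single AMDGPUISD::SETCC producing the lane
// mask directly.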
6608
6610 SelectionDAG &DAG) {
6611 EVT VT = N->getValueType(0);
6612 unsigned ValSize = VT.getSizeInBits();
6613 unsigned IID = N->getConstantOperandVal(0);
6614 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6615 IID == Intrinsic::amdgcn_permlanex16;
6616 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6617 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6618 SDLoc SL(N);
6619 MVT IntVT = MVT::getIntegerVT(ValSize);
6620 const GCNSubtarget *ST = TLI.getSubtarget();
6621 unsigned SplitSize = 32;
6622 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6623 ST->hasDPALU_DPP() &&
6624 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
6625 SplitSize = 64;
6626
6627 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6628 SDValue Src2, MVT ValT) -> SDValue {
6630 switch (IID) {
6631 case Intrinsic::amdgcn_permlane16:
6632 case Intrinsic::amdgcn_permlanex16:
6633 case Intrinsic::amdgcn_update_dpp:
6634 Operands.push_back(N->getOperand(6));
6635 Operands.push_back(N->getOperand(5));
6636 Operands.push_back(N->getOperand(4));
6637 [[fallthrough]];
6638 case Intrinsic::amdgcn_writelane:
6639 Operands.push_back(Src2);
6640 [[fallthrough]];
6641 case Intrinsic::amdgcn_readlane:
6642 case Intrinsic::amdgcn_set_inactive:
6643 case Intrinsic::amdgcn_set_inactive_chain_arg:
6644 case Intrinsic::amdgcn_mov_dpp8:
6645 Operands.push_back(Src1);
6646 [[fallthrough]];
6647 case Intrinsic::amdgcn_readfirstlane:
6648 case Intrinsic::amdgcn_permlane64:
6649 Operands.push_back(Src0);
6650 break;
6651 default:
6652 llvm_unreachable("unhandled lane op");
6653 }
6654
6655 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6656 std::reverse(Operands.begin(), Operands.end());
6657
6658 if (SDNode *GL = N->getGluedNode()) {
6659 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6660 GL = GL->getOperand(0).getNode();
6661 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6662 SDValue(GL, 0)));
6663 }
6664
6665 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6666 };
6667
6668 SDValue Src0 = N->getOperand(1);
6669 SDValue Src1, Src2;
6670 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6671 IID == Intrinsic::amdgcn_mov_dpp8 ||
6672 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6673 Src1 = N->getOperand(2);
6674 if (IID == Intrinsic::amdgcn_writelane ||
6675 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6676 Src2 = N->getOperand(3);
6677 }
6678
6679 if (ValSize == SplitSize) {
6680 // Already legal
6681 return SDValue();
6682 }
6683
6684 if (ValSize < 32) {
6685 bool IsFloat = VT.isFloatingPoint();
6686 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6687 SL, MVT::i32);
6688
6689 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6690 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6691 SL, MVT::i32);
6692 }
6693
6694 if (IID == Intrinsic::amdgcn_writelane) {
6695 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6696 SL, MVT::i32);
6697 }
6698
6699 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6700 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6701 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6702 }
6703
6704 if (ValSize % SplitSize != 0)
6705 return SDValue();
6706
6707 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6708 EVT VT = N->getValueType(0);
6709 unsigned NE = VT.getVectorNumElements();
6710 EVT EltVT = VT.getVectorElementType();
6712 unsigned NumOperands = N->getNumOperands();
6713 SmallVector<SDValue, 4> Operands(NumOperands);
6714 SDNode *GL = N->getGluedNode();
6715
6716 // only handle convergencectrl_glue
6717    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6718
6719 for (unsigned i = 0; i != NE; ++i) {
6720 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6721 ++j) {
6722 SDValue Operand = N->getOperand(j);
6723 EVT OperandVT = Operand.getValueType();
6724 if (OperandVT.isVector()) {
6725 // A vector operand; extract a single element.
6726 EVT OperandEltVT = OperandVT.getVectorElementType();
6727 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6728 Operand, DAG.getVectorIdxConstant(i, SL));
6729 } else {
6730 // A scalar operand; just use it as is.
6731 Operands[j] = Operand;
6732 }
6733 }
6734
6735 if (GL)
6736 Operands[NumOperands - 1] =
6737 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6738 SDValue(GL->getOperand(0).getNode(), 0));
6739
6740 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6741 }
6742
6743 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6744 return DAG.getBuildVector(VecVT, SL, Scalars);
6745 };
6746
6747 if (VT.isVector()) {
6748 switch (MVT::SimpleValueType EltTy =
6750 case MVT::i32:
6751 case MVT::f32:
6752 if (SplitSize == 32) {
6753 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6754 return unrollLaneOp(LaneOp.getNode());
6755 }
6756 [[fallthrough]];
6757 case MVT::i16:
6758 case MVT::f16:
6759 case MVT::bf16: {
6760 unsigned SubVecNumElt =
6761 SplitSize / VT.getVectorElementType().getSizeInBits();
6762 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6764 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6765 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6766 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6767 DAG.getConstant(EltIdx, SL, MVT::i32));
6768
6769 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6770 IsPermLane16)
6771 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6772 DAG.getConstant(EltIdx, SL, MVT::i32));
6773
6774 if (IID == Intrinsic::amdgcn_writelane)
6775 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6776 DAG.getConstant(EltIdx, SL, MVT::i32));
6777
6778 Pieces.push_back(
6779 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6780 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6781 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6782 EltIdx += SubVecNumElt;
6783 }
6784 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6785 }
6786 default:
6787 // Handle all other cases by bitcasting to i32 vectors
6788 break;
6789 }
6790 }
6791
6792 MVT VecVT =
6793 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6794 Src0 = DAG.getBitcast(VecVT, Src0);
6795
6796 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6797 Src1 = DAG.getBitcast(VecVT, Src1);
6798
6799 if (IID == Intrinsic::amdgcn_writelane)
6800 Src2 = DAG.getBitcast(VecVT, Src2);
6801
6802 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6803 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6804 return DAG.getBitcast(VT, UnrolledLaneOp);
6805}
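// For example, a 64-bit readlane where only 32-bit lane ops are legal is
// handled above by bitcasting the source to v2i32, emitting one 32-bit
// readlane per element, and bitcasting the rebuilt vector back to the
// original type.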
6806
6809 SelectionDAG &DAG) const {
6810 switch (N->getOpcode()) {
6812 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6813 Results.push_back(Res);
6814 return;
6815 }
6817 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6818 Results.push_back(Res);
6819 return;
6820 }
6822 unsigned IID = N->getConstantOperandVal(0);
6823 switch (IID) {
6824 case Intrinsic::amdgcn_make_buffer_rsrc:
6825 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6826 return;
6827 case Intrinsic::amdgcn_cvt_pkrtz: {
6828 SDValue Src0 = N->getOperand(1);
6829 SDValue Src1 = N->getOperand(2);
6830 SDLoc SL(N);
6831 SDValue Cvt =
6832 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6833 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6834 return;
6835 }
6836 case Intrinsic::amdgcn_cvt_pknorm_i16:
6837 case Intrinsic::amdgcn_cvt_pknorm_u16:
6838 case Intrinsic::amdgcn_cvt_pk_i16:
6839 case Intrinsic::amdgcn_cvt_pk_u16: {
6840 SDValue Src0 = N->getOperand(1);
6841 SDValue Src1 = N->getOperand(2);
6842 SDLoc SL(N);
6843 unsigned Opcode;
6844
6845 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6847 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6849 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6851 else
6853
6854 EVT VT = N->getValueType(0);
6855 if (isTypeLegal(VT))
6856 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6857 else {
6858 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6859 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6860 }
6861 return;
6862 }
6863 case Intrinsic::amdgcn_s_buffer_load: {
6864 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6865      // s_buffer_load_u8 for both signed and unsigned load instructions. Next,
6866      // the DAG combiner tries to merge the s_buffer_load_u8 with a sext
6867      // instruction (performSignExtendInRegCombine()), replacing it with
6868 // s_buffer_load_i8.
6869 if (!Subtarget->hasScalarSubwordLoads())
6870 return;
6871 SDValue Op = SDValue(N, 0);
6872 SDValue Rsrc = Op.getOperand(1);
6873 SDValue Offset = Op.getOperand(2);
6874 SDValue CachePolicy = Op.getOperand(3);
6875 EVT VT = Op.getValueType();
6876 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6877 SDLoc DL(Op);
6879 const DataLayout &DataLayout = DAG.getDataLayout();
6880 Align Alignment =
6886 VT.getStoreSize(), Alignment);
6887 SDValue LoadVal;
6888 if (!Offset->isDivergent()) {
6889 SDValue Ops[] = {Rsrc, // source register
6890 Offset, CachePolicy};
6891 SDValue BufferLoad =
6893 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6894 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6895 } else {
6896 SDValue Ops[] = {
6897 DAG.getEntryNode(), // Chain
6898 Rsrc, // rsrc
6899 DAG.getConstant(0, DL, MVT::i32), // vindex
6900 {}, // voffset
6901 {}, // soffset
6902 {}, // offset
6903 CachePolicy, // cachepolicy
6904 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6905 };
6906 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6907 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6908 }
6909 Results.push_back(LoadVal);
6910 return;
6911 }
6912 case Intrinsic::amdgcn_dead: {
6913 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6914 Results.push_back(DAG.getPOISON(N->getValueType(I)));
6915 return;
6916 }
6917 }
6918 break;
6919 }
6921 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6922 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6923 // FIXME: Hacky
6924 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6925 Results.push_back(Res.getOperand(I));
6926 }
6927 } else {
6928 Results.push_back(Res);
6929 Results.push_back(Res.getValue(1));
6930 }
6931 return;
6932 }
6933
6934 break;
6935 }
6936 case ISD::SELECT: {
6937 SDLoc SL(N);
6938 EVT VT = N->getValueType(0);
6939 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6940 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6941 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6942
6943 EVT SelectVT = NewVT;
6944 if (NewVT.bitsLT(MVT::i32)) {
6945 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6946 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6947 SelectVT = MVT::i32;
6948 }
6949
6950 SDValue NewSelect =
6951 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6952
6953 if (NewVT != SelectVT)
6954 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6955 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6956 return;
6957 }
6958 case ISD::FNEG: {
6959 if (N->getValueType(0) != MVT::v2f16)
6960 break;
6961
6962 SDLoc SL(N);
6963 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6964
6965 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6966 DAG.getConstant(0x80008000, SL, MVT::i32));
6967 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6968 return;
6969 }
6970 case ISD::FABS: {
6971 if (N->getValueType(0) != MVT::v2f16)
6972 break;
6973
6974 SDLoc SL(N);
6975 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6976
6977 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6978 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6979 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6980 return;
6981 }
6982 case ISD::FSQRT: {
6983 if (N->getValueType(0) != MVT::f16)
6984 break;
6985 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6986 break;
6987 }
6988 default:
6989 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6990 break;
6991 }
6992}
6993
6994/// Helper function for LowerBRCOND
6995static SDNode *findUser(SDValue Value, unsigned Opcode) {
6996
6997 for (SDUse &U : Value->uses()) {
6998 if (U.get() != Value)
6999 continue;
7000
7001 if (U.getUser()->getOpcode() == Opcode)
7002 return U.getUser();
7003 }
7004 return nullptr;
7005}
7006
7007unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7008 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7009 switch (Intr->getConstantOperandVal(1)) {
7010 case Intrinsic::amdgcn_if:
7011 return AMDGPUISD::IF;
7012 case Intrinsic::amdgcn_else:
7013 return AMDGPUISD::ELSE;
7014 case Intrinsic::amdgcn_loop:
7015 return AMDGPUISD::LOOP;
7016 case Intrinsic::amdgcn_end_cf:
7017 llvm_unreachable("should not occur");
7018 default:
7019 return 0;
7020 }
7021 }
7022
7023 // break, if_break, else_break are all only used as inputs to loop, not
7024 // directly as branch conditions.
7025 return 0;
7026}
7027
7028 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7029 const Triple &TT = getTargetMachine().getTargetTriple();
7030 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7031 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7032 AMDGPU::shouldEmitConstantsToTextSection(TT);
7033}
7034
7035 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7036 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7037 return false;
7038
7039 // FIXME: Either avoid relying on address space here or change the default
7040 // address space for functions to avoid the explicit check.
7041 return (GV->getValueType()->isFunctionTy() ||
7042 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
7043 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
7044}
7045
7046 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7047 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7048}
7049
7050 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7051 if (!GV->hasExternalLinkage())
7052 return true;
7053
7054 const auto OS = getTargetMachine().getTargetTriple().getOS();
7055 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7056}
7057
7058 /// This transforms the control flow intrinsics to get the branch destination as
7059 /// the last parameter, and also switches the branch target with BR if needed.
7060SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7061 SDLoc DL(BRCOND);
7062
7063 SDNode *Intr = BRCOND.getOperand(1).getNode();
7064 SDValue Target = BRCOND.getOperand(2);
7065 SDNode *BR = nullptr;
7066 SDNode *SetCC = nullptr;
7067
7068 if (Intr->getOpcode() == ISD::SETCC) {
7069 // As long as we negate the condition everything is fine
7070 SetCC = Intr;
7071 Intr = SetCC->getOperand(0).getNode();
7072
7073 } else {
7074 // Get the target from BR if we don't negate the condition
7075 BR = findUser(BRCOND, ISD::BR);
7076 assert(BR && "brcond missing unconditional branch user");
7077 Target = BR->getOperand(1);
7078 }
7079
7080 unsigned CFNode = isCFIntrinsic(Intr);
7081 if (CFNode == 0) {
7082 // This is a uniform branch so we don't need to legalize.
7083 return BRCOND;
7084 }
7085
7086 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7087 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7088
7089 assert(!SetCC ||
7090 (SetCC->getConstantOperandVal(1) == 1 &&
7091 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7092 ISD::SETNE));
7093
7094 // operands of the new intrinsic call
7095 SmallVector<SDValue, 8> Ops;
7096 if (HaveChain)
7097 Ops.push_back(BRCOND.getOperand(0));
7098
7099 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7100 Ops.push_back(Target);
7101
7102 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7103
7104 // build the new intrinsic call
7105 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7106
7107 if (!HaveChain) {
7108 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7109
7110 Result = DAG.getMergeValues(Ops, DL).getNode();
7111 }
7112
7113 if (BR) {
7114 // Give the branch instruction our target
7115 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7116 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7117 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7118 }
7119
7120 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7121
7122 // Copy the intrinsic results to registers
7123 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7124 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7125 if (!CopyToReg)
7126 continue;
7127
7128 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7129 SDValue(Result, i - 1), SDValue());
7130
7131 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7132 }
7133
7134 // Remove the old intrinsic from the chain
7135 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7136 Intr->getOperand(0));
7137
7138 return Chain;
7139}
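// A rough sketch of what LowerBRCOND does to a divergent branch (operand
// order simplified for illustration, not the exact node layout):
//
//   before:  t0,ch = INTRINSIC_W_CHAIN ch, amdgcn.if, %cond
//            brcond t0, %bb.target
//
//   after:   t0,ch = AMDGPUISD::IF ch, %cond, %bb.target
//
// i.e. the branch destination becomes the last operand of the control-flow
// node, the BRCOND disappears, and CopyToReg users of the intrinsic's other
// results are rewired onto the new node's chain.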
7140
7141SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7142 MVT VT = Op.getSimpleValueType();
7143 SDLoc DL(Op);
7144 // Checking the depth
7145 if (Op.getConstantOperandVal(0) != 0)
7146 return DAG.getConstant(0, DL, VT);
7147
7148 MachineFunction &MF = DAG.getMachineFunction();
7149 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7150 // Check for kernel and shader functions
7151 if (Info->isEntryFunction())
7152 return DAG.getConstant(0, DL, VT);
7153
7154 MachineFrameInfo &MFI = MF.getFrameInfo();
7155 // There is a call to @llvm.returnaddress in this function
7156 MFI.setReturnAddressIsTaken(true);
7157
7159 // Get the return address reg and mark it as an implicit live-in
7160 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7161 getRegClassFor(VT, Op.getNode()->isDivergent()));
7162
7163 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7164}
7165
7166SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7167 const SDLoc &DL, EVT VT) const {
7168 return Op.getValueType().bitsLE(VT)
7169 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7170 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7171 DAG.getTargetConstant(0, DL, MVT::i32));
7172}
7173
7174SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7175 SelectionDAG &DAG) const {
7176 EVT DstVT = Op.getValueType();
7177 unsigned NumElts = DstVT.getVectorNumElements();
7178 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7179
7180 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7181
7182 SDLoc DL(Op);
7183 unsigned Opc = Op.getOpcode();
7184 SDValue Flags = Op.getOperand(1);
7185 EVT HalfDstVT =
7186 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7187 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7188 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7189
7190 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7191}
7192
7193SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7194 SDValue Src = Op.getOperand(0);
7195 EVT SrcVT = Src.getValueType();
7196 EVT DstVT = Op.getValueType();
7197
7198 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7199 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7200 if (SrcVT.getScalarType() != MVT::f32)
7201 return SDValue();
7202 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7203 }
7204
7205 if (SrcVT.getScalarType() != MVT::f64)
7206 return Op;
7207
7208 SDLoc DL(Op);
7209 if (DstVT == MVT::f16) {
7210 // TODO: Handle strictfp
7211 if (Op.getOpcode() != ISD::FP_ROUND)
7212 return Op;
7213
7214 if (!Subtarget->has16BitInsts()) {
7215 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7216 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7217 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7218 }
7219 if (Op->getFlags().hasApproximateFuncs()) {
7220 SDValue Flags = Op.getOperand(1);
7221 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7222 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7223 }
7224 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7225 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7226 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7227 }
7228
7229 assert(DstVT.getScalarType() == MVT::bf16 &&
7230 "custom lower FP_ROUND for f16 or bf16");
7231 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7232
7233 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7234 // hardware f32 -> bf16 instruction.
7235 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7236 MVT::f32;
7237 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7238 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7239 DAG.getTargetConstant(0, DL, MVT::i32));
7240}
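// A minimal pseudo-IR sketch of the f64 -> bf16 path above, assuming a
// scalar source:
//
//   %f32  = round-inexact-to-odd %f64     ; expandRoundInexactToOdd
//   %bf16 = fp_round %f32, 0              ; hardware f32 -> bf16 conversion
//
// Rounding the intermediate f32 to odd preserves the sticky information, so
// the final f32 -> bf16 rounding step cannot double-round.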
7241
7242SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7243 SelectionDAG &DAG) const {
7244 EVT VT = Op.getValueType();
7245 const MachineFunction &MF = DAG.getMachineFunction();
7246 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7247 bool IsIEEEMode = Info->getMode().IEEE;
7248
7249 // FIXME: Assert during selection that this is only selected for
7250 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7251 // mode functions, but this happens to be OK since it's only done in cases
7252 // where there is known no sNaN.
7253 if (IsIEEEMode)
7254 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7255
7256 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7257 VT == MVT::v16bf16)
7258 return splitBinaryVectorOp(Op, DAG);
7259 return Op;
7260}
7261
7262SDValue
7263SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7264 SelectionDAG &DAG) const {
7265 EVT VT = Op.getValueType();
7266 const MachineFunction &MF = DAG.getMachineFunction();
7267 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7268 bool IsIEEEMode = Info->getMode().IEEE;
7269
7270 if (IsIEEEMode)
7271 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7272
7273 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7274 VT == MVT::v16bf16)
7275 return splitBinaryVectorOp(Op, DAG);
7276 return Op;
7277}
7278
7279SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7280 SelectionDAG &DAG) const {
7281 EVT VT = Op.getValueType();
7282 if (VT.isVector())
7283 return splitBinaryVectorOp(Op, DAG);
7284
7285 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7286 !Subtarget->hasMinimum3Maximum3F16() &&
7287 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7288 "should not need to widen f16 minimum/maximum to v2f16");
7289
7290 // Widen f16 operation to v2f16
7291
7292 // fminimum f16:x, f16:y ->
7293 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7294 // (v2f16 (scalar_to_vector y))), 0
7295 SDLoc SL(Op);
7296 SDValue WideSrc0 =
7297 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7298 SDValue WideSrc1 =
7299 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7300
7301 SDValue Widened =
7302 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7303
7304 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7305 DAG.getConstant(0, SL, MVT::i32));
7306}
7307
7308SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7309 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7310 EVT VT = Op.getValueType();
7311 assert(VT == MVT::f16);
7312
7313 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7314 EVT ExpVT = Exp.getValueType();
7315 if (ExpVT == MVT::i16)
7316 return Op;
7317
7318 SDLoc DL(Op);
7319
7320 // Correct the exponent type for f16 to i16.
7321 // Clamp the range of the exponent to the instruction's range.
7322
7323 // TODO: This should be a generic narrowing legalization, and can easily be
7324 // done for GlobalISel as well.
7325
7326 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7327 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7328
7329 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7330 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7331
7332 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7333
7334 if (IsStrict) {
7335 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7336 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7337 }
7338
7339 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7340}
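// Worked example of the exponent clamp above, for an f16 ldexp with an i32
// exponent (values follow from minIntN(16)/maxIntN(16)):
//
//   exp' = smin(smax(exp, -32768), 32767)
//   res  = fldexp f16 %x, (trunc exp' to i16)
//
// Any exponent outside the clamped range already over/underflows an f16
// result, so narrowing the operand to i16 does not change the value.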
7341
7342 static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7343 switch (Op->getOpcode()) {
7344 case ISD::SRA:
7345 case ISD::SMIN:
7346 case ISD::SMAX:
7347 return ISD::SIGN_EXTEND;
7348 case ISD::SRL:
7349 case ISD::UMIN:
7350 case ISD::UMAX:
7351 return ISD::ZERO_EXTEND;
7352 case ISD::ADD:
7353 case ISD::SUB:
7354 case ISD::AND:
7355 case ISD::OR:
7356 case ISD::XOR:
7357 case ISD::SHL:
7358 case ISD::SELECT:
7359 case ISD::MUL:
7360 // operation result won't be influenced by garbage high bits.
7361 // TODO: are all of those cases correct, and are there more?
7362 return ISD::ANY_EXTEND;
7363 case ISD::SETCC: {
7364 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7365 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7366 }
7367 default:
7368 llvm_unreachable("unexpected opcode!");
7369 }
7370}
7371
7372SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7373 DAGCombinerInfo &DCI) const {
7374 const unsigned Opc = Op.getOpcode();
7375 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7376 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7377 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7378 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7379 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7380
7381 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7382 : Op->getOperand(0).getValueType();
7383 auto ExtTy = OpTy.changeElementType(MVT::i32);
7384
7385 if (DCI.isBeforeLegalizeOps() ||
7386 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7387 return SDValue();
7388
7389 auto &DAG = DCI.DAG;
7390
7391 SDLoc DL(Op);
7392 SDValue LHS;
7393 SDValue RHS;
7394 if (Opc == ISD::SELECT) {
7395 LHS = Op->getOperand(1);
7396 RHS = Op->getOperand(2);
7397 } else {
7398 LHS = Op->getOperand(0);
7399 RHS = Op->getOperand(1);
7400 }
7401
7402 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7403 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7404
7405 // Special case: for shifts, the RHS always needs a zext.
7406 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7407 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7408 else
7409 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7410
7411 // setcc always return i1/i1 vec so no need to truncate after.
7412 if (Opc == ISD::SETCC) {
7413 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7414 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7415 }
7416
7417 // For other ops, we extend the operation's return type as well so we need to
7418 // truncate back to the original type.
7419 SDValue NewVal;
7420 if (Opc == ISD::SELECT)
7421 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7422 else
7423 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7424
7425 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7426}
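// Small worked example of the promotion above, assuming a uniform i16 shift:
//
//   (srl i16 %a, %b)
//     -> (trunc i16 (srl i32 (zext i32 %a), (zext i32 %b)))
//
// SRA would sign-extend the value operand instead, and setcc keeps its i1
// result, so only its operands are widened.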
7427
7428SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7429 SDValue Mag = Op.getOperand(0);
7430 EVT MagVT = Mag.getValueType();
7431
7432 if (MagVT.getVectorNumElements() > 2)
7433 return splitBinaryVectorOp(Op, DAG);
7434
7435 SDValue Sign = Op.getOperand(1);
7436 EVT SignVT = Sign.getValueType();
7437
7438 if (MagVT == SignVT)
7439 return Op;
7440
7441 // fcopysign v2f16:mag, v2f32:sign ->
7442 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7443
7444 SDLoc SL(Op);
7445 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7446 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7447
7448 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7449
7450 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7451}
7452
7453// Custom lowering for vector multiplications and s_mul_u64.
7454SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7455 EVT VT = Op.getValueType();
7456
7457 // Split vector operands.
7458 if (VT.isVector())
7459 return splitBinaryVectorOp(Op, DAG);
7460
7461 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7462
7463 // There are four ways to lower s_mul_u64:
7464 //
7465 // 1. If all the operands are uniform, then we lower it as it is.
7466 //
7467 // 2. If the operands are divergent, then we have to split s_mul_u64 into
7468 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
7469 //
7470 // 3. If the cost model decides that it is more efficient to use vector
7471 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
7472 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7473 //
7474 // 4. If the cost model decides to use vector registers and both of the
7475 // operands are zero-extended/sign-extended from 32-bits, then we split the
7476 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7477 // possible to check if the operands are zero-extended or sign-extended in
7478 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7479 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7480 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7481 // If the cost model decides that we have to use vector registers, then
7482 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7483 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7484 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7485 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7486 // SIInstrInfo.cpp .
7487
7488 if (Op->isDivergent())
7489 return SDValue();
7490
7491 SDValue Op0 = Op.getOperand(0);
7492 SDValue Op1 = Op.getOperand(1);
7493 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7494 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7495 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7496 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7497 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7498 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7499 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7500 SDLoc SL(Op);
7501 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7502 return SDValue(
7503 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7504 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7505 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7506 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7507 return SDValue(
7508 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7509 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7510 return Op;
7511}
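// Sketch of case 4 above: if computeKnownBits proves the top 32 bits of both
// operands are zero, e.g. for
//
//   %a = zext i32 %x to i64
//   %b = zext i32 %y to i64
//   %m = mul i64 %a, %b
//
// the node becomes S_MUL_U64_U32_PSEUDO, which SIInstrInfo later expands to
// either s_mul_u64 or a pair of 32-bit multiplies, depending on whether the
// result ends up in scalar or vector registers.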
7512
7513SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7514 EVT VT = Op.getValueType();
7515 SDLoc SL(Op);
7516 SDValue LHS = Op.getOperand(0);
7517 SDValue RHS = Op.getOperand(1);
7518 bool isSigned = Op.getOpcode() == ISD::SMULO;
7519
7520 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7521 const APInt &C = RHSC->getAPIntValue();
7522 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7523 if (C.isPowerOf2()) {
7524 // smulo(x, signed_min) is same as umulo(x, signed_min).
7525 bool UseArithShift = isSigned && !C.isMinSignedValue();
7526 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7527 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7528 SDValue Overflow =
7529 DAG.getSetCC(SL, MVT::i1,
7530 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7531 Result, ShiftAmt),
7532 LHS, ISD::SETNE);
7533 return DAG.getMergeValues({Result, Overflow}, SL);
7534 }
7535 }
7536
7537 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7538 SDValue Top =
7539 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7540
7541 SDValue Sign = isSigned
7542 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7543 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7544 SL, MVT::i32))
7545 : DAG.getConstant(0, SL, VT);
7546 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7547
7548 return DAG.getMergeValues({Result, Overflow}, SL);
7549}
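// Worked example of the power-of-two case above, for (umulo i32 %x, 8):
//
//   Result   = shl %x, 3
//   Overflow = setcc ne (srl Result, 3), %x
//
// i.e. the multiply overflowed exactly when shifting back does not recover
// %x. For smulo an arithmetic shift is used instead, except for signed_min,
// where the unsigned form is equivalent.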
7550
7551SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7552 if (Op->isDivergent()) {
7553 // Select to V_MAD_[IU]64_[IU]32.
7554 return Op;
7555 }
7556 if (Subtarget->hasSMulHi()) {
7557 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7558 return SDValue();
7559 }
7560 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7561 // calculate the high part, so we might as well do the whole thing with
7562 // V_MAD_[IU]64_[IU]32.
7563 return Op;
7564}
7565
7566SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7567 if (!Subtarget->isTrapHandlerEnabled() ||
7568 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7569 return lowerTrapEndpgm(Op, DAG);
7570
7571 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7572 : lowerTrapHsaQueuePtr(Op, DAG);
7573}
7574
7575SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7576 SDLoc SL(Op);
7577 SDValue Chain = Op.getOperand(0);
7578 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7579}
7580
7581SDValue
7582SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7583 const SDLoc &DL, Align Alignment,
7584 ImplicitParameter Param) const {
7585 MachineFunction &MF = DAG.getMachineFunction();
7586 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7587 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7588 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7589 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7590 MachineMemOperand::MODereferenceable |
7591 MachineMemOperand::MOInvariant);
7592}
7593
7594SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7595 SelectionDAG &DAG) const {
7596 SDLoc SL(Op);
7597 SDValue Chain = Op.getOperand(0);
7598
7599 SDValue QueuePtr;
7600 // For code object version 5, QueuePtr is passed through implicit kernarg.
7601 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7602 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7603 QueuePtr =
7604 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7605 } else {
7606 MachineFunction &MF = DAG.getMachineFunction();
7607 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7608 Register UserSGPR = Info->getQueuePtrUserSGPR();
7609
7610 if (UserSGPR == AMDGPU::NoRegister) {
7611 // We probably are in a function incorrectly marked with
7612 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7613 // trap, so just use a null pointer.
7614 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7615 } else {
7616 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7617 MVT::i64);
7618 }
7619 }
7620
7621 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7622 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7623
7624 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7625 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7626 ToReg.getValue(1)};
7627 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7628}
7629
7630SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7631 SDLoc SL(Op);
7632 SDValue Chain = Op.getOperand(0);
7633
7634 // We need to simulate the 's_trap 2' instruction on targets that run in
7635 // PRIV=1 (where it is treated as a nop).
7636 if (Subtarget->hasPrivEnabledTrap2NopBug())
7637 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7638
7639 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7640 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7641 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7642}
7643
7644SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7645 SDLoc SL(Op);
7646 SDValue Chain = Op.getOperand(0);
7647 MachineFunction &MF = DAG.getMachineFunction();
7648
7649 if (!Subtarget->isTrapHandlerEnabled() ||
7650 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7651 LLVMContext &Ctx = MF.getFunction().getContext();
7652 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7653 "debugtrap handler not supported",
7654 Op.getDebugLoc(), DS_Warning));
7655 return Chain;
7656 }
7657
7658 uint64_t TrapID =
7659 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMDebugTrap);
7660 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7661 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7662}
7663
7664SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7665 SelectionDAG &DAG) const {
7666 if (Subtarget->hasApertureRegs()) {
7667 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7668 ? AMDGPU::SRC_SHARED_BASE
7669 : AMDGPU::SRC_PRIVATE_BASE;
7670 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7671 !Subtarget->hasGloballyAddressableScratch()) &&
7672 "Cannot use src_private_base with globally addressable scratch!");
7673 // Note: this feature (register) is broken. When used as a 32-bit operand,
7674 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7675 // bits.
7676 //
7677 // To work around the issue, directly emit a 64 bit mov from this register
7678 // then extract the high bits. Note that this shouldn't even result in a
7679 // shift being emitted and simply become a pair of registers (e.g.):
7680 // s_mov_b64 s[6:7], src_shared_base
7681 // v_mov_b32_e32 v1, s7
7682 //
7683 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7684 // coalescing would kick in and it would think it's okay to use the "HI"
7685 // subregister directly (instead of extracting the HI 32 bits) which is an
7686 // artificial (unusable) register.
7687 // Register TableGen definitions would need an overhaul to get rid of the
7688 // artificial "HI" aperture registers and prevent this kind of issue from
7689 // happening.
7690 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7691 DAG.getRegister(ApertureRegNo, MVT::i64));
7692 return DAG.getNode(
7693 ISD::TRUNCATE, DL, MVT::i32,
7694 DAG.getNode(ISD::SRL, DL, MVT::i64,
7695 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7696 }
7697
7698 // For code object version 5, private_base and shared_base are passed through
7699 // implicit kernargs.
7700 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7701 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7702 ImplicitParameter Param =
7703 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7704 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7705 }
7706
7707 MachineFunction &MF = DAG.getMachineFunction();
7708 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7709 Register UserSGPR = Info->getQueuePtrUserSGPR();
7710 if (UserSGPR == AMDGPU::NoRegister) {
7711 // We probably are in a function incorrectly marked with
7712 // amdgpu-no-queue-ptr. This is undefined.
7713 return DAG.getPOISON(MVT::i32);
7714 }
7715
7716 SDValue QueuePtr =
7717 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7718
7719 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7720 // private_segment_aperture_base_hi.
7721 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7722
7723 SDValue Ptr =
7724 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7725
7726 // TODO: Use custom target PseudoSourceValue.
7727 // TODO: We should use the value from the IR intrinsic call, but it might not
7728 // be available and how do we get it?
7729 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7730 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7731 commonAlignment(Align(64), StructOffset),
7732 MachineMemOperand::MODereferenceable |
7733 MachineMemOperand::MOInvariant);
7734}
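// A condensed sketch of the fallback path above (no aperture registers, no
// COV5 implicit kernargs), for the LOCAL address space:
//
//   %queue = CopyFromReg queue_ptr_user_sgpr            ; i64
//   %addr  = add %queue, 0x40                           ; aperture_base_hi
//   %hi    = load dereferenceable invariant i32, %addr
//
// The loaded value is the high half of the segment aperture; the low half of
// the resulting flat address comes from the segment offset itself.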
7735
7736/// Return true if the value is a known valid address, such that a null check is
7737/// not necessary.
7738 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7739 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7740 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7741 return true;
7742
7743 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7744 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7745
7746 // TODO: Search through arithmetic, handle arguments and loads
7747 // marked nonnull.
7748 return false;
7749}
7750
7751SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7752 SelectionDAG &DAG) const {
7753 SDLoc SL(Op);
7754
7755 const AMDGPUTargetMachine &TM =
7756 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7757
7758 unsigned DestAS, SrcAS;
7759 SDValue Src;
7760 bool IsNonNull = false;
7761 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7762 SrcAS = ASC->getSrcAddressSpace();
7763 Src = ASC->getOperand(0);
7764 DestAS = ASC->getDestAddressSpace();
7765 } else {
7766 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7767 Op.getConstantOperandVal(0) ==
7768 Intrinsic::amdgcn_addrspacecast_nonnull);
7769 Src = Op->getOperand(1);
7770 SrcAS = Op->getConstantOperandVal(2);
7771 DestAS = Op->getConstantOperandVal(3);
7772 IsNonNull = true;
7773 }
7774
7775 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7776
7777 // flat -> local/private
7778 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7779 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7780 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7781 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7782
7783 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
7784 Subtarget->hasGloballyAddressableScratch()) {
7785 // flat -> private with globally addressable scratch: subtract
7786 // src_flat_scratch_base_lo.
7787 SDValue FlatScratchBaseLo(
7788 DAG.getMachineNode(
7789 AMDGPU::S_MOV_B32, SL, MVT::i32,
7790 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7791 0);
7792 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
7793 }
7794
7795 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7796 return Ptr;
7797
7798 unsigned NullVal = TM.getNullPointerValue(DestAS);
7799 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7800 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7801
7802 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7803 SegmentNullPtr);
7804 }
7805 }
7806
7807 // local/private -> flat
7808 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7809 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7810 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7811 SDValue CvtPtr;
7812 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
7813 Subtarget->hasGloballyAddressableScratch()) {
7814 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
7815 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
7816 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
7817 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
7818 ThreadID = DAG.getNode(
7819 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7820 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
7821 AllOnes, ThreadID);
7822 if (Subtarget->isWave64())
7823 ThreadID = DAG.getNode(
7824 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7825 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
7826 AllOnes, ThreadID);
7827 SDValue ShAmt = DAG.getShiftAmountConstant(
7828 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
7829 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
7830 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
7831 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7832 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
7833 // 64-bit hi:lo value.
7834 SDValue FlatScratchBase = {
7835 DAG.getMachineNode(
7836 AMDGPU::S_MOV_B64, SL, MVT::i64,
7837 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7838 0};
7839 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7840 } else {
7841 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7842 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7843 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7844 }
7845
7846 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7847 return CvtPtr;
7848
7849 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7850 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7851
7852 SDValue NonNull =
7853 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7854
7855 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7856 FlatNullPtr);
7857 }
7858 }
7859
7860 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7861 Op.getValueType() == MVT::i64) {
7862 const SIMachineFunctionInfo *Info =
7863 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7864 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7865 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7866 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7867 }
7868
7869 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7870 Src.getValueType() == MVT::i64)
7871 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7872
7873 // global <-> flat are no-ops and never emitted.
7874
7875 // Invalid casts are poison.
7876 return DAG.getPOISON(Op->getValueType(0));
7877}
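// A condensed sketch of the generic local -> flat path above:
//
//   %aperture = getSegmentAperture(LOCAL)               ; high 32 bits
//   %cvt      = bitcast v2i32 {%src, %aperture} to i64
//   %res      = select (%src != -1), %cvt, 0            ; skipped if nonnull
//
// The segment null value for local/private is -1 while the flat null pointer
// is 0, which is why the compare and the select use different constants.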
7878
7879// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7880// the small vector and inserting them into the big vector. That is better than
7881// the default expansion of doing it via a stack slot. Even though the use of
7882// the stack slot would be optimized away afterwards, the stack slot itself
7883// remains.
7884SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7885 SelectionDAG &DAG) const {
7886 SDValue Vec = Op.getOperand(0);
7887 SDValue Ins = Op.getOperand(1);
7888 SDValue Idx = Op.getOperand(2);
7889 EVT VecVT = Vec.getValueType();
7890 EVT InsVT = Ins.getValueType();
7891 EVT EltVT = VecVT.getVectorElementType();
7892 unsigned InsNumElts = InsVT.getVectorNumElements();
7893 unsigned IdxVal = Idx->getAsZExtVal();
7894 SDLoc SL(Op);
7895
7896 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7897 // Insert 32-bit registers at a time.
7898 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7899
7900 unsigned VecNumElts = VecVT.getVectorNumElements();
7901 EVT NewVecVT =
7902 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7903 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7904 : EVT::getVectorVT(*DAG.getContext(),
7905 MVT::i32, InsNumElts / 2);
7906
7907 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7908 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7909
7910 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7911 SDValue Elt;
7912 if (InsNumElts == 2) {
7913 Elt = Ins;
7914 } else {
7915 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7916 DAG.getConstant(I, SL, MVT::i32));
7917 }
7918 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7919 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7920 }
7921
7922 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7923 }
7924
7925 for (unsigned I = 0; I != InsNumElts; ++I) {
7926 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7927 DAG.getConstant(I, SL, MVT::i32));
7928 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7929 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7930 }
7931 return Vec;
7932}
7933
7934SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7935 SelectionDAG &DAG) const {
7936 SDValue Vec = Op.getOperand(0);
7937 SDValue InsVal = Op.getOperand(1);
7938 SDValue Idx = Op.getOperand(2);
7939 EVT VecVT = Vec.getValueType();
7940 EVT EltVT = VecVT.getVectorElementType();
7941 unsigned VecSize = VecVT.getSizeInBits();
7942 unsigned EltSize = EltVT.getSizeInBits();
7943 SDLoc SL(Op);
7944
7945 // Specially handle the case of v4i16 with static indexing.
7946 unsigned NumElts = VecVT.getVectorNumElements();
7947 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7948 if (NumElts == 4 && EltSize == 16 && KIdx) {
7949 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7950
7951 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7952 DAG.getConstant(0, SL, MVT::i32));
7953 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7954 DAG.getConstant(1, SL, MVT::i32));
7955
7956 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7957 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7958
7959 unsigned Idx = KIdx->getZExtValue();
7960 bool InsertLo = Idx < 2;
7961 SDValue InsHalf = DAG.getNode(
7962 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7963 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7964 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7965
7966 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7967
7968 SDValue Concat =
7969 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7970 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7971
7972 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7973 }
7974
7975 // Static indexing does not lower to stack access, and hence there is no need
7976 // for special custom lowering to avoid stack access.
7977 if (isa<ConstantSDNode>(Idx))
7978 return SDValue();
7979
7980 // Avoid stack access for dynamic indexing by custom lowering to
7981 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7982
7983 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7984
7985 MVT IntVT = MVT::getIntegerVT(VecSize);
7986
7987 // Convert vector index to bit-index and get the required bit mask.
7988 assert(isPowerOf2_32(EltSize));
7989 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7990 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7991 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7992 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7993 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7994
7995 // 1. Create a congruent vector with the target value in each element.
7996 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7997 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7998
7999 // 2. Mask off all other indices except the required index within (1).
8000 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8001
8002 // 3. Mask off the required index within the target vector.
8003 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8004 SDValue RHS =
8005 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8006
8007 // 4. Get (2) and (3) ORed into the target vector.
8008 SDValue BFI =
8009 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8010
8011 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8012}
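// Worked example of the dynamic-index path above, for v4i16 and %idx = 2:
//
//   ScaledIdx = 2 << 4 = 32
//   BFM       = 0xFFFF << 32
//   LHS       = BFM & bitcast(splat %val)     ; value kept only in lane 2
//   RHS       = ~BFM & bitcast(%vec)          ; all other lanes preserved
//   result    = bitcast(LHS | RHS)
//
// which maps naturally onto v_bfm/v_bfi-style bitfield operations.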
8013
8014SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8015 SelectionDAG &DAG) const {
8016 SDLoc SL(Op);
8017
8018 EVT ResultVT = Op.getValueType();
8019 SDValue Vec = Op.getOperand(0);
8020 SDValue Idx = Op.getOperand(1);
8021 EVT VecVT = Vec.getValueType();
8022 unsigned VecSize = VecVT.getSizeInBits();
8023 EVT EltVT = VecVT.getVectorElementType();
8024
8025 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8026
8027 // Make sure we do any optimizations that will make it easier to fold
8028 // source modifiers before obscuring it with bit operations.
8029
8030 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8031 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8032 return Combined;
8033
8034 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8035 SDValue Lo, Hi;
8036 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8037
8038 if (VecSize == 128) {
8039 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8040 Lo = DAG.getBitcast(LoVT,
8041 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8042 DAG.getConstant(0, SL, MVT::i32)));
8043 Hi = DAG.getBitcast(HiVT,
8044 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8045 DAG.getConstant(1, SL, MVT::i32)));
8046 } else if (VecSize == 256) {
8047 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8048 SDValue Parts[4];
8049 for (unsigned P = 0; P < 4; ++P) {
8050 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8051 DAG.getConstant(P, SL, MVT::i32));
8052 }
8053
8054 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8055 Parts[0], Parts[1]));
8056 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8057 Parts[2], Parts[3]));
8058 } else {
8059 assert(VecSize == 512);
8060
8061 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8062 SDValue Parts[8];
8063 for (unsigned P = 0; P < 8; ++P) {
8064 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8065 DAG.getConstant(P, SL, MVT::i32));
8066 }
8067
8068 Lo = DAG.getBitcast(LoVT,
8069 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8070 Parts[0], Parts[1], Parts[2], Parts[3]));
8071 Hi = DAG.getBitcast(HiVT,
8072 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8073 Parts[4], Parts[5], Parts[6], Parts[7]));
8074 }
8075
8076 EVT IdxVT = Idx.getValueType();
8077 unsigned NElem = VecVT.getVectorNumElements();
8078 assert(isPowerOf2_32(NElem));
8079 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8080 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8081 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8082 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8083 }
8084
8085 assert(VecSize <= 64);
8086
8087 MVT IntVT = MVT::getIntegerVT(VecSize);
8088
8089 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8090 SDValue VecBC = peekThroughBitcasts(Vec);
8091 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8092 SDValue Src = VecBC.getOperand(0);
8093 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8094 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8095 }
8096
8097 unsigned EltSize = EltVT.getSizeInBits();
8098 assert(isPowerOf2_32(EltSize));
8099
8100 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8101
8102 // Convert vector index to bit-index (* EltSize)
8103 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8104
8105 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8106 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8107
8108 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8109 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8110 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8111 }
8112
8113 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8114}
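// Worked example of the <= 64-bit path above, for v4f16 and %idx = 3:
//
//   ScaledIdx = 3 << 4 = 48
//   Elt       = srl (bitcast %vec to i64), 48
//   result    = bitcast (trunc Elt to i16) to f16
//
// Larger vectors (128/256/512 bits) are first narrowed to one half with the
// select on the index implemented earlier in this function.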
8115
8116static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8117 assert(Elt % 2 == 0);
8118 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8119}
8120
8121static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8122 assert(Elt % 2 == 0);
8123 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8124 !(Mask[Elt + 1] & 1);
8125}
8126
8127SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8128 SelectionDAG &DAG) const {
8129 SDLoc SL(Op);
8130 EVT ResultVT = Op.getValueType();
8131 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8132 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8133 const int NewSrcNumElts = 2;
8134 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8135 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8136
8137 // Break up the shuffle into registers sized pieces.
8138 //
8139 // We're trying to form sub-shuffles that the register allocation pipeline
8140 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8141 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8142 // pair of copies into a consecutive register copy, so use the ordinary
8143 // extract_vector_elt lowering unless we can use the shuffle.
8144 //
8145 // TODO: This is a bit of hack, and we should probably always use
8146 // extract_subvector for the largest possible subvector we can (or at least
8147 // use it for PackVT aligned pieces). However, we have worse support for
8148 // combines on them and don't directly treat extract_subvector / insert_subvector
8149 // as legal. The DAG scheduler also ends up doing a worse job with the
8150 // extract_subvectors.
8151 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8152
8153 // vector_shuffle <0,1,6,7> lhs, rhs
8154 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8155 //
8156 // vector_shuffle <6,7,2,3> lhs, rhs
8157 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8158 //
8159 // vector_shuffle <6,7,0,1> lhs, rhs
8160 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8161
8162 // Avoid scalarizing when both halves are reading from consecutive elements.
8163
8164 // If we're treating 2 element shuffles as legal, also create odd-to-even
8165 // shuffles of neighboring pairs.
8166 //
8167 // vector_shuffle <3,2,7,6> lhs, rhs
8168 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8169 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8170
8171 SmallVector<SDValue, 16> Pieces;
8172 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8173 if (ShouldUseConsecutiveExtract &&
8174 elementPairIsContiguous(SVN->getMask(), I)) {
8175 const int Idx = SVN->getMaskElt(I);
8176 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8177 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8178 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8179 SVN->getOperand(VecIdx),
8180 DAG.getConstant(EltIdx, SL, MVT::i32));
8181 Pieces.push_back(SubVec);
8182 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8183 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8184 int Idx0 = SVN->getMaskElt(I);
8185 int Idx1 = SVN->getMaskElt(I + 1);
8186
8187 SDValue SrcOp0 = SVN->getOperand(0);
8188 SDValue SrcOp1 = SrcOp0;
8189 if (Idx0 >= SrcNumElts) {
8190 SrcOp0 = SVN->getOperand(1);
8191 Idx0 -= SrcNumElts;
8192 }
8193
8194 if (Idx1 >= SrcNumElts) {
8195 SrcOp1 = SVN->getOperand(1);
8196 Idx1 -= SrcNumElts;
8197 }
8198
8199 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8200 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8201
8202 // Extract nearest even aligned piece.
8203 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8204 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8205 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8206 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8207
8208 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8209 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8210
8211 SDValue Result0 = SubVec0;
8212 SDValue Result1 = SubVec0;
8213
8214 if (SubVec0 != SubVec1) {
8215 NewMaskIdx1 += NewSrcNumElts;
8216 Result1 = SubVec1;
8217 } else {
8218 Result1 = DAG.getPOISON(PackVT);
8219 }
8220
8221 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8222 {NewMaskIdx0, NewMaskIdx1});
8223 Pieces.push_back(Shuf);
8224 } else {
8225 const int Idx0 = SVN->getMaskElt(I);
8226 const int Idx1 = SVN->getMaskElt(I + 1);
8227 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8228 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8229 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8230 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8231
8232 SDValue Vec0 = SVN->getOperand(VecIdx0);
8233 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8234 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8235
8236 SDValue Vec1 = SVN->getOperand(VecIdx1);
8237 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8238 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8239 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8240 }
8241 }
8242
8243 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8244}
8245
8246SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8247 SelectionDAG &DAG) const {
8248 SDValue SVal = Op.getOperand(0);
8249 EVT ResultVT = Op.getValueType();
8250 EVT SValVT = SVal.getValueType();
8251 SDValue UndefVal = DAG.getPOISON(SValVT);
8252 SDLoc SL(Op);
8253
8254 SmallVector<SDValue, 8> VElts;
8255 VElts.push_back(SVal);
8256 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8257 VElts.push_back(UndefVal);
8258
8259 return DAG.getBuildVector(ResultVT, SL, VElts);
8260}
8261
8262SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8263 SelectionDAG &DAG) const {
8264 SDLoc SL(Op);
8265 EVT VT = Op.getValueType();
8266
8267 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8268 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8269
8270 SDValue Lo = Op.getOperand(0);
8271 SDValue Hi = Op.getOperand(1);
8272
8273 // Avoid adding defined bits with the zero_extend.
8274 if (Hi.isUndef()) {
8275 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8276 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8277 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8278 }
8279
8280 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8281 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8282
8283 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8284 DAG.getConstant(16, SL, MVT::i32));
8285 if (Lo.isUndef())
8286 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8287
8288 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8289 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8290
8291 SDValue Or =
8292 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8293 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8294 }
8295
8296 // Split into 2-element chunks.
8297 const unsigned NumParts = VT.getVectorNumElements() / 2;
8298 MVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8299 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8300
8301 SmallVector<SDValue> Casts;
8302 for (unsigned P = 0; P < NumParts; ++P) {
8303 SDValue Vec = DAG.getBuildVector(
8304 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8305 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8306 }
8307
8308 SDValue Blend =
8309 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8310 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8311}
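// Worked example of the v2i16 path above (targets without VOP3P), for
// build_vector(%a, %b):
//
//   lo  = zext (bitcast %a to i16) to i32
//   hi  = shl (zext (bitcast %b to i16) to i32), 16
//   vec = bitcast (or disjoint lo, hi) to v2i16
//
// The undef checks above drop the unused half so no extra defined bits are
// introduced.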
8312
8313 bool SITargetLowering::isOffsetFoldingLegal(
8314 const GlobalAddressSDNode *GA) const {
8315 // OSes that use ELF REL relocations (instead of RELA) can only store a
8316 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8317 // which can create arbitrary 64-bit addends. (This is only a problem for
8318 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8319 // the high 32 bits of the addend.)
8320 //
8321 // This should be kept in sync with how HasRelocationAddend is initialized in
8322 // the constructor of ELFAMDGPUAsmBackend.
8323 if (!Subtarget->isAmdHsaOS())
8324 return false;
8325
8326 // We can fold offsets for anything that doesn't require a GOT relocation.
8327 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8328 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8329 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8330 !shouldEmitGOTReloc(GA->getGlobal());
8331}
8332
8333static SDValue
8334 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8335 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8336 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8337 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8338 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8339 // lowered to the following code sequence:
8340 //
8341 // For constant address space:
8342 // s_getpc_b64 s[0:1]
8343 // s_add_u32 s0, s0, $symbol
8344 // s_addc_u32 s1, s1, 0
8345 //
8346 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8347 // a fixup or relocation is emitted to replace $symbol with a literal
8348 // constant, which is a pc-relative offset from the encoding of the $symbol
8349 // operand to the global variable.
8350 //
8351 // For global address space:
8352 // s_getpc_b64 s[0:1]
8353 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8354 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8355 //
8356 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8357 // fixups or relocations are emitted to replace $symbol@*@lo and
8358 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8359 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8360 // operand to the global variable.
8361 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8362 assert(GAFlags != SIInstrInfo::MO_NONE);
8363
8364 SDValue Ptr =
8365 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8366 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8367 }
8368
8369 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8370 SDValue PtrHi;
8371 if (GAFlags == SIInstrInfo::MO_NONE)
8372 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8373 else
8374 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8375 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8376}
8377
8378SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8379 SDValue Op,
8380 SelectionDAG &DAG) const {
8381 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8382 SDLoc DL(GSD);
8383 EVT PtrVT = Op.getValueType();
8384
8385 const GlobalValue *GV = GSD->getGlobal();
8386 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8387 shouldUseLDSConstAddress(GV)) ||
8388 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8389 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8390 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8391 GV->hasExternalLinkage()) {
8392 Type *Ty = GV->getValueType();
8393 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8394 // zero-sized type in other languages to declare the dynamic shared
8395 // memory which size is not known at the compile time. They will be
8396 // allocated by the runtime and placed directly after the static
8397 // allocated ones. They all share the same offset.
8398 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8399 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8400 // Adjust alignment for that dynamic shared memory array.
8401 const Function &F = DAG.getMachineFunction().getFunction();
8402 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8403 MFI->setUsesDynamicLDS(true);
8404 return SDValue(
8405 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8406 }
8407 }
8408 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8409 }
8410
8411 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8412 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8413 SIInstrInfo::MO_ABS32_LO);
8414 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8415 }
8416
8417 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8418 if (Subtarget->has64BitLiterals()) {
8419 SDValue Addr = DAG.getTargetGlobalAddress(
8420 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8421 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8422 0);
8423 }
8424
8425 SDValue AddrLo = DAG.getTargetGlobalAddress(
8426 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8427 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8428
8429 SDValue AddrHi = DAG.getTargetGlobalAddress(
8430 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8431 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8432
8433 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8434 }
8435
8436 if (shouldEmitFixup(GV))
8437 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8438
8439 if (shouldEmitPCReloc(GV))
8440 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8441 SIInstrInfo::MO_REL32);
8442
8443 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8444 SIInstrInfo::MO_GOTPCREL32);
8445 PointerType *PtrTy =
8446 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8447 const DataLayout &DataLayout = DAG.getDataLayout();
8448 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8449 MachinePointerInfo PtrInfo =
8450 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8451
8452 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8453 MachineMemOperand::MODereferenceable |
8454 MachineMemOperand::MOInvariant);
8455}
8456
8457 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8458 const SDLoc &DL, SDValue V) const {
8459 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8460 // the destination register.
8461 //
8462 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8463 // so we will end up with redundant moves to m0.
8464 //
8465 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8466
8467 // A Null SDValue creates a glue result.
8468 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8469 V, Chain);
8470 return SDValue(M0, 0);
8471}
8472
8473SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8474 MVT VT,
8475 unsigned Offset) const {
8476 SDLoc SL(Op);
8477 SDValue Param = lowerKernargMemParameter(
8478 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8479 // The local size values will have the hi 16-bits as zero.
8480 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8481 DAG.getValueType(VT));
8482}
8483
8484 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8485 EVT VT) {
8486 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8487 DAG.getMachineFunction().getFunction(),
8488 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8489 return DAG.getPOISON(VT);
8490}
8491
8492 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8493 EVT VT) {
8494 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8495 DAG.getMachineFunction().getFunction(),
8496 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8497 return DAG.getPOISON(VT);
8498}
8499
8500 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8501 ArrayRef<SDValue> Elts) {
8502 assert(!Elts.empty());
8503 MVT Type;
8504 unsigned NumElts = Elts.size();
8505
8506 if (NumElts <= 12) {
8507 Type = MVT::getVectorVT(MVT::f32, NumElts);
8508 } else {
8509 assert(Elts.size() <= 16);
8510 Type = MVT::v16f32;
8511 NumElts = 16;
8512 }
8513
8514 SmallVector<SDValue, 16> VecElts(NumElts);
8515 for (unsigned i = 0; i < Elts.size(); ++i) {
8516 SDValue Elt = Elts[i];
8517 if (Elt.getValueType() != MVT::f32)
8518 Elt = DAG.getBitcast(MVT::f32, Elt);
8519 VecElts[i] = Elt;
8520 }
8521 for (unsigned i = Elts.size(); i < NumElts; ++i)
8522 VecElts[i] = DAG.getPOISON(MVT::f32);
8523
8524 if (NumElts == 1)
8525 return VecElts[0];
8526 return DAG.getBuildVector(Type, DL, VecElts);
8527}
8528
8529static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8530 SDValue Src, int ExtraElts) {
8531 EVT SrcVT = Src.getValueType();
8532
8534
8535 if (SrcVT.isVector())
8536 DAG.ExtractVectorElements(Src, Elts);
8537 else
8538 Elts.push_back(Src);
8539
8540 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8541 while (ExtraElts--)
8542 Elts.push_back(Undef);
8543
8544 return DAG.getBuildVector(CastVT, DL, Elts);
8545}
8546
8547// Re-construct the required return value for an image load intrinsic.
8548// This is more complicated due to the optional use of TexFailCtrl, which means
8549// the required return type is an aggregate.
8550static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8551 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8552 bool Unpacked, bool IsD16, int DMaskPop,
8553 int NumVDataDwords, bool IsAtomicPacked16Bit,
8554 const SDLoc &DL) {
8555 // Determine the required return type. This is the same regardless of the
8556 // IsTexFail flag.
8557 EVT ReqRetVT = ResultTypes[0];
8558 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8559 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8560 ? (ReqRetNumElts + 1) / 2
8561 : ReqRetNumElts;
8562
8563 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8564
8565 MVT DataDwordVT =
8566 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8567
8568 MVT MaskPopVT =
8569 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8570
8571 SDValue Data(Result, 0);
8572 SDValue TexFail;
8573
8574 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8575 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8576 if (MaskPopVT.isVector()) {
8577 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8578 SDValue(Result, 0), ZeroIdx);
8579 } else {
8580 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8581 SDValue(Result, 0), ZeroIdx);
8582 }
8583 }
8584
8585 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8586 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8587 NumDataDwords - MaskPopDwords);
8588
8589 if (IsD16)
8590 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8591
8592 EVT LegalReqRetVT = ReqRetVT;
8593 if (!ReqRetVT.isVector()) {
8594 if (!Data.getValueType().isInteger())
8595 Data = DAG.getNode(ISD::BITCAST, DL,
8596 Data.getValueType().changeTypeToInteger(), Data);
8597 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8598 } else {
8599 // We need to widen the return vector to a legal type
8600 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8601 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8602 LegalReqRetVT =
8604 ReqRetVT.getVectorNumElements() + 1);
8605 }
8606 }
8607 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8608
8609 if (IsTexFail) {
8610 TexFail =
8611 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8612 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8613
8614 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8615 }
8616
8617 if (Result->getNumValues() == 1)
8618 return Data;
8619
8620 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8621}
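// A small sketch of the dword arithmetic above (hypothetical helper, not part
// of the lowering): on packed-D16 targets the 16-bit result elements share
// dwords, and a TexFailCtrl status dword is appended by the caller.
//
//   static int getRetDwordCount(int ReqRetNumElts, bool IsD16, bool Unpacked,
//                               bool IsTexFail) {
//     int Dwords = (IsD16 && !Unpacked) ? (ReqRetNumElts + 1) / 2
//                                       : ReqRetNumElts;
//     return Dwords + (IsTexFail ? 1 : 0);
//   }
//   // v3f16 result, packed D16, no TFE/LWE: getRetDwordCount(3, true, false, false) == 2
//   // v3f16 result, packed D16, TFE set:    getRetDwordCount(3, true, false, true)  == 3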
8622
8623static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8624 SDValue *LWE, bool &IsTexFail) {
8625 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8626
8627 uint64_t Value = TexFailCtrlConst->getZExtValue();
8628 if (Value) {
8629 IsTexFail = true;
8630 }
8631
8632 SDLoc DL(TexFailCtrlConst);
8633 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8634 Value &= ~(uint64_t)0x1;
8635 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8636 Value &= ~(uint64_t)0x2;
8637
8638 return Value == 0;
8639}
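// Minimal sketch of the TexFailCtrl decoding above (hypothetical helper): bit 0
// selects TFE, bit 1 selects LWE, and any other set bit makes the control
// invalid, in which case the caller gives up on the intrinsic.
//
//   static bool decodeTexFailCtrl(uint64_t Ctrl, bool &TFE, bool &LWE) {
//     TFE = Ctrl & 0x1;
//     LWE = Ctrl & 0x2;
//     return (Ctrl & ~uint64_t(0x3)) == 0;
//   }
//   // decodeTexFailCtrl(3, TFE, LWE) sets both flags and returns true.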
8640
8641static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8642 MVT PackVectorVT,
8643 SmallVectorImpl<SDValue> &PackedAddrs,
8644 unsigned DimIdx, unsigned EndIdx,
8645 unsigned NumGradients) {
8646 SDLoc DL(Op);
8647 for (unsigned I = DimIdx; I < EndIdx; I++) {
8648 SDValue Addr = Op.getOperand(I);
8649
8650 // Gradients are packed with undef for each coordinate.
8651 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8652 // 1D: undef,dx/dh; undef,dx/dv
8653 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8654 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8655 if (((I + 1) >= EndIdx) ||
8656 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8657 I == DimIdx + NumGradients - 1))) {
8658 if (Addr.getValueType() != MVT::i16)
8659 Addr = DAG.getBitcast(MVT::i16, Addr);
8660 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8661 } else {
8662 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8663 I++;
8664 }
8665 Addr = DAG.getBitcast(MVT::f32, Addr);
8666 PackedAddrs.push_back(Addr);
8667 }
8668}
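// Worked example of the loop above for a 3D gradient block (NumGradients == 6,
// 16-bit gradients), using the same <hi 16 bit>,<lo 16 bit> notation as the
// comment inside the loop; operand indices are relative to DimIdx:
//
//   I+0 (dx/dh): paired with I+1                                   -> dy/dh,dx/dh
//   I+2 (dz/dh): last element of the /dh half (half count 3 is odd) -> undef,dz/dh
//   I+3 (dx/dv): paired with I+4                                   -> dy/dv,dx/dv
//   I+5 (dz/dv): last gradient                                     -> undef,dz/dv
//
// so six 16-bit gradients occupy four address dwords.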
8669
8670SDValue SITargetLowering::lowerImage(SDValue Op,
8671 const AMDGPU::ImageDimIntrinsicInfo *Intr,
8672 SelectionDAG &DAG, bool WithChain) const {
8673 SDLoc DL(Op);
8675 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8676 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8678 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8679 unsigned IntrOpcode = Intr->BaseOpcode;
8680 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8681 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8682 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8683
8684 SmallVector<EVT, 3> ResultTypes(Op->values());
8685 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8686 bool IsD16 = false;
8687 bool IsG16 = false;
8688 bool IsA16 = false;
8689 SDValue VData;
8690 int NumVDataDwords = 0;
8691 bool AdjustRetType = false;
8692 bool IsAtomicPacked16Bit = false;
8693
8694 // Offset of intrinsic arguments
8695 const unsigned ArgOffset = WithChain ? 2 : 1;
8696
8697 unsigned DMask;
8698 unsigned DMaskLanes = 0;
8699
8700 if (BaseOpcode->Atomic) {
8701 VData = Op.getOperand(2);
8702
8703 IsAtomicPacked16Bit =
8704 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8705 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8706
8707 bool Is64Bit = VData.getValueSizeInBits() == 64;
8708 if (BaseOpcode->AtomicX2) {
8709 SDValue VData2 = Op.getOperand(3);
8710 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8711 {VData, VData2});
8712 if (Is64Bit)
8713 VData = DAG.getBitcast(MVT::v4i32, VData);
8714
8715 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8716 DMask = Is64Bit ? 0xf : 0x3;
8717 NumVDataDwords = Is64Bit ? 4 : 2;
8718 } else {
8719 DMask = Is64Bit ? 0x3 : 0x1;
8720 NumVDataDwords = Is64Bit ? 2 : 1;
8721 }
8722 } else {
8723 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8724 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8725
8726 if (BaseOpcode->Store) {
8727 VData = Op.getOperand(2);
8728
8729 MVT StoreVT = VData.getSimpleValueType();
8730 if (StoreVT.getScalarType() == MVT::f16) {
8731 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8732 return Op; // D16 is unsupported for this instruction
8733
8734 IsD16 = true;
8735 VData = handleD16VData(VData, DAG, true);
8736 }
8737
8738 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8739 } else if (!BaseOpcode->NoReturn) {
8740 // Work out the number of dwords based on the dmask popcount, the underlying
8741 // type, and whether packing is supported.
8742 MVT LoadVT = ResultTypes[0].getSimpleVT();
8743 if (LoadVT.getScalarType() == MVT::f16) {
8744 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8745 return Op; // D16 is unsupported for this instruction
8746
8747 IsD16 = true;
8748 }
8749
8750 // Confirm that the return type is large enough for the dmask specified
8751 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8752 (!LoadVT.isVector() && DMaskLanes > 1))
8753 return Op;
8754
8755 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
8756 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8757 // instructions.
8758 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8759 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8760 NumVDataDwords = (DMaskLanes + 1) / 2;
8761 else
8762 NumVDataDwords = DMaskLanes;
8763
8764 AdjustRetType = true;
8765 }
8766 }
8767
8768 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8770
8771 // Check for 16 bit addresses or derivatives and pack if true.
8772 MVT VAddrVT =
8773 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8774 MVT VAddrScalarVT = VAddrVT.getScalarType();
8775 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8776 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8777
8778 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8779 VAddrScalarVT = VAddrVT.getScalarType();
8780 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8781 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8782
8783 // Push back extra arguments.
8784 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8785 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8786 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8787 // Special handling of bias when A16 is on. Bias is of type half but
8788 // occupies a full 32 bits.
8789 SDValue Bias = DAG.getBuildVector(
8790 MVT::v2f16, DL,
8791 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
8792 VAddrs.push_back(Bias);
8793 } else {
8794 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8795 "Bias needs to be converted to 16 bit in A16 mode");
8796 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8797 }
8798 }
8799
8800 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8801 // 16 bit gradients are supported, but are tied to the A16 control
8802 // so both gradients and addresses must be 16 bit
8803 LLVM_DEBUG(
8804 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8805 "require 16 bit args for both gradients and addresses");
8806 return Op;
8807 }
8808
8809 if (IsA16) {
8810 if (!ST->hasA16()) {
8811 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8812 "support 16 bit addresses\n");
8813 return Op;
8814 }
8815 }
8816
8817 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
8818 // set then we have to compress/pack operands (either addresses, gradients,
8819 // or both).
8820 // In the case where A16 and gradients are tied (no G16 support), we have
8821 // already verified that both IsA16 and IsG16 are true.
8822 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8823 // Activate g16
8824 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8826 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8827 }
8828
8829 // Add gradients (packed or unpacked)
8830 if (IsG16) {
8831 // Pack the gradients
8832 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8833 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8834 ArgOffset + Intr->GradientStart,
8835 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8836 } else {
8837 for (unsigned I = ArgOffset + Intr->GradientStart;
8838 I < ArgOffset + Intr->CoordStart; I++)
8839 VAddrs.push_back(Op.getOperand(I));
8840 }
8841
8842 // Add addresses (packed or unpacked)
8843 if (IsA16) {
8844 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8845 ArgOffset + Intr->CoordStart, VAddrEnd,
8846 0 /* No gradients */);
8847 } else {
8848 // Add uncompressed address
8849 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8850 VAddrs.push_back(Op.getOperand(I));
8851 }
8852
8853 // If the register allocator cannot place the address registers contiguously
8854 // without introducing moves, then using the non-sequential address encoding
8855 // is always preferable, since it saves VALU instructions and is usually a
8856 // wash in terms of code size or even better.
8857 //
8858 // However, we currently have no way of hinting to the register allocator that
8859 // MIMG addresses should be placed contiguously when it is possible to do so,
8860 // so force non-NSA for the common 2-address case as a heuristic.
8861 //
8862 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8863 // allocation when possible.
8864 //
8865 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8866 // set of the remaining addresses.
8867 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8868 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8869 const bool UseNSA = ST->hasNSAEncoding() &&
8870 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8871 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8872 const bool UsePartialNSA =
8873 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8874
8875 SDValue VAddr;
8876 if (UsePartialNSA) {
8877 VAddr = getBuildDwordsVector(DAG, DL,
8878 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8879 } else if (!UseNSA) {
8880 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8881 }
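// A small worked example of the NSA decision above, with hypothetical subtarget
// values NSAThreshold = 3 and NSAMaxSize = 5 on a target with partial NSA:
//
//   2 address dwords -> below the threshold: all addresses are packed into one
//                       contiguous vector register;
//   4 address dwords -> UseNSA: every address stays in its own register;
//   7 address dwords -> UseNSA and UsePartialNSA: the first NSAMaxSize - 1 = 4
//                       addresses stay separate and the remaining 3 are packed
//                       into one contiguous vector (the drop_front above and
//                       the matching take_front when the operand list is built).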
8882
8883 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8884 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8885 SDValue Unorm;
8886 if (!BaseOpcode->Sampler) {
8887 Unorm = True;
8888 } else {
8889 uint64_t UnormConst =
8890 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8891
8892 Unorm = UnormConst ? True : False;
8893 }
8894
8895 SDValue TFE;
8896 SDValue LWE;
8897 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8898 bool IsTexFail = false;
8899 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8900 return Op;
8901
8902 if (IsTexFail) {
8903 if (!DMaskLanes) {
8904 // Expecting to get an error flag since TFC is on and dmask is 0.
8905 // Force dmask to be at least 1, otherwise the instruction will fail.
8906 DMask = 0x1;
8907 DMaskLanes = 1;
8908 NumVDataDwords = 1;
8909 }
8910 NumVDataDwords += 1;
8911 AdjustRetType = true;
8912 }
8913
8914 // Something earlier may have tagged the return type as needing adjustment.
8915 // This happens if the instruction is a load or has set TexFailCtrl flags.
8916 if (AdjustRetType) {
8917 // NumVDataDwords reflects the true number of dwords required in the return
8918 // type
8919 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8920 // This is a no-op load. This can be eliminated
8921 SDValue Undef = DAG.getPOISON(Op.getValueType());
8922 if (isa<MemSDNode>(Op))
8923 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8924 return Undef;
8925 }
8926
8927 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8928 MVT::i32, NumVDataDwords)
8929 : MVT::i32;
8930
8931 ResultTypes[0] = NewVT;
8932 if (ResultTypes.size() == 3) {
8933 // The original result was an aggregate type used for TexFailCtrl results.
8934 // The actual instruction returns a vector type, which has now been
8935 // created. Remove the aggregate result.
8936 ResultTypes.erase(&ResultTypes[1]);
8937 }
8938 }
8939
8940 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8941 if (BaseOpcode->Atomic)
8942 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8943 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8945 return Op;
8946
8948 if (BaseOpcode->Store || BaseOpcode->Atomic)
8949 Ops.push_back(VData); // vdata
8950 if (UsePartialNSA) {
8951 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8952 Ops.push_back(VAddr);
8953 } else if (UseNSA)
8954 append_range(Ops, VAddrs);
8955 else
8956 Ops.push_back(VAddr);
8957 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8958 EVT RsrcVT = Rsrc.getValueType();
8959 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8960 return Op;
8961 Ops.push_back(Rsrc);
8962 if (BaseOpcode->Sampler) {
8963 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8964 if (Samp.getValueType() != MVT::v4i32)
8965 return Op;
8966 Ops.push_back(Samp);
8967 }
8968 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8969 if (IsGFX10Plus)
8970 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8971 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8972 Ops.push_back(Unorm);
8973 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8974 Ops.push_back(IsA16 && // r128, a16 for gfx9
8975 ST->hasFeature(AMDGPU::FeatureR128A16)
8976 ? True
8977 : False);
8978 if (IsGFX10Plus)
8979 Ops.push_back(IsA16 ? True : False);
8980
8981 if (!Subtarget->hasGFX90AInsts())
8982 Ops.push_back(TFE); // tfe
8983 else if (TFE->getAsZExtVal()) {
8986 "TFE is not supported on this GPU", DL.getDebugLoc()));
8987 }
8988
8989 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8990 Ops.push_back(LWE); // lwe
8991 if (!IsGFX10Plus)
8992 Ops.push_back(DimInfo->DA ? True : False);
8993 if (BaseOpcode->HasD16)
8994 Ops.push_back(IsD16 ? True : False);
8995 if (isa<MemSDNode>(Op))
8996 Ops.push_back(Op.getOperand(0)); // chain
8997
8998 int NumVAddrDwords =
8999 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9000 int Opcode = -1;
9001
9002 if (IsGFX12Plus) {
9003 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9004 NumVDataDwords, NumVAddrDwords);
9005 } else if (IsGFX11Plus) {
9006 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9007 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9008 : AMDGPU::MIMGEncGfx11Default,
9009 NumVDataDwords, NumVAddrDwords);
9010 } else if (IsGFX10Plus) {
9011 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9012 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9013 : AMDGPU::MIMGEncGfx10Default,
9014 NumVDataDwords, NumVAddrDwords);
9015 } else {
9016 if (Subtarget->hasGFX90AInsts()) {
9017 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9018 NumVDataDwords, NumVAddrDwords);
9019 if (Opcode == -1) {
9022 "requested image instruction is not supported on this GPU",
9023 DL.getDebugLoc()));
9024
9025 unsigned Idx = 0;
9026 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9027 for (EVT VT : OrigResultTypes) {
9028 if (VT == MVT::Other)
9029 RetValues[Idx++] = Op.getOperand(0); // Chain
9030 else
9031 RetValues[Idx++] = DAG.getPOISON(VT);
9032 }
9033
9034 return DAG.getMergeValues(RetValues, DL);
9035 }
9036 }
9037 if (Opcode == -1 &&
9039 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9040 NumVDataDwords, NumVAddrDwords);
9041 if (Opcode == -1)
9042 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9043 NumVDataDwords, NumVAddrDwords);
9044 }
9045 if (Opcode == -1)
9046 return Op;
9047
9048 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9049 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9050 MachineMemOperand *MemRef = MemOp->getMemOperand();
9051 DAG.setNodeMemRefs(NewNode, {MemRef});
9052 }
9053
9054 if (BaseOpcode->AtomicX2) {
9056 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9057 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9058 }
9059 if (BaseOpcode->NoReturn)
9060 return SDValue(NewNode, 0);
9061 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9062 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9063 NumVDataDwords, IsAtomicPacked16Bit, DL);
9064}
9065
9066SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9067 SDValue Offset, SDValue CachePolicy,
9068 SelectionDAG &DAG) const {
9070
9071 const DataLayout &DataLayout = DAG.getDataLayout();
9072 Align Alignment =
9074
9079 VT.getStoreSize(), Alignment);
9080
9081 if (!Offset->isDivergent()) {
9082 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9083
9084 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9085 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9086 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9087 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9088 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9089 SDValue BufferLoad =
9091 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9092 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9093 }
9094
9095 // Widen vec3 load to vec4.
9096 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9097 !Subtarget->hasScalarDwordx3Loads()) {
9098 EVT WidenedVT =
9100 auto WidenedOp = DAG.getMemIntrinsicNode(
9101 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9102 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9103 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9104 DAG.getVectorIdxConstant(0, DL));
9105 return Subvector;
9106 }
9107
9109 DAG.getVTList(VT), Ops, VT, MMO);
9110 }
9111
9112 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9113 // assume that the buffer is unswizzled.
9114 SDValue Ops[] = {
9115 DAG.getEntryNode(), // Chain
9116 Rsrc, // rsrc
9117 DAG.getConstant(0, DL, MVT::i32), // vindex
9118 {}, // voffset
9119 {}, // soffset
9120 {}, // offset
9121 CachePolicy, // cachepolicy
9122 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9123 };
9124 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9125 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9126 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9127 }
9128
9130 unsigned NumLoads = 1;
9131 MVT LoadVT = VT.getSimpleVT();
9132 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9133 assert((LoadVT.getScalarType() == MVT::i32 ||
9134 LoadVT.getScalarType() == MVT::f32));
9135
9136 if (NumElts == 8 || NumElts == 16) {
9137 NumLoads = NumElts / 4;
9138 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9139 }
9140
9141 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9142
9143 // Use the alignment to ensure that the required offsets will fit into the
9144 // immediate offsets.
9145 setBufferOffsets(Offset, DAG, &Ops[3],
9146 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9147
9148 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9149 for (unsigned i = 0; i < NumLoads; ++i) {
9150 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9151 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9152 LoadVT, MMO, DAG));
9153 }
9154
9155 if (NumElts == 8 || NumElts == 16)
9156 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9157
9158 return Loads[0];
9159}
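// A brief sketch of the divergent-offset path above with hypothetical sizes:
// a v8f32 s_buffer_load whose offset is divergent becomes two v4f32 MUBUF
// loads whose immediate offsets differ by 16 bytes, concatenated back into
// v8f32; a v16f32 load becomes four such loads. Aligning the buffer offset to
// 16 * NumLoads keeps every piece inside the immediate offset field.
//
//   // NumElts = 8  -> NumLoads = 2, immediate offsets {Base, Base + 16}
//   // NumElts = 16 -> NumLoads = 4, immediate offsets {Base, Base + 16, Base + 32, Base + 48}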
9160
9161SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9162 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9163 if (!Subtarget->hasArchitectedSGPRs())
9164 return {};
9165 SDLoc SL(Op);
9166 MVT VT = MVT::i32;
9167 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9168 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9169 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9170}
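// Equivalent scalar form of the bitfield extract above (hypothetical helper):
//
//   static unsigned waveIdInGroup(uint32_t TTMP8) {
//     return (TTMP8 >> 25) & 0x1f; // bits [29:25]: offset 25, width 5
//   }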
9171
9172SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9173 unsigned Dim,
9174 const ArgDescriptor &Arg) const {
9175 SDLoc SL(Op);
9177 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9178 if (MaxID == 0)
9179 return DAG.getConstant(0, SL, MVT::i32);
9180
9181 // It's undefined behavior if a function marked with the amdgpu-no-*
9182 // attributes uses the corresponding intrinsic.
9183 if (!Arg)
9184 return DAG.getPOISON(Op->getValueType(0));
9185
9186 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9187 SDLoc(DAG.getEntryNode()), Arg);
9188
9189 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9190 // masking operations anyway.
9191 //
9192 // TODO: We could assert the top bit is 0 for the source copy.
9193 if (Arg.isMasked())
9194 return Val;
9195
9196 // Preserve the known bits after expansion to a copy.
9198 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9199 DAG.getValueType(SmallVT));
9200}
9201
9202SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9203 SelectionDAG &DAG) const {
9205 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9206
9207 EVT VT = Op.getValueType();
9208 SDLoc DL(Op);
9209 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9210
9211 // TODO: Should this propagate fast-math-flags?
9212
9213 switch (IntrinsicID) {
9214 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9215 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9216 return emitNonHSAIntrinsicError(DAG, DL, VT);
9217 return getPreloadedValue(DAG, *MFI, VT,
9219 }
9220 case Intrinsic::amdgcn_dispatch_ptr:
9221 case Intrinsic::amdgcn_queue_ptr: {
9222 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9224 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9225 DL.getDebugLoc()));
9226 return DAG.getPOISON(VT);
9227 }
9228
9229 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9232 return getPreloadedValue(DAG, *MFI, VT, RegID);
9233 }
9234 case Intrinsic::amdgcn_implicitarg_ptr: {
9235 if (MFI->isEntryFunction())
9236 return getImplicitArgPtr(DAG, DL);
9237 return getPreloadedValue(DAG, *MFI, VT,
9239 }
9240 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9242 // This only makes sense to call in a kernel, so just lower to null.
9243 return DAG.getConstant(0, DL, VT);
9244 }
9245
9246 return getPreloadedValue(DAG, *MFI, VT,
9248 }
9249 case Intrinsic::amdgcn_dispatch_id: {
9250 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9251 }
9252 case Intrinsic::amdgcn_rcp:
9253 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9254 case Intrinsic::amdgcn_rsq:
9255 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9256 case Intrinsic::amdgcn_rsq_legacy:
9258 return emitRemovedIntrinsicError(DAG, DL, VT);
9259 return SDValue();
9260 case Intrinsic::amdgcn_rcp_legacy:
9262 return emitRemovedIntrinsicError(DAG, DL, VT);
9263 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9264 case Intrinsic::amdgcn_rsq_clamp: {
9266 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9267
9268 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9271
9272 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9273 SDValue Tmp =
9274 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9275 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9276 DAG.getConstantFP(Min, DL, VT));
9277 }
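// A minimal sketch of the rsq_clamp expansion above, assuming Max and Min are
// the largest positive and negative finite values of the type:
//
//   rsq_clamp(x) ~= fmaxnum(fminnum(rsq(x), Max), Min)
//
// so an infinite rsq result (e.g. for a zero input) is clamped to the largest
// finite value of the matching sign.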
9278 case Intrinsic::r600_read_ngroups_x:
9279 if (Subtarget->isAmdHsaOS())
9280 return emitNonHSAIntrinsicError(DAG, DL, VT);
9281
9282 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9284 false);
9285 case Intrinsic::r600_read_ngroups_y:
9286 if (Subtarget->isAmdHsaOS())
9287 return emitNonHSAIntrinsicError(DAG, DL, VT);
9288
9289 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9291 false);
9292 case Intrinsic::r600_read_ngroups_z:
9293 if (Subtarget->isAmdHsaOS())
9294 return emitNonHSAIntrinsicError(DAG, DL, VT);
9295
9296 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9298 false);
9299 case Intrinsic::r600_read_local_size_x:
9300 if (Subtarget->isAmdHsaOS())
9301 return emitNonHSAIntrinsicError(DAG, DL, VT);
9302
9303 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9305 case Intrinsic::r600_read_local_size_y:
9306 if (Subtarget->isAmdHsaOS())
9307 return emitNonHSAIntrinsicError(DAG, DL, VT);
9308
9309 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9311 case Intrinsic::r600_read_local_size_z:
9312 if (Subtarget->isAmdHsaOS())
9313 return emitNonHSAIntrinsicError(DAG, DL, VT);
9314
9315 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9317 case Intrinsic::amdgcn_workgroup_id_x:
9318 return getPreloadedValue(DAG, *MFI, VT,
9320 case Intrinsic::amdgcn_workgroup_id_y:
9321 return getPreloadedValue(DAG, *MFI, VT,
9323 case Intrinsic::amdgcn_workgroup_id_z:
9324 return getPreloadedValue(DAG, *MFI, VT,
9326 case Intrinsic::amdgcn_wave_id:
9327 return lowerWaveID(DAG, Op);
9328 case Intrinsic::amdgcn_lds_kernel_id: {
9329 if (MFI->isEntryFunction())
9330 return getLDSKernelId(DAG, DL);
9331 return getPreloadedValue(DAG, *MFI, VT,
9333 }
9334 case Intrinsic::amdgcn_workitem_id_x:
9335 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9336 case Intrinsic::amdgcn_workitem_id_y:
9337 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9338 case Intrinsic::amdgcn_workitem_id_z:
9339 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9340 case Intrinsic::amdgcn_wavefrontsize:
9342 SDLoc(Op), MVT::i32);
9343 case Intrinsic::amdgcn_s_buffer_load: {
9344 unsigned CPol = Op.getConstantOperandVal(3);
9345 // s_buffer_load, because of how it's optimized, can't be volatile
9346 // so reject ones with the volatile bit set.
9347 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9350 return Op;
9351 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9352 Op.getOperand(3), DAG);
9353 }
9354 case Intrinsic::amdgcn_fdiv_fast:
9355 return lowerFDIV_FAST(Op, DAG);
9356 case Intrinsic::amdgcn_sin:
9357 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9358
9359 case Intrinsic::amdgcn_cos:
9360 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9361
9362 case Intrinsic::amdgcn_mul_u24:
9363 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9364 Op.getOperand(2));
9365 case Intrinsic::amdgcn_mul_i24:
9366 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9367 Op.getOperand(2));
9368
9369 case Intrinsic::amdgcn_log_clamp: {
9371 return SDValue();
9372
9373 return emitRemovedIntrinsicError(DAG, DL, VT);
9374 }
9375 case Intrinsic::amdgcn_fract:
9376 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9377
9378 case Intrinsic::amdgcn_class:
9379 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9380 Op.getOperand(2));
9381 case Intrinsic::amdgcn_div_fmas:
9382 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9383 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9384
9385 case Intrinsic::amdgcn_div_fixup:
9386 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9387 Op.getOperand(2), Op.getOperand(3));
9388
9389 case Intrinsic::amdgcn_div_scale: {
9390 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9391
9392 // Translate to the operands expected by the machine instruction. The
9393 // first parameter must be the same as the first instruction.
9394 SDValue Numerator = Op.getOperand(1);
9395 SDValue Denominator = Op.getOperand(2);
9396
9397 // Note this order is opposite of the machine instruction's operations,
9398 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9399 // intrinsic has the numerator as the first operand to match a normal
9400 // division operation.
9401
9402 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9403
9404 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9405 Denominator, Numerator);
9406 }
9407 case Intrinsic::amdgcn_icmp: {
9408 // There is a Pat that handles this variant, so return it as-is.
9409 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9410 Op.getConstantOperandVal(2) == 0 &&
9411 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9412 return Op;
9413 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9414 }
9415 case Intrinsic::amdgcn_fcmp: {
9416 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9417 }
9418 case Intrinsic::amdgcn_ballot:
9419 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9420 case Intrinsic::amdgcn_fmed3:
9421 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9422 Op.getOperand(2), Op.getOperand(3));
9423 case Intrinsic::amdgcn_fdot2:
9424 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9425 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9426 case Intrinsic::amdgcn_fmul_legacy:
9427 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9428 Op.getOperand(2));
9429 case Intrinsic::amdgcn_sffbh:
9430 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9431 case Intrinsic::amdgcn_sbfe:
9432 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9433 Op.getOperand(2), Op.getOperand(3));
9434 case Intrinsic::amdgcn_ubfe:
9435 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9436 Op.getOperand(2), Op.getOperand(3));
9437 case Intrinsic::amdgcn_cvt_pkrtz:
9438 case Intrinsic::amdgcn_cvt_pknorm_i16:
9439 case Intrinsic::amdgcn_cvt_pknorm_u16:
9440 case Intrinsic::amdgcn_cvt_pk_i16:
9441 case Intrinsic::amdgcn_cvt_pk_u16: {
9442 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9443 EVT VT = Op.getValueType();
9444 unsigned Opcode;
9445
9446 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9448 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9450 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9452 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9454 else
9456
9457 if (isTypeLegal(VT))
9458 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9459
9460 SDValue Node =
9461 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9462 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9463 }
9464 case Intrinsic::amdgcn_fmad_ftz:
9465 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9466 Op.getOperand(2), Op.getOperand(3));
9467
9468 case Intrinsic::amdgcn_if_break:
9469 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9470 Op->getOperand(1), Op->getOperand(2)),
9471 0);
9472
9473 case Intrinsic::amdgcn_groupstaticsize: {
9475 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9476 return Op;
9477
9478 const Module *M = MF.getFunction().getParent();
9479 const GlobalValue *GV =
9480 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9481 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
9483 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9484 }
9485 case Intrinsic::amdgcn_is_shared:
9486 case Intrinsic::amdgcn_is_private: {
9487 SDLoc SL(Op);
9488 SDValue SrcVec =
9489 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9490 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9491 DAG.getConstant(1, SL, MVT::i32));
9492
9493 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9496 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9497 Subtarget->hasGloballyAddressableScratch()) {
9498 SDValue FlatScratchBaseHi(
9499 DAG.getMachineNode(
9500 AMDGPU::S_MOV_B32, DL, MVT::i32,
9501 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9502 0);
9503 // Test bits 63..58 against the aperture address.
9504 return DAG.getSetCC(
9505 SL, MVT::i1,
9506 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9507 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9508 }
9509
9510 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9511 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9512 }
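// Scalar sketch of the two address-space tests above (hypothetical helpers):
// the generic path compares the high 32 bits of the flat pointer against the
// aperture base, while the globally-addressable-scratch path only requires the
// top 6 bits (bits 63..58) to match the flat scratch base.
//
//   static bool hiWordMatchesAperture(uint64_t FlatPtr, uint32_t ApertureHi) {
//     return uint32_t(FlatPtr >> 32) == ApertureHi;
//   }
//   static bool topBitsMatchScratchBase(uint64_t FlatPtr, uint32_t BaseHi) {
//     return ((uint32_t(FlatPtr >> 32) ^ BaseHi) >> 26) == 0;
//   }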
9513 case Intrinsic::amdgcn_perm:
9514 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9515 Op.getOperand(2), Op.getOperand(3));
9516 case Intrinsic::amdgcn_reloc_constant: {
9517 Module *M = MF.getFunction().getParent();
9518 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9519 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9520 auto *RelocSymbol = cast<GlobalVariable>(
9521 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
9522 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
9524 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9525 }
9526 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9527 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9529 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9530 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9534 if (Op.getOperand(4).getValueType() == MVT::i32)
9535 return SDValue();
9536
9537 SDLoc SL(Op);
9538 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9539 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9540 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9541 Op.getOperand(3), IndexKeyi32);
9542 }
9543 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9544 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9545 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9546 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9547 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9548 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9549 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9550 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9551 if (Op.getOperand(4).getValueType() == MVT::i64)
9552 return SDValue();
9553
9554 SDLoc SL(Op);
9555 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
9556 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9557 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9558 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9559 Op.getOperand(6)});
9560 }
9561 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9562 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9563 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9564 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9565 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9566 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9567 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9568 ? MVT::i64
9569 : MVT::i32;
9570 if (Op.getOperand(6).getValueType() == IndexKeyTy)
9571 return SDValue();
9572
9573 SDLoc SL(Op);
9574 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
9575 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9576 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9577 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9578 IndexKey, Op.getOperand(7),
9579 Op.getOperand(8)}); // No clamp operand
9580 }
9581 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9582 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9583 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9584 if (Op.getOperand(6).getValueType() == MVT::i32)
9585 return SDValue();
9586
9587 SDLoc SL(Op);
9588 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9589 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9590 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9591 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9592 IndexKeyi32, Op.getOperand(7)});
9593 }
9594 case Intrinsic::amdgcn_addrspacecast_nonnull:
9595 return lowerADDRSPACECAST(Op, DAG);
9596 case Intrinsic::amdgcn_readlane:
9597 case Intrinsic::amdgcn_readfirstlane:
9598 case Intrinsic::amdgcn_writelane:
9599 case Intrinsic::amdgcn_permlane16:
9600 case Intrinsic::amdgcn_permlanex16:
9601 case Intrinsic::amdgcn_permlane64:
9602 case Intrinsic::amdgcn_set_inactive:
9603 case Intrinsic::amdgcn_set_inactive_chain_arg:
9604 case Intrinsic::amdgcn_mov_dpp8:
9605 case Intrinsic::amdgcn_update_dpp:
9606 return lowerLaneOp(*this, Op.getNode(), DAG);
9607 case Intrinsic::amdgcn_dead: {
9609 for (const EVT ValTy : Op.getNode()->values())
9610 Poisons.push_back(DAG.getPOISON(ValTy));
9611 return DAG.getMergeValues(Poisons, SDLoc(Op));
9612 }
9613 default:
9614 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9616 return lowerImage(Op, ImageDimIntr, DAG, false);
9617
9618 return Op;
9619 }
9620}
9621
9622// On targets that do not support a constant in the soffset field, turn a zero
9623// into SGPR_NULL to avoid generating an extra s_mov with zero.
9624static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9625 const GCNSubtarget *Subtarget) {
9626 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9627 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9628 return SOffset;
9629}
9630
9631SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9632 SelectionDAG &DAG,
9633 unsigned NewOpcode) const {
9634 SDLoc DL(Op);
9635
9636 SDValue VData = Op.getOperand(2);
9637 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9638 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9639 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9640 SDValue Ops[] = {
9641 Op.getOperand(0), // Chain
9642 VData, // vdata
9643 Rsrc, // rsrc
9644 DAG.getConstant(0, DL, MVT::i32), // vindex
9645 VOffset, // voffset
9646 SOffset, // soffset
9647 Offset, // offset
9648 Op.getOperand(6), // cachepolicy
9649 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9650 };
9651
9652 auto *M = cast<MemSDNode>(Op);
9653
9654 EVT MemVT = VData.getValueType();
9655 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9656 M->getMemOperand());
9657}
9658
9659SDValue
9660SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9661 unsigned NewOpcode) const {
9662 SDLoc DL(Op);
9663
9664 SDValue VData = Op.getOperand(2);
9665 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9666 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9667 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9668 SDValue Ops[] = {
9669 Op.getOperand(0), // Chain
9670 VData, // vdata
9671 Rsrc, // rsrc
9672 Op.getOperand(4), // vindex
9673 VOffset, // voffset
9674 SOffset, // soffset
9675 Offset, // offset
9676 Op.getOperand(7), // cachepolicy
9677 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9678 };
9679
9680 auto *M = cast<MemSDNode>(Op);
9681
9682 EVT MemVT = VData.getValueType();
9683 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9684 M->getMemOperand());
9685}
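// The two helpers above build the same machine operand list; the differences,
// visible directly in the code, are only where the index comes from:
//
//   lowerRawBufferAtomicIntrin:    vindex = constant 0,        idxen = 0
//   lowerStructBufferAtomicIntrin: vindex = intrinsic operand, idxen = 1
//
// In both cases splitBufferOffsets splits the incoming byte offset into a
// VGPR voffset plus an immediate offset, and selectSOffset turns a zero
// soffset into SGPR_NULL on subtargets with a restricted soffset field.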
9686
9687SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9688 SelectionDAG &DAG) const {
9689 unsigned IntrID = Op.getConstantOperandVal(1);
9690 SDLoc DL(Op);
9691
9692 switch (IntrID) {
9693 case Intrinsic::amdgcn_ds_ordered_add:
9694 case Intrinsic::amdgcn_ds_ordered_swap: {
9695 MemSDNode *M = cast<MemSDNode>(Op);
9696 SDValue Chain = M->getOperand(0);
9697 SDValue M0 = M->getOperand(2);
9698 SDValue Value = M->getOperand(3);
9699 unsigned IndexOperand = M->getConstantOperandVal(7);
9700 unsigned WaveRelease = M->getConstantOperandVal(8);
9701 unsigned WaveDone = M->getConstantOperandVal(9);
9702
9703 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9704 IndexOperand &= ~0x3f;
9705 unsigned CountDw = 0;
9706
9707 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9708 CountDw = (IndexOperand >> 24) & 0xf;
9709 IndexOperand &= ~(0xf << 24);
9710
9711 if (CountDw < 1 || CountDw > 4) {
9712 const Function &Fn = DAG.getMachineFunction().getFunction();
9714 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9715 DL.getDebugLoc()));
9716 CountDw = 1;
9717 }
9718 }
9719
9720 if (IndexOperand) {
9721 const Function &Fn = DAG.getMachineFunction().getFunction();
9723 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9724 }
9725
9726 if (WaveDone && !WaveRelease) {
9727 // TODO: Move this to IR verifier
9728 const Function &Fn = DAG.getMachineFunction().getFunction();
9730 Fn, "ds_ordered_count: wave_done requires wave_release",
9731 DL.getDebugLoc()));
9732 }
9733
9734 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9735 unsigned ShaderType =
9737 unsigned Offset0 = OrderedCountIndex << 2;
9738 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9739
9740 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9741 Offset1 |= (CountDw - 1) << 6;
9742
9743 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9744 Offset1 |= ShaderType << 2;
9745
9746 unsigned Offset = Offset0 | (Offset1 << 8);
9747
9748 SDValue Ops[] = {
9749 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9750 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9751 };
9753 M->getVTList(), Ops, M->getMemoryVT(),
9754 M->getMemOperand());
9755 }
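// A compact sketch of the ds_ordered_count offset encoding assembled above
// (hypothetical helper; it folds in the shader-type field, which is only
// emitted before GFX11, and the dword-count field, which is only emitted on
// GFX10 and later):
//
//   static unsigned encodeDSOrderedOffset(unsigned Index, bool WaveRelease,
//                                         bool WaveDone, bool IsSwap,
//                                         unsigned ShaderType, unsigned CountDw) {
//     unsigned Offset0 = Index << 2;
//     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
//                        (IsSwap << 4) | ((CountDw - 1) << 6);
//     return Offset0 | (Offset1 << 8);
//   }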
9756 case Intrinsic::amdgcn_raw_buffer_load:
9757 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9758 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9759 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9760 case Intrinsic::amdgcn_raw_buffer_load_format:
9761 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9762 const bool IsFormat =
9763 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9764 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9765
9766 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9767 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9768 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9769 SDValue Ops[] = {
9770 Op.getOperand(0), // Chain
9771 Rsrc, // rsrc
9772 DAG.getConstant(0, DL, MVT::i32), // vindex
9773 VOffset, // voffset
9774 SOffset, // soffset
9775 Offset, // offset
9776 Op.getOperand(5), // cachepolicy, swizzled buffer
9777 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9778 };
9779
9780 auto *M = cast<MemSDNode>(Op);
9781 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9782 }
9783 case Intrinsic::amdgcn_struct_buffer_load:
9784 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9785 case Intrinsic::amdgcn_struct_buffer_load_format:
9786 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9787 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9788 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9789 const bool IsFormat =
9790 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9791 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9792
9793 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9794 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9795 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9796 SDValue Ops[] = {
9797 Op.getOperand(0), // Chain
9798 Rsrc, // rsrc
9799 Op.getOperand(3), // vindex
9800 VOffset, // voffset
9801 SOffset, // soffset
9802 Offset, // offset
9803 Op.getOperand(6), // cachepolicy, swizzled buffer
9804 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9805 };
9806
9807 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9808 }
9809 case Intrinsic::amdgcn_raw_tbuffer_load:
9810 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9811 MemSDNode *M = cast<MemSDNode>(Op);
9812 EVT LoadVT = Op.getValueType();
9813 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9814 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9815 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9816
9817 SDValue Ops[] = {
9818 Op.getOperand(0), // Chain
9819 Rsrc, // rsrc
9820 DAG.getConstant(0, DL, MVT::i32), // vindex
9821 VOffset, // voffset
9822 SOffset, // soffset
9823 Offset, // offset
9824 Op.getOperand(5), // format
9825 Op.getOperand(6), // cachepolicy, swizzled buffer
9826 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9827 };
9828
9829 if (LoadVT.getScalarType() == MVT::f16)
9830 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9831 Ops);
9832 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9833 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9834 DAG);
9835 }
9836 case Intrinsic::amdgcn_struct_tbuffer_load:
9837 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9838 MemSDNode *M = cast<MemSDNode>(Op);
9839 EVT LoadVT = Op.getValueType();
9840 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9841 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9842 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9843
9844 SDValue Ops[] = {
9845 Op.getOperand(0), // Chain
9846 Rsrc, // rsrc
9847 Op.getOperand(3), // vindex
9848 VOffset, // voffset
9849 SOffset, // soffset
9850 Offset, // offset
9851 Op.getOperand(6), // format
9852 Op.getOperand(7), // cachepolicy, swizzled buffer
9853 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9854 };
9855
9856 if (LoadVT.getScalarType() == MVT::f16)
9857 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9858 Ops);
9859 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9860 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9861 DAG);
9862 }
9863 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9864 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9865 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9866 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9867 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9868 return lowerStructBufferAtomicIntrin(Op, DAG,
9870 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9871 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9872 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9873 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9875 return lowerStructBufferAtomicIntrin(Op, DAG,
9877 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9878 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9879 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9880 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9882 return lowerStructBufferAtomicIntrin(Op, DAG,
9884 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9885 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9886 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9887 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9889 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9890 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9892 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9893 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9895 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9896 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9898 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9899 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9901 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9902 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9904 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9905 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9907 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9908 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9910 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9911 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9913 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9914 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9916 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9917 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9919 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9920 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9921 return lowerRawBufferAtomicIntrin(Op, DAG,
9923 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9925 return lowerStructBufferAtomicIntrin(Op, DAG,
9927 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9929 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9930 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9931 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9932 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9933 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9935 return lowerStructBufferAtomicIntrin(Op, DAG,
9937 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9939 return lowerStructBufferAtomicIntrin(Op, DAG,
9941 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9942 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9943 return lowerStructBufferAtomicIntrin(Op, DAG,
9945 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9947 return lowerStructBufferAtomicIntrin(Op, DAG,
9949 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9950 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9951 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9952 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9954 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9955 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9957 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9958 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9960 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9961 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9963 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9964 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9965 return lowerStructBufferAtomicIntrin(Op, DAG,
9967
9968 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9969 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9970 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9971 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9972 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9973 SDValue Ops[] = {
9974 Op.getOperand(0), // Chain
9975 Op.getOperand(2), // src
9976 Op.getOperand(3), // cmp
9977 Rsrc, // rsrc
9978 DAG.getConstant(0, DL, MVT::i32), // vindex
9979 VOffset, // voffset
9980 SOffset, // soffset
9981 Offset, // offset
9982 Op.getOperand(7), // cachepolicy
9983 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9984 };
9985 EVT VT = Op.getValueType();
9986 auto *M = cast<MemSDNode>(Op);
9987
9989 Op->getVTList(), Ops, VT,
9990 M->getMemOperand());
9991 }
9992 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9993 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9994 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9995 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9996 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9997 SDValue Ops[] = {
9998 Op.getOperand(0), // Chain
9999 Op.getOperand(2), // src
10000 Op.getOperand(3), // cmp
10001 Rsrc, // rsrc
10002 Op.getOperand(5), // vindex
10003 VOffset, // voffset
10004 SOffset, // soffset
10005 Offset, // offset
10006 Op.getOperand(8), // cachepolicy
10007 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10008 };
10009 EVT VT = Op.getValueType();
10010 auto *M = cast<MemSDNode>(Op);
10011
10013 Op->getVTList(), Ops, VT,
10014 M->getMemOperand());
10015 }
10016 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10017 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10018 MemSDNode *M = cast<MemSDNode>(Op);
10019 SDValue NodePtr = M->getOperand(2);
10020 SDValue RayExtent = M->getOperand(3);
10021 SDValue InstanceMask = M->getOperand(4);
10022 SDValue RayOrigin = M->getOperand(5);
10023 SDValue RayDir = M->getOperand(6);
10024 SDValue Offsets = M->getOperand(7);
10025 SDValue TDescr = M->getOperand(8);
10026
10027 assert(NodePtr.getValueType() == MVT::i64);
10028 assert(RayDir.getValueType() == MVT::v3f32);
10029
10030 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10031 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10032 return SDValue();
10033 }
10034
10035 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10036 const unsigned NumVDataDwords = 10;
10037 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10038 int Opcode = AMDGPU::getMIMGOpcode(
10039 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10040 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10041 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10042 assert(Opcode != -1);
10043
10045 Ops.push_back(NodePtr);
10046 Ops.push_back(DAG.getBuildVector(
10047 MVT::v2i32, DL,
10048 {DAG.getBitcast(MVT::i32, RayExtent),
10049 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10050 Ops.push_back(RayOrigin);
10051 Ops.push_back(RayDir);
10052 Ops.push_back(Offsets);
10053 Ops.push_back(TDescr);
10054 Ops.push_back(M->getChain());
10055
10056 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10057 MachineMemOperand *MemRef = M->getMemOperand();
10058 DAG.setNodeMemRefs(NewNode, {MemRef});
10059 return SDValue(NewNode, 0);
10060 }
10061 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10062 MemSDNode *M = cast<MemSDNode>(Op);
10063 SDValue NodePtr = M->getOperand(2);
10064 SDValue RayExtent = M->getOperand(3);
10065 SDValue RayOrigin = M->getOperand(4);
10066 SDValue RayDir = M->getOperand(5);
10067 SDValue RayInvDir = M->getOperand(6);
10068 SDValue TDescr = M->getOperand(7);
10069
10070 assert(NodePtr.getValueType() == MVT::i32 ||
10071 NodePtr.getValueType() == MVT::i64);
10072 assert(RayDir.getValueType() == MVT::v3f16 ||
10073 RayDir.getValueType() == MVT::v3f32);
10074
10075 if (!Subtarget->hasGFX10_AEncoding()) {
10076 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10077 return SDValue();
10078 }
10079
10080 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10081 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10082 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10083 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10084 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10085 const unsigned NumVDataDwords = 4;
10086 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10087 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10088 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10089 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10090 IsGFX12Plus;
10091 const unsigned BaseOpcodes[2][2] = {
10092 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10093 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10094 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10095 int Opcode;
10096 if (UseNSA) {
10097 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10098 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10099 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10100 : AMDGPU::MIMGEncGfx10NSA,
10101 NumVDataDwords, NumVAddrDwords);
10102 } else {
10103 assert(!IsGFX12Plus);
10104 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10105 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10106 : AMDGPU::MIMGEncGfx10Default,
10107 NumVDataDwords, NumVAddrDwords);
10108 }
10109 assert(Opcode != -1);
10110
10111 SmallVector<SDValue, 16> Ops;
10112
10113 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10115 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10116 if (Lanes[0].getValueSizeInBits() == 32) {
10117 for (unsigned I = 0; I < 3; ++I)
10118 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10119 } else {
10120 if (IsAligned) {
10121 Ops.push_back(DAG.getBitcast(
10122 MVT::i32,
10123 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10124 Ops.push_back(Lanes[2]);
10125 } else {
10126 SDValue Elt0 = Ops.pop_back_val();
10127 Ops.push_back(DAG.getBitcast(
10128 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10129 Ops.push_back(DAG.getBitcast(
10130 MVT::i32,
10131 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10132 }
10133 }
10134 };
10135
10136 if (UseNSA && IsGFX11Plus) {
10137 Ops.push_back(NodePtr);
10138 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10139 Ops.push_back(RayOrigin);
10140 if (IsA16) {
10141 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10142 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10143 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10144 for (unsigned I = 0; I < 3; ++I) {
10145 MergedLanes.push_back(DAG.getBitcast(
10146 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10147 {DirLanes[I], InvDirLanes[I]})));
10148 }
10149 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10150 } else {
10151 Ops.push_back(RayDir);
10152 Ops.push_back(RayInvDir);
10153 }
10154 } else {
10155 if (Is64)
10156 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10157 2);
10158 else
10159 Ops.push_back(NodePtr);
10160
10161 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10162 packLanes(RayOrigin, true);
10163 packLanes(RayDir, true);
10164 packLanes(RayInvDir, false);
10165 }
10166
10167 if (!UseNSA) {
10168 // Build a single vector containing all the operands so far prepared.
10169 if (NumVAddrDwords > 12) {
10170 SDValue Undef = DAG.getPOISON(MVT::i32);
10171 Ops.append(16 - Ops.size(), Undef);
10172 }
10173 assert(Ops.size() >= 8 && Ops.size() <= 12);
10174 SDValue MergedOps =
10175 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10176 Ops.clear();
10177 Ops.push_back(MergedOps);
10178 }
10179
10180 Ops.push_back(TDescr);
10181 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10182 Ops.push_back(M->getChain());
10183
10184 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10185 MachineMemOperand *MemRef = M->getMemOperand();
10186 DAG.setNodeMemRefs(NewNode, {MemRef});
10187 return SDValue(NewNode, 0);
10188 }
10189 case Intrinsic::amdgcn_global_atomic_fmin_num:
10190 case Intrinsic::amdgcn_global_atomic_fmax_num:
10191 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10192 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10193 MemSDNode *M = cast<MemSDNode>(Op);
10194 SDValue Ops[] = {
10195 M->getOperand(0), // Chain
10196 M->getOperand(2), // Ptr
10197 M->getOperand(3) // Value
10198 };
10199 unsigned Opcode = 0;
10200 switch (IntrID) {
10201 case Intrinsic::amdgcn_global_atomic_fmin_num:
10202 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10203 Opcode = ISD::ATOMIC_LOAD_FMIN;
10204 break;
10205 }
10206 case Intrinsic::amdgcn_global_atomic_fmax_num:
10207 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10208 Opcode = ISD::ATOMIC_LOAD_FMAX;
10209 break;
10210 }
10211 default:
10212 llvm_unreachable("unhandled atomic opcode");
10213 }
10214 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10215 Ops, M->getMemOperand());
10216 }
10217 case Intrinsic::amdgcn_s_get_barrier_state:
10218 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10219 SDValue Chain = Op->getOperand(0);
10220 SmallVector<SDValue, 2> Ops;
10221 unsigned Opc;
10222
10223 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10224 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10225 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10226 BarID = (BarID >> 4) & 0x3F;
10227 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10228 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10229 Ops.push_back(K);
10230 Ops.push_back(Chain);
10231 } else {
10232 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10233 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10234 SDValue M0Val;
10235 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10236 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10237 M0Val = SDValue(
10238 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10239 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10240 0);
10241 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10242 } else
10243 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10244 }
10245
10246 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10247 return SDValue(NewMI, 0);
10248 }
10249 default:
10250
10251 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10252 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10253 return lowerImage(Op, ImageDimIntr, DAG, true);
10254
10255 return SDValue();
10256 }
10257}
10258
10259// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10260// dwordx4 if on SI and handle TFE loads.
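// For example: a TFE load of v3f32 computes NumValueDWords = 3, issues the
// operation as v4i32 (three value dwords plus one status dword), then
// extracts the value dwords, bitcasts them back to v3f32, and returns the
// value together with the status word and the chain.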
10261SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10262 SDVTList VTList,
10263 ArrayRef<SDValue> Ops, EVT MemVT,
10264 MachineMemOperand *MMO,
10265 SelectionDAG &DAG) const {
10266 LLVMContext &C = *DAG.getContext();
10267 MachineFunction &MF = DAG.getMachineFunction();
10268 EVT VT = VTList.VTs[0];
10269
10270 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10271 bool IsTFE = VTList.NumVTs == 3;
10272 if (IsTFE) {
10273 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10274 unsigned NumOpDWords = NumValueDWords + 1;
10275 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10276 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10277 MachineMemOperand *OpDWordsMMO =
10278 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10279 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10280 OpDWordsVT, OpDWordsMMO, DAG);
10281 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10282 DAG.getVectorIdxConstant(NumValueDWords, DL));
10283 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10284 SDValue ValueDWords =
10285 NumValueDWords == 1
10286 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10287 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10288 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10289 ZeroIdx);
10290 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10291 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10292 }
10293
10294 if (!Subtarget->hasDwordx3LoadStores() &&
10295 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10296 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10297 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10298 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10299 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10300 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10301 WidenedMemVT, WidenedMMO);
10302 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10303 DAG.getVectorIdxConstant(0, DL));
10304 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10305 }
10306
10307 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10308}
10309
10310SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10311 bool ImageStore) const {
10312 EVT StoreVT = VData.getValueType();
10313
10314 // No change for f16 and legal vector D16 types.
10315 if (!StoreVT.isVector())
10316 return VData;
10317
10318 SDLoc DL(VData);
10319 unsigned NumElements = StoreVT.getVectorNumElements();
10320
10321 if (Subtarget->hasUnpackedD16VMem()) {
10322 // We need to unpack the packed data to store.
10323 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10324 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10325
10326 EVT EquivStoreVT =
10327 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10328 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10329 return DAG.UnrollVectorOp(ZExt.getNode());
10330 }
10331
10332 // The sq block of gfx8.1 does not estimate register use correctly for d16
10333 // image store instructions. The data operand is computed as if it were not a
10334 // d16 image instruction.
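// For example: a v4f16 store on such a subtarget is bitcast to v4i16, the
// lanes are repacked pairwise into two i32 words, and the result is padded
// with poison up to four i32 words so the register count matches what the
// hardware expects for a non-d16 store.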
10335 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10336 // Bitcast to i16
10337 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10338 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10339
10340 // Decompose into scalars
10341 SmallVector<SDValue, 4> Elts;
10342 DAG.ExtractVectorElements(IntVData, Elts);
10343
10344 // Group pairs of i16 into v2i16 and bitcast to i32
10345 SmallVector<SDValue, 4> PackedElts;
10346 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10347 SDValue Pair =
10348 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10349 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10350 PackedElts.push_back(IntPair);
10351 }
10352 if ((NumElements % 2) == 1) {
10353 // Handle v3i16
10354 unsigned I = Elts.size() / 2;
10355 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10356 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10357 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10358 PackedElts.push_back(IntPair);
10359 }
10360
10361 // Pad using UNDEF
10362 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10363
10364 // Build final vector
10365 EVT VecVT =
10366 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10367 return DAG.getBuildVector(VecVT, DL, PackedElts);
10368 }
10369
10370 if (NumElements == 3) {
10371 EVT IntStoreVT =
10372 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10373 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10374
10375 EVT WidenedStoreVT = EVT::getVectorVT(
10376 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10377 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10378 WidenedStoreVT.getStoreSizeInBits());
10379 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10380 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10381 }
10382
10383 assert(isTypeLegal(StoreVT));
10384 return VData;
10385}
10386
10387SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10388 SelectionDAG &DAG) const {
10389 SDLoc DL(Op);
10390 SDValue Chain = Op.getOperand(0);
10391 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10392 MachineFunction &MF = DAG.getMachineFunction();
10393
10394 switch (IntrinsicID) {
10395 case Intrinsic::amdgcn_exp_compr: {
10396 if (!Subtarget->hasCompressedExport()) {
10397 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10398 DAG.getMachineFunction().getFunction(),
10399 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10400 }
10401 SDValue Src0 = Op.getOperand(4);
10402 SDValue Src1 = Op.getOperand(5);
10403 // Hack around illegal type on SI by directly selecting it.
10404 if (isTypeLegal(Src0.getValueType()))
10405 return SDValue();
10406
10407 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10408 SDValue Undef = DAG.getPOISON(MVT::f32);
10409 const SDValue Ops[] = {
10410 Op.getOperand(2), // tgt
10411 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10412 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10413 Undef, // src2
10414 Undef, // src3
10415 Op.getOperand(7), // vm
10416 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10417 Op.getOperand(3), // en
10418 Op.getOperand(0) // Chain
10419 };
10420
10421 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10422 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10423 }
10424
10425 case Intrinsic::amdgcn_struct_tbuffer_store:
10426 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10427 SDValue VData = Op.getOperand(2);
10428 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10429 if (IsD16)
10430 VData = handleD16VData(VData, DAG);
10431 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10432 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10433 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10434 SDValue Ops[] = {
10435 Chain,
10436 VData, // vdata
10437 Rsrc, // rsrc
10438 Op.getOperand(4), // vindex
10439 VOffset, // voffset
10440 SOffset, // soffset
10441 Offset, // offset
10442 Op.getOperand(7), // format
10443 Op.getOperand(8), // cachepolicy, swizzled buffer
10444 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10445 };
10446 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10447 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10448 MemSDNode *M = cast<MemSDNode>(Op);
10449 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10450 M->getMemoryVT(), M->getMemOperand());
10451 }
10452
10453 case Intrinsic::amdgcn_raw_tbuffer_store:
10454 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10455 SDValue VData = Op.getOperand(2);
10456 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10457 if (IsD16)
10458 VData = handleD16VData(VData, DAG);
10459 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10460 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10461 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10462 SDValue Ops[] = {
10463 Chain,
10464 VData, // vdata
10465 Rsrc, // rsrc
10466 DAG.getConstant(0, DL, MVT::i32), // vindex
10467 VOffset, // voffset
10468 SOffset, // soffset
10469 Offset, // offset
10470 Op.getOperand(6), // format
10471 Op.getOperand(7), // cachepolicy, swizzled buffer
10472 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10473 };
10474 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10475 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10476 MemSDNode *M = cast<MemSDNode>(Op);
10477 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10478 M->getMemoryVT(), M->getMemOperand());
10479 }
10480
10481 case Intrinsic::amdgcn_raw_buffer_store:
10482 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10483 case Intrinsic::amdgcn_raw_buffer_store_format:
10484 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10485 const bool IsFormat =
10486 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10487 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10488
10489 SDValue VData = Op.getOperand(2);
10490 EVT VDataVT = VData.getValueType();
10491 EVT EltType = VDataVT.getScalarType();
10492 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10493 if (IsD16) {
10494 VData = handleD16VData(VData, DAG);
10495 VDataVT = VData.getValueType();
10496 }
10497
10498 if (!isTypeLegal(VDataVT)) {
10499 VData =
10500 DAG.getNode(ISD::BITCAST, DL,
10501 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10502 }
10503
10504 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10505 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10506 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10507 SDValue Ops[] = {
10508 Chain,
10509 VData,
10510 Rsrc,
10511 DAG.getConstant(0, DL, MVT::i32), // vindex
10512 VOffset, // voffset
10513 SOffset, // soffset
10514 Offset, // offset
10515 Op.getOperand(6), // cachepolicy, swizzled buffer
10516 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10517 };
10518 unsigned Opc =
10519 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10520 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10521 MemSDNode *M = cast<MemSDNode>(Op);
10522
10523 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10524 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10525 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10526
10527 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10528 M->getMemoryVT(), M->getMemOperand());
10529 }
10530
10531 case Intrinsic::amdgcn_struct_buffer_store:
10532 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10533 case Intrinsic::amdgcn_struct_buffer_store_format:
10534 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10535 const bool IsFormat =
10536 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10537 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10538
10539 SDValue VData = Op.getOperand(2);
10540 EVT VDataVT = VData.getValueType();
10541 EVT EltType = VDataVT.getScalarType();
10542 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10543
10544 if (IsD16) {
10545 VData = handleD16VData(VData, DAG);
10546 VDataVT = VData.getValueType();
10547 }
10548
10549 if (!isTypeLegal(VDataVT)) {
10550 VData =
10551 DAG.getNode(ISD::BITCAST, DL,
10552 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10553 }
10554
10555 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10556 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10557 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10558 SDValue Ops[] = {
10559 Chain,
10560 VData,
10561 Rsrc,
10562 Op.getOperand(4), // vindex
10563 VOffset, // voffset
10564 SOffset, // soffset
10565 Offset, // offset
10566 Op.getOperand(7), // cachepolicy, swizzled buffer
10567 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10568 };
10569 unsigned Opc =
10570 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10571 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10572 MemSDNode *M = cast<MemSDNode>(Op);
10573
10574 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10575 EVT VDataType = VData.getValueType().getScalarType();
10576 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10577 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10578
10579 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10580 M->getMemoryVT(), M->getMemOperand());
10581 }
10582 case Intrinsic::amdgcn_raw_buffer_load_lds:
10583 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10584 case Intrinsic::amdgcn_struct_buffer_load_lds:
10585 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10586 if (!Subtarget->hasVMemToLDSLoad())
10587 return SDValue();
10588 unsigned Opc;
10589 bool HasVIndex =
10590 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10591 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10592 unsigned OpOffset = HasVIndex ? 1 : 0;
10593 SDValue VOffset = Op.getOperand(5 + OpOffset);
10594 bool HasVOffset = !isNullConstant(VOffset);
10595 unsigned Size = Op->getConstantOperandVal(4);
10596
10597 switch (Size) {
10598 default:
10599 return SDValue();
10600 case 1:
10601 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10602 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10603 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10604 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10605 break;
10606 case 2:
10607 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10608 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10609 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10610 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10611 break;
10612 case 4:
10613 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10614 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10615 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10616 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10617 break;
10618 case 12:
10619 if (!Subtarget->hasLDSLoadB96_B128())
10620 return SDValue();
10621 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10622 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10623 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10624 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10625 break;
10626 case 16:
10627 if (!Subtarget->hasLDSLoadB96_B128())
10628 return SDValue();
10629 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10630 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10631 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10632 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10633 break;
10634 }
10635
10636 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10637
10638 SmallVector<SDValue, 8> Ops;
10639
10640 if (HasVIndex && HasVOffset)
10641 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10642 {Op.getOperand(5), // VIndex
10643 VOffset}));
10644 else if (HasVIndex)
10645 Ops.push_back(Op.getOperand(5));
10646 else if (HasVOffset)
10647 Ops.push_back(VOffset);
10648
10649 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10650 Ops.push_back(Rsrc);
10651 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10652 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10653 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10654 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10655 Ops.push_back(DAG.getTargetConstant(
10656 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10657 DL, MVT::i8)); // cpol
10658 Ops.push_back(DAG.getTargetConstant(
10659 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10660 ? 1
10661 : 0,
10662 DL, MVT::i8)); // swz
10663 Ops.push_back(M0Val.getValue(0)); // Chain
10664 Ops.push_back(M0Val.getValue(1)); // Glue
10665
10666 auto *M = cast<MemSDNode>(Op);
10667 MachineMemOperand *LoadMMO = M->getMemOperand();
10668 // Don't set the offset value here because the pointer points to the base of
10669 // the buffer.
10670 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10671
10672 MachinePointerInfo StorePtrI = LoadPtrI;
10673 LoadPtrI.V = PoisonValue::get(
10677
10678 auto F = LoadMMO->getFlags() &
10680 LoadMMO =
10682 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10683
10685 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10686 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10687
10688 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10689 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10690
10691 return SDValue(Load, 0);
10692 }
10693 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10694 // for "trust me" that the remaining cases are global pointers until
10695 // such time as we can put two mem operands on an intrinsic.
10696 case Intrinsic::amdgcn_load_to_lds:
10697 case Intrinsic::amdgcn_global_load_lds: {
10698 if (!Subtarget->hasVMemToLDSLoad())
10699 return SDValue();
10700
10701 unsigned Opc;
10702 unsigned Size = Op->getConstantOperandVal(4);
10703 switch (Size) {
10704 default:
10705 return SDValue();
10706 case 1:
10707 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10708 break;
10709 case 2:
10710 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10711 break;
10712 case 4:
10713 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10714 break;
10715 case 12:
10716 if (!Subtarget->hasLDSLoadB96_B128())
10717 return SDValue();
10718 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10719 break;
10720 case 16:
10721 if (!Subtarget->hasLDSLoadB96_B128())
10722 return SDValue();
10723 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10724 break;
10725 }
10726
10727 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10728
10730
10731 SDValue Addr = Op.getOperand(2); // Global ptr
10732 SDValue VOffset;
10733 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10734 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10735 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10736 SDValue LHS = Addr.getOperand(0);
10737 SDValue RHS = Addr.getOperand(1);
10738
10739 if (LHS->isDivergent())
10740 std::swap(LHS, RHS);
10741
10742 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10743 RHS.getOperand(0).getValueType() == MVT::i32) {
10744 // add (i64 sgpr), (zero_extend (i32 vgpr))
10745 Addr = LHS;
10746 VOffset = RHS.getOperand(0);
10747 }
10748 }
10749
10750 Ops.push_back(Addr);
10751 if (!Addr->isDivergent()) {
10752 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10753 if (!VOffset)
10754 VOffset =
10755 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10756 DAG.getTargetConstant(0, DL, MVT::i32)),
10757 0);
10758 Ops.push_back(VOffset);
10759 }
10760
10761 Ops.push_back(Op.getOperand(5)); // Offset
10762 Ops.push_back(Op.getOperand(6)); // CPol
10763 Ops.push_back(M0Val.getValue(0)); // Chain
10764 Ops.push_back(M0Val.getValue(1)); // Glue
10765
10766 auto *M = cast<MemSDNode>(Op);
10767 MachineMemOperand *LoadMMO = M->getMemOperand();
10768 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10769 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10770 MachinePointerInfo StorePtrI = LoadPtrI;
10771 LoadPtrI.V = PoisonValue::get(
10775 auto F = LoadMMO->getFlags() &
10777 LoadMMO =
10779 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10781 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10782 LoadMMO->getAAInfo());
10783
10784 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10785 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10786
10787 return SDValue(Load, 0);
10788 }
10789 case Intrinsic::amdgcn_end_cf:
10790 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10791 Op->getOperand(2), Chain),
10792 0);
10793 case Intrinsic::amdgcn_s_barrier_init:
10794 case Intrinsic::amdgcn_s_barrier_signal_var: {
10795 // these two intrinsics have two operands: barrier pointer and member count
10796 SDValue Chain = Op->getOperand(0);
10797 SmallVector<SDValue, 2> Ops;
10798 SDValue BarOp = Op->getOperand(2);
10799 SDValue CntOp = Op->getOperand(3);
10800 SDValue M0Val;
10801 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10802 ? AMDGPU::S_BARRIER_INIT_M0
10803 : AMDGPU::S_BARRIER_SIGNAL_M0;
10804 // extract the BarrierID from bits 4-9 of BarOp
10805 SDValue BarID;
10806 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10807 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10808 BarID =
10809 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10810 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10811 0);
10812 // Member count should be put into M0[ShAmt:+6]
10813 // Barrier ID should be put into M0[5:0]
10814 M0Val =
10815 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10816 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10817 0);
10818 constexpr unsigned ShAmt = 16;
10819 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10820 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10821
10822 M0Val = SDValue(
10823 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10824
10825 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10826
10827 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10828 return SDValue(NewMI, 0);
10829 }
10830 case Intrinsic::amdgcn_s_barrier_join: {
10831 // these three intrinsics have one operand: barrier pointer
10832 SDValue Chain = Op->getOperand(0);
10833 SmallVector<SDValue, 2> Ops;
10834 SDValue BarOp = Op->getOperand(2);
10835 unsigned Opc;
10836
10837 if (isa<ConstantSDNode>(BarOp)) {
10838 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10839 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10840
10841 // extract the BarrierID from bits 4-9 of the immediate
10842 unsigned BarID = (BarVal >> 4) & 0x3F;
10843 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10844 Ops.push_back(K);
10845 Ops.push_back(Chain);
10846 } else {
10847 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10848
10849 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10850 SDValue M0Val;
10851 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10852 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10853 M0Val =
10854 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10855 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10856 0);
10857 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10858 }
10859
10860 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10861 return SDValue(NewMI, 0);
10862 }
10863 case Intrinsic::amdgcn_s_prefetch_data: {
10864 // For non-global address space preserve the chain and remove the call.
10865 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10866 return Op.getOperand(0);
10867 return Op;
10868 }
10869 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10870 SDValue Ops[] = {
10871 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10872 Op.getOperand(3), // offset
10873 Op.getOperand(4), // length
10874 };
10875
10876 MemSDNode *M = cast<MemSDNode>(Op);
10877 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10878 Op->getVTList(), Ops, M->getMemoryVT(),
10879 M->getMemOperand());
10880 }
10881 default: {
10882 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10883 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10884 return lowerImage(Op, ImageDimIntr, DAG, true);
10885
10886 return Op;
10887 }
10888 }
10889}
10890
10891 // Return whether the operation has NoUnsignedWrap property.
10892 static bool isNoUnsignedWrap(SDValue Addr) {
10893 return (Addr.getOpcode() == ISD::ADD &&
10894 Addr->getFlags().hasNoUnsignedWrap()) ||
10895 Addr->getOpcode() == ISD::OR;
10896}
10897
10899 EVT PtrVT) const {
10900 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
10901}
10902
10903// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10904// offset (the offset that is included in bounds checking and swizzling, to be
10905// split between the instruction's voffset and immoffset fields) and soffset
10906// (the offset that is excluded from bounds checking and swizzling, to go in
10907// the instruction's soffset field). This function takes the first kind of
10908// offset and figures out how to split it between voffset and immoffset.
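// Worked example (assuming the common 0xFFF MUBUF immediate limit): a
// combined offset of 0x1234 is split into voffset = 0x1000 and
// immoffset = 0x234; keeping the voffset part a large power of two makes it
// more likely to be CSEd with neighbouring loads/stores.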
10909std::pair<SDValue, SDValue>
10910SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10911 SDLoc DL(Offset);
10912 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10913 SDValue N0 = Offset;
10914 ConstantSDNode *C1 = nullptr;
10915
10916 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10917 N0 = SDValue();
10918 else if (DAG.isBaseWithConstantOffset(N0)) {
10919 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
10920 // being added, so we can only safely match a 32-bit addition with no
10921 // unsigned overflow.
10922 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
10923 if (!CheckNUW || isNoUnsignedWrap(N0)) {
10924 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10925 N0 = N0.getOperand(0);
10926 }
10927 }
10928
10929 if (C1) {
10930 unsigned ImmOffset = C1->getZExtValue();
10931 // If the immediate value is too big for the immoffset field, put only bits
10932 // that would normally fit in the immoffset field. The remaining value that
10933 // is copied/added for the voffset field is a large power of 2, and it
10934 // stands more chance of being CSEd with the copy/add for another similar
10935 // load/store.
10936 // However, do not do that rounding down if that is a negative
10937 // number, as it appears to be illegal to have a negative offset in the
10938 // vgpr, even if adding the immediate offset makes it positive.
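// Illustrative case of the rule above (MaxImm = 0xFFF): ImmOffset =
// 0x80000123 gives Overflow = 0x80000000, which is negative as int32_t, so
// the entire 0x80000123 is moved to the voffset add and ImmOffset becomes 0.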
10939 unsigned Overflow = ImmOffset & ~MaxImm;
10940 ImmOffset -= Overflow;
10941 if ((int32_t)Overflow < 0) {
10942 Overflow += ImmOffset;
10943 ImmOffset = 0;
10944 }
10945 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10946 if (Overflow) {
10947 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10948 if (!N0)
10949 N0 = OverflowVal;
10950 else {
10951 SDValue Ops[] = {N0, OverflowVal};
10952 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10953 }
10954 }
10955 }
10956 if (!N0)
10957 N0 = DAG.getConstant(0, DL, MVT::i32);
10958 if (!C1)
10959 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10960 return {N0, SDValue(C1, 0)};
10961}
10962
10963// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10964// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10965// pointed to by Offsets.
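// For example: a constant combined offset that the MUBUF encoding can hold
// becomes {voffset = 0, soffset = SOffset, instoffset = ImmOffset}; offsets
// that cannot be split land entirely in voffset, with soffset set to zero
// (or SGPR_NULL on subtargets with a restricted soffset).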
10966void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10967 SelectionDAG &DAG, SDValue *Offsets,
10968 Align Alignment) const {
10969 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10970 SDLoc DL(CombinedOffset);
10971 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10972 uint32_t Imm = C->getZExtValue();
10973 uint32_t SOffset, ImmOffset;
10974 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10975 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10976 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10977 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10978 return;
10979 }
10980 }
10981 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10982 SDValue N0 = CombinedOffset.getOperand(0);
10983 SDValue N1 = CombinedOffset.getOperand(1);
10984 uint32_t SOffset, ImmOffset;
10985 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10986 if (Offset >= 0 &&
10987 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10988 Offsets[0] = N0;
10989 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10990 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10991 return;
10992 }
10993 }
10994
10995 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10996 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10997 : DAG.getConstant(0, DL, MVT::i32);
10998
10999 Offsets[0] = CombinedOffset;
11000 Offsets[1] = SOffsetZero;
11001 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11002}
11003
11004SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11005 SelectionDAG &DAG) const {
11006 if (!MaybePointer.getValueType().isScalarInteger())
11007 return MaybePointer;
11008
11009 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11010 return Rsrc;
11011}
11012
11013// Wrap a global or flat pointer into a buffer intrinsic using the flags
11014// specified in the intrinsic.
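// Rough layout of the resulting V#, as built below: word0 = low 32 bits of
// the pointer, word1 = (high pointer bits & 0xffff) | (stride << 16),
// word2 = NumRecords, word3 = Flags; the v4i32 is then bitcast to i128.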
11015SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11016 SelectionDAG &DAG) const {
11017 SDLoc Loc(Op);
11018
11019 SDValue Pointer = Op->getOperand(1);
11020 SDValue Stride = Op->getOperand(2);
11021 SDValue NumRecords = Op->getOperand(3);
11022 SDValue Flags = Op->getOperand(4);
11023
11024 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11025 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11026 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11027 std::optional<uint32_t> ConstStride = std::nullopt;
11028 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11029 ConstStride = ConstNode->getZExtValue();
11030
11031 SDValue NewHighHalf = Masked;
11032 if (!ConstStride || *ConstStride != 0) {
11033 SDValue ShiftedStride;
11034 if (ConstStride) {
11035 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11036 } else {
11037 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11038 ShiftedStride =
11039 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11040 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11041 }
11042 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11043 }
11044
11045 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11046 NewHighHalf, NumRecords, Flags);
11047 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11048 return RsrcPtr;
11049}
11050
11051// Handle 8 bit and 16 bit buffer loads
11052SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11053 EVT LoadVT, SDLoc DL,
11054 ArrayRef<SDValue> Ops,
11055 MachineMemOperand *MMO,
11056 bool IsTFE) const {
11057 EVT IntVT = LoadVT.changeTypeToInteger();
11058
11059 if (IsTFE) {
11060 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11061 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11062 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11063 MachineFunction &MF = DAG.getMachineFunction();
11064 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11065 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11066 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11067 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11068 DAG.getConstant(1, DL, MVT::i32));
11069 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11070 DAG.getConstant(0, DL, MVT::i32));
11071 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11072 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11073 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11074 }
11075
11076 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11077 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11078 : AMDGPUISD::BUFFER_LOAD_USHORT;
11079
11080 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11081 SDValue BufferLoad =
11082 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11083 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11084 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11085
11086 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11087}
11088
11089// Handle 8 bit and 16 bit buffer stores
11090SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11091 EVT VDataType, SDLoc DL,
11092 SDValue Ops[],
11093 MemSDNode *M) const {
11094 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11095 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11096
11097 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11098 Ops[1] = BufferStoreExt;
11099 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11100 : AMDGPUISD::BUFFER_STORE_SHORT;
11101 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11102 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11103 M->getMemOperand());
11104}
11105
11106 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11107 SDValue Op, const SDLoc &SL, EVT VT) {
11108 if (VT.bitsLT(Op.getValueType()))
11109 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11110
11111 switch (ExtType) {
11112 case ISD::SEXTLOAD:
11113 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11114 case ISD::ZEXTLOAD:
11115 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11116 case ISD::EXTLOAD:
11117 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11118 case ISD::NON_EXTLOAD:
11119 return Op;
11120 }
11121
11122 llvm_unreachable("invalid ext type");
11123}
11124
11125// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11126// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
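// For example: a uniform zextload of i8 from the constant address space with
// alignment >= 4 is rewritten as a plain i32 load followed by a
// zero-extend-in-reg of the low 8 bits, which is presumably then selectable
// as a scalar (SMEM) load such as s_load_dword.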
11127SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11128 DAGCombinerInfo &DCI) const {
11129 SelectionDAG &DAG = DCI.DAG;
11130 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11131 return SDValue();
11132
11133 // FIXME: Constant loads should all be marked invariant.
11134 unsigned AS = Ld->getAddressSpace();
11135 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11136 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11137 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11138 return SDValue();
11139
11140 // Don't do this early, since it may interfere with adjacent load merging for
11141 // illegal types. We can avoid losing alignment information for exotic types
11142 // pre-legalize.
11143 EVT MemVT = Ld->getMemoryVT();
11144 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11145 MemVT.getSizeInBits() >= 32)
11146 return SDValue();
11147
11148 SDLoc SL(Ld);
11149
11150 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11151 "unexpected vector extload");
11152
11153 // TODO: Drop only high part of range.
11154 SDValue Ptr = Ld->getBasePtr();
11155 SDValue NewLoad = DAG.getLoad(
11156 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11157 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11158 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11159 nullptr); // Drop ranges
11160
11161 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11162 if (MemVT.isFloatingPoint()) {
11163 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11164 "unexpected fp extload");
11165 TruncVT = MemVT.changeTypeToInteger();
11166 }
11167
11168 SDValue Cvt = NewLoad;
11169 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11170 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11171 DAG.getValueType(TruncVT));
11172 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11173 Ld->getExtensionType() == ISD::EXTLOAD) {
11174 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11175 } else {
11176 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
11177 }
11178
11179 EVT VT = Ld->getValueType(0);
11180 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11181
11182 DCI.AddToWorklist(Cvt.getNode());
11183
11184 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11185 // the appropriate extension from the 32-bit load.
11186 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11187 DCI.AddToWorklist(Cvt.getNode());
11188
11189 // Handle conversion back to floating point if necessary.
11190 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11191
11192 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11193}
11194
11195 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11196 const SIMachineFunctionInfo &Info) {
11197 // TODO: Should check if the address can definitely not access stack.
11198 if (Info.isEntryFunction())
11199 return Info.getUserSGPRInfo().hasFlatScratchInit();
11200 return true;
11201}
11202
11203SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11204 SDLoc DL(Op);
11205 LoadSDNode *Load = cast<LoadSDNode>(Op);
11206 ISD::LoadExtType ExtType = Load->getExtensionType();
11207 EVT MemVT = Load->getMemoryVT();
11208 MachineMemOperand *MMO = Load->getMemOperand();
11209
11210 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11211 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11212 return SDValue();
11213
11214 // FIXME: Copied from PPC
11215 // First, load into 32 bits, then truncate to 1 bit.
11216
11217 SDValue Chain = Load->getChain();
11218 SDValue BasePtr = Load->getBasePtr();
11219
11220 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11221
11222 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11223 RealMemVT, MMO);
11224
11225 if (!MemVT.isVector()) {
11226 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11227 NewLD.getValue(1)};
11228
11229 return DAG.getMergeValues(Ops, DL);
11230 }
11231
11233 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11234 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11235 DAG.getConstant(I, DL, MVT::i32));
11236
11237 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11238 }
11239
11240 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11241
11242 return DAG.getMergeValues(Ops, DL);
11243 }
11244
11245 if (!MemVT.isVector())
11246 return SDValue();
11247
11248 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11249 "Custom lowering for non-i32 vectors hasn't been implemented.");
11250
11251 Align Alignment = Load->getAlign();
11252 unsigned AS = Load->getAddressSpace();
11253 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11254 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11255 return SplitVectorLoad(Op, DAG);
11256 }
11257
11260 // If there is a possibility that flat instruction access scratch memory
11261 // then we need to use the same legalization rules we use for private.
11262 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11263 !Subtarget->hasMultiDwordFlatScratchAddressing())
11264 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11265 ? AMDGPUAS::PRIVATE_ADDRESS
11266 : AMDGPUAS::GLOBAL_ADDRESS;
11267
11268 unsigned NumElements = MemVT.getVectorNumElements();
11269
11270 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11271 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11272 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11273 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11274 isMemOpHasNoClobberedMemOperand(Load))) {
11275 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11276 Alignment >= Align(4) && NumElements < 32) {
11277 if (MemVT.isPow2VectorType() ||
11278 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11279 return SDValue();
11280 return WidenOrSplitVectorLoad(Op, DAG);
11281 }
11282 // Non-uniform loads will be selected to MUBUF instructions, so they
11283 // have the same legalization requirements as global and private
11284 // loads.
11285 //
11286 }
11287 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11288 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11289 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11290 if (NumElements > 4)
11291 return SplitVectorLoad(Op, DAG);
11292 // v3 loads not supported on SI.
11293 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11294 return WidenOrSplitVectorLoad(Op, DAG);
11295
11296 // v3 and v4 loads are supported for private and global memory.
11297 return SDValue();
11298 }
11299 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11300 // Depending on the setting of the private_element_size field in the
11301 // resource descriptor, we can only make private accesses up to a certain
11302 // size.
11303 switch (Subtarget->getMaxPrivateElementSize()) {
11304 case 4: {
11305 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11306 return DAG.getMergeValues({Op0, Op1}, DL);
11307 }
11308 case 8:
11309 if (NumElements > 2)
11310 return SplitVectorLoad(Op, DAG);
11311 return SDValue();
11312 case 16:
11313 // Same as global/flat
11314 if (NumElements > 4)
11315 return SplitVectorLoad(Op, DAG);
11316 // v3 loads not supported on SI.
11317 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11318 return WidenOrSplitVectorLoad(Op, DAG);
11319
11320 return SDValue();
11321 default:
11322 llvm_unreachable("unsupported private_element_size");
11323 }
11324 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11325 unsigned Fast = 0;
11326 auto Flags = Load->getMemOperand()->getFlags();
11327 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11328 Load->getAlign(), Flags, &Fast) &&
11329 Fast > 1)
11330 return SDValue();
11331
11332 if (MemVT.isVector())
11333 return SplitVectorLoad(Op, DAG);
11334 }
11335
11336 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11337 MemVT, *Load->getMemOperand())) {
11338 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11339 return DAG.getMergeValues({Op0, Op1}, DL);
11340 }
11341
11342 return SDValue();
11343}
11344
11345SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11346 EVT VT = Op.getValueType();
11347 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11348 VT.getSizeInBits() == 512)
11349 return splitTernaryVectorOp(Op, DAG);
11350
11351 assert(VT.getSizeInBits() == 64);
11352
11353 SDLoc DL(Op);
11354 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11355
11356 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11357 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11358
11359 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11360 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11361
11362 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11363 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11364
11365 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11366
11367 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11368 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11369
11370 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11371
11372 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11373 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11374}
11375
11376// Catch division cases where we can use shortcuts with rcp and rsq
11377// instructions.
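// Summary of the shortcuts below (only under the stated fast-math
// conditions): 1.0 / x -> rcp(x), -1.0 / x -> rcp(fneg x), and otherwise
// x / y -> x * rcp(y).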
11378SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11379 SelectionDAG &DAG) const {
11380 SDLoc SL(Op);
11381 SDValue LHS = Op.getOperand(0);
11382 SDValue RHS = Op.getOperand(1);
11383 EVT VT = Op.getValueType();
11384 const SDNodeFlags Flags = Op->getFlags();
11385
11386 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11387
11388 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11389 // Without !fpmath accuracy information, we can't do more because we don't
11390 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11391 // f16 is always accurate enough
11392 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11393 return SDValue();
11394
11395 if (CLHS->isExactlyValue(1.0)) {
11396 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11397 // the CI documentation has a worst case error of 1 ulp.
11398 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11399 // use it as long as we aren't trying to use denormals.
11400 //
11401 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
11402
11403 // 1.0 / sqrt(x) -> rsq(x)
11404
11405 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11406 // error seems really high at 2^29 ULP.
11407 // 1.0 / x -> rcp(x)
11408 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11409 }
11410
11411 // Same as for 1.0, but expand the sign out of the constant.
11412 if (CLHS->isExactlyValue(-1.0)) {
11413 // -1.0 / x -> rcp (fneg x)
11414 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11415 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11416 }
11417 }
11418
11419 // For f16 and bf16 require afn or arcp.
11420 // For f32 require afn.
11421 if (!AllowInaccurateRcp &&
11422 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11423 return SDValue();
11424
11425 // Turn into multiply by the reciprocal.
11426 // x / y -> x * (1.0 / y)
11427 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11428 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11429}
11430
11431SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11432 SelectionDAG &DAG) const {
11433 SDLoc SL(Op);
11434 SDValue X = Op.getOperand(0);
11435 SDValue Y = Op.getOperand(1);
11436 EVT VT = Op.getValueType();
11437 const SDNodeFlags Flags = Op->getFlags();
11438
11439 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11440 if (!AllowInaccurateDiv)
11441 return SDValue();
11442
11443 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11444 SDValue One = DAG.getConstantFP(1.0, SL, VT);
11445
11446 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11447 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11448
11449 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11450 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11451 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11452 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11453 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11454 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11455}
11456
11457static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11458 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11459 SDNodeFlags Flags) {
11460 if (GlueChain->getNumValues() <= 1) {
11461 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11462 }
11463
11464 assert(GlueChain->getNumValues() == 3);
11465
11466 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11467 switch (Opcode) {
11468 default:
11469 llvm_unreachable("no chain equivalent for opcode");
11470 case ISD::FMUL:
11471 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11472 break;
11473 }
11474
11475 return DAG.getNode(Opcode, SL, VTList,
11476 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11477 Flags);
11478}
11479
11480static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11481 EVT VT, SDValue A, SDValue B, SDValue C,
11482 SDValue GlueChain, SDNodeFlags Flags) {
11483 if (GlueChain->getNumValues() <= 1) {
11484 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11485 }
11486
11487 assert(GlueChain->getNumValues() == 3);
11488
11489 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11490 switch (Opcode) {
11491 default:
11492 llvm_unreachable("no chain equivalent for opcode");
11493 case ISD::FMA:
11494 Opcode = AMDGPUISD::FMA_W_CHAIN;
11495 break;
11496 }
11497
11498 return DAG.getNode(Opcode, SL, VTList,
11499 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11500 Flags);
11501}
11502
11503SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11504 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11505 return FastLowered;
11506
11507 SDLoc SL(Op);
11508 EVT VT = Op.getValueType();
11509 SDValue LHS = Op.getOperand(0);
11510 SDValue RHS = Op.getOperand(1);
11511
11512 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11513 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11514
11515 if (VT == MVT::bf16) {
11516 SDValue ExtDiv =
11517 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
11518 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
11519 DAG.getTargetConstant(0, SL, MVT::i32));
11520 }
11521
11522 assert(VT == MVT::f16);
11523
11524 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11525 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11526 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11527 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11528 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11529 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11530 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11531 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11532 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11533 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11534 // q16.u = opx(V_CVT_F16_F32, q32.u);
11535 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11536
11537 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11538 unsigned FMADOpCode =
11539 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
11540 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11541 SDValue Rcp =
11542 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11543 SDValue Quot =
11544 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11545 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11546 Op->getFlags());
11547 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11548 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11549 Op->getFlags());
11550 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11551 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11552 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11553 DAG.getConstant(0xff800000, SL, MVT::i32));
11554 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11555 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11556 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11557 DAG.getTargetConstant(0, SL, MVT::i32));
11558 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11559 Op->getFlags());
11560}
11561
11562// Faster 2.5 ULP division that does not support denormals.
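// Sketch of the scaling below: if |rhs| > 2^+96, the denominator is
// pre-scaled by 2^-32 and the final product is multiplied by the same
// factor, so the reciprocal of a huge denominator is not flushed to zero.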
11563SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11564 SDNodeFlags Flags = Op->getFlags();
11565 SDLoc SL(Op);
11566 SDValue LHS = Op.getOperand(1);
11567 SDValue RHS = Op.getOperand(2);
11568
11569 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11570
11571 const APFloat K0Val(0x1p+96f);
11572 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11573
11574 const APFloat K1Val(0x1p-32f);
11575 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11576
11577 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11578
11579 EVT SetCCVT =
11580 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11581
11582 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11583
11584 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11585
11586 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11587
11588 // rcp does not support denormals.
11589 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11590
11591 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11592
11593 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11594}
11595
11596// Returns immediate value for setting the F32 denorm mode when using the
11597 // S_DENORM_MODE instruction.
11598 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11599 const SIMachineFunctionInfo *Info,
11600 const GCNSubtarget *ST) {
11601 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11602 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11603 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11604 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11605}
11606
11607SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11608 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11609 return FastLowered;
11610
11611 // The selection matcher assumes anything with a chain selects to a
11612 // mayRaiseFPException machine instruction. Since we're introducing a chain
11613 // here, we need to explicitly report nofpexcept for the regular fdiv
11614 // lowering.
11615 SDNodeFlags Flags = Op->getFlags();
11616 Flags.setNoFPExcept(true);
11617
11618 SDLoc SL(Op);
11619 SDValue LHS = Op.getOperand(0);
11620 SDValue RHS = Op.getOperand(1);
11621
11622 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11623
11624 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11625
11626 SDValue DenominatorScaled =
11627 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11628 SDValue NumeratorScaled =
11629 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11630
11631 // Denominator is scaled to not be denormal, so using rcp is ok.
11632 SDValue ApproxRcp =
11633 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11634 SDValue NegDivScale0 =
11635 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11636
11637 using namespace AMDGPU::Hwreg;
11638 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11639 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11640
11641 const MachineFunction &MF = DAG.getMachineFunction();
11642 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11643 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11644
11645 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11646 const bool HasDynamicDenormals =
11647 (DenormMode.Input == DenormalMode::Dynamic) ||
11648 (DenormMode.Output == DenormalMode::Dynamic);
11649
11650 SDValue SavedDenormMode;
11651
11652 if (!PreservesDenormals) {
11653 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11654 // lowering. The chain dependence is insufficient, and we need glue. We do
11655 // not need the glue variants in a strictfp function.
11656
11657 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11658
11659 SDValue Glue = DAG.getEntryNode();
11660 if (HasDynamicDenormals) {
11661 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11662 DAG.getVTList(MVT::i32, MVT::Glue),
11663 {BitField, Glue});
11664 SavedDenormMode = SDValue(GetReg, 0);
11665
11666 Glue = DAG.getMergeValues(
11667 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11668 }
11669
11670 SDNode *EnableDenorm;
11671 if (Subtarget->hasDenormModeInst()) {
11672 const SDValue EnableDenormValue =
11673 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11674
11675 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11676 EnableDenormValue)
11677 .getNode();
11678 } else {
11679 const SDValue EnableDenormValue =
11680 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11681 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11682 {EnableDenormValue, BitField, Glue});
11683 }
11684
11685 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11686 SDValue(EnableDenorm, 1)};
11687
11688 NegDivScale0 = DAG.getMergeValues(Ops, SL);
11689 }
11690
11691 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11692 ApproxRcp, One, NegDivScale0, Flags);
11693
11694 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11695 ApproxRcp, Fma0, Flags);
11696
11697 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11698 Fma1, Flags);
11699
11700 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11701 NumeratorScaled, Mul, Flags);
11702
11703 SDValue Fma3 =
11704 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11705
11706 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11707 NumeratorScaled, Fma3, Flags);
11708
11709 if (!PreservesDenormals) {
11710 SDNode *DisableDenorm;
11711 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11712 const SDValue DisableDenormValue = getSPDenormModeValue(
11713 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11714
11715 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11716 DisableDenorm =
11717 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
11718 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
11719 .getNode();
11720 } else {
11721 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11722 const SDValue DisableDenormValue =
11723 HasDynamicDenormals
11724 ? SavedDenormMode
11725 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11726
11727 DisableDenorm = DAG.getMachineNode(
11728 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11729 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11730 }
11731
11732 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11733 SDValue(DisableDenorm, 0), DAG.getRoot());
11734 DAG.setRoot(OutputChain);
11735 }
11736
11737 SDValue Scale = NumeratorScaled.getValue(1);
11738 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11739 {Fma4, Fma1, Fma3, Scale}, Flags);
11740
11741 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11742}
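// For illustration: the numeric core of the sequence above as scalar C++
// (assuming <cmath>; the helper name is made up). 1.0f / den stands in for
// the V_RCP_F32 estimate, and the div_scale/div_fmas/div_fixup bookkeeping
// (operand scaling, denormal-mode toggling, edge cases) is omitted, so this
// is a sketch of the refinement math only.
static inline float fdiv32_core_sketch(float num, float den) {
  float r  = 1.0f / den;                  // ApproxRcp
  float e0 = std::fmaf(-den, r, 1.0f);    // Fma0: 1 - den * r
  r        = std::fmaf(e0, r, r);         // Fma1: refined reciprocal
  float q  = num * r;                     // Mul:  initial quotient
  float e1 = std::fmaf(-den, q, num);     // Fma2: num - den * q
  q        = std::fmaf(e1, r, q);         // Fma3: refined quotient
  float e2 = std::fmaf(-den, q, num);     // Fma4: remaining residual
  return std::fmaf(e2, r, q);             // div_fmas-style final correction
}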
11743
11744SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11745 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11746 return FastLowered;
11747
11748 SDLoc SL(Op);
11749 SDValue X = Op.getOperand(0);
11750 SDValue Y = Op.getOperand(1);
11751
11752 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11753
11754 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11755
11756 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11757
11758 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11759
11760 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11761
11762 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11763
11764 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11765
11766 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11767
11768 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11769
11770 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11771 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11772
11773 SDValue Fma4 =
11774 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11775
11776 SDValue Scale;
11777
11778 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11779 // Work around a hardware bug on SI where the condition output from div_scale
11780 // is not usable.
11781
11782 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11783
11784 // Figure out which scale to use for div_fmas.
11785 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11786 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11787 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11788 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11789
11790 SDValue NumHi =
11791 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11792 SDValue DenHi =
11793 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11794
11795 SDValue Scale0Hi =
11796 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11797 SDValue Scale1Hi =
11798 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11799
11800 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11801 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11802 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11803 } else {
11804 Scale = DivScale1.getValue(1);
11805 }
11806
11807 SDValue Fmas =
11808 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11809
11810 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11811}
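// For illustration: the "high dword of an f64" idiom used by the workaround
// above, written as scalar C++ (assuming <cstring> and <cstdint>; the helper
// name is made up). Comparing the high dwords of X and Y with those of the
// div_scale results is how the code above reconstructs the condition bit that
// the hardware fails to produce on those subtargets.
static inline std::uint32_t f64_high_dword_sketch(double v) {
  std::uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  return static_cast<std::uint32_t>(bits >> 32); // element 1 of the v2i32 view
}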
11812
11813SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11814 EVT VT = Op.getValueType();
11815
11816 if (VT == MVT::f32)
11817 return LowerFDIV32(Op, DAG);
11818
11819 if (VT == MVT::f64)
11820 return LowerFDIV64(Op, DAG);
11821
11822 if (VT == MVT::f16 || VT == MVT::bf16)
11823 return LowerFDIV16(Op, DAG);
11824
11825 llvm_unreachable("Unexpected type for fdiv");
11826}
11827
11828SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11829 SDLoc dl(Op);
11830 SDValue Val = Op.getOperand(0);
11831 EVT VT = Val.getValueType();
11832 EVT ResultExpVT = Op->getValueType(1);
11833 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11834
11835 SDValue Mant = DAG.getNode(
11836 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11837 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11838
11839 SDValue Exp = DAG.getNode(
11840 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11841 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11842
11843 if (Subtarget->hasFractBug()) {
11844 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11845 SDValue Inf =
11847
11848 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11849 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11850 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11851 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11852 }
11853
11854 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11855 return DAG.getMergeValues({Mant, CastExp}, dl);
11856}
11857
11858SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11859 SDLoc DL(Op);
11860 StoreSDNode *Store = cast<StoreSDNode>(Op);
11861 EVT VT = Store->getMemoryVT();
11862
11863 if (VT == MVT::i1) {
11864 return DAG.getTruncStore(
11865 Store->getChain(), DL,
11866 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11867 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11868 }
11869
11870 assert(VT.isVector() &&
11871 Store->getValue().getValueType().getScalarType() == MVT::i32);
11872
11873 unsigned AS = Store->getAddressSpace();
11874 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11875 Store->getAlign().value() < VT.getStoreSize() &&
11876 VT.getSizeInBits() > 32) {
11877 return SplitVectorStore(Op, DAG);
11878 }
11879
11882 // If there is a possibility that flat instructions access scratch memory
11883 // then we need to use the same legalization rules we use for private.
11884 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11886 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11889
11890 unsigned NumElements = VT.getVectorNumElements();
11891 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11892 if (NumElements > 4)
11893 return SplitVectorStore(Op, DAG);
11894 // v3 stores not supported on SI.
11895 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11896 return SplitVectorStore(Op, DAG);
11897
11899 VT, *Store->getMemOperand()))
11900 return expandUnalignedStore(Store, DAG);
11901
11902 return SDValue();
11903 }
11904 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11905 switch (Subtarget->getMaxPrivateElementSize()) {
11906 case 4:
11907 return scalarizeVectorStore(Store, DAG);
11908 case 8:
11909 if (NumElements > 2)
11910 return SplitVectorStore(Op, DAG);
11911 return SDValue();
11912 case 16:
11913 if (NumElements > 4 ||
11914 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11915 return SplitVectorStore(Op, DAG);
11916 return SDValue();
11917 default:
11918 llvm_unreachable("unsupported private_element_size");
11919 }
11920 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11921 unsigned Fast = 0;
11922 auto Flags = Store->getMemOperand()->getFlags();
11924 Store->getAlign(), Flags, &Fast) &&
11925 Fast > 1)
11926 return SDValue();
11927
11928 if (VT.isVector())
11929 return SplitVectorStore(Op, DAG);
11930
11931 return expandUnalignedStore(Store, DAG);
11932 }
11933
11934 // Probably an invalid store. If so we'll end up emitting a selection error.
11935 return SDValue();
11936}
11937
11938// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11939SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11940 SDLoc SL(Op);
11941 assert(!Subtarget->has16BitInsts());
11942 SDNodeFlags Flags = Op->getFlags();
11943 SDValue Ext =
11944 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11945
11946 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11947 SDValue Sqrt =
11948 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11949
11950 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11951 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11952}
11953
11954SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11955 SDLoc DL(Op);
11956 SDNodeFlags Flags = Op->getFlags();
11957 MVT VT = Op.getValueType().getSimpleVT();
11958 const SDValue X = Op.getOperand(0);
11959
11960 if (allowApproxFunc(DAG, Flags)) {
11961 // Instruction is 1ulp but ignores denormals.
11962 return DAG.getNode(
11963 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11964 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11965 }
11966
11967 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11968 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11969
11970 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11971
11972 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11973
11974 SDValue SqrtX =
11975 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11976
11977 SDValue SqrtS;
11978 if (needsDenormHandlingF32(DAG, X, Flags)) {
11979 SDValue SqrtID =
11980 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11981 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11982
11983 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11984 SDValue SqrtSNextDownInt =
11985 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11986 DAG.getAllOnesConstant(DL, MVT::i32));
11987 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11988
11989 SDValue NegSqrtSNextDown =
11990 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11991
11992 SDValue SqrtVP =
11993 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11994
11995 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11996 DAG.getConstant(1, DL, MVT::i32));
11997 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11998
11999 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12000 SDValue SqrtVS =
12001 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12002
12003 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12004 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12005
12006 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12007 Flags);
12008
12009 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12010 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12011 Flags);
12012 } else {
12013 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12014
12015 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12016
12017 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12018 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12019 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12020
12021 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12022 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12023 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12024
12025 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12026 SDValue SqrtD =
12027 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12028 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12029 }
12030
12031 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12032
12033 SDValue ScaledDown =
12034 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12035
12036 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12037 SDValue IsZeroOrInf =
12038 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12039 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12040
12041 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12042}
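// For illustration: the scaling wrapper above as scalar C++ (assuming <cmath>;
// the helper name is made up). std::sqrt stands in for the V_SQRT_F32
// estimate, and the +/-1 ulp neighbor correction and the zero/inf select are
// omitted.
static inline float sqrtf32_scaled_sketch(float x) {
  bool needScale = x < 0x1.0p-96f;            // input too small to handle directly
  float sx = needScale ? x * 0x1.0p+32f : x;  // scale the input up by 2^32
  float s = std::sqrt(sx);                    // hardware sqrt estimate
  return needScale ? s * 0x1.0p-16f : s;      // undo: sqrt(2^32) == 2^16
}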
12043
12044SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12045 // For double type, the SQRT and RSQ instructions don't have required
12046 // precision, we apply Goldschmidt's algorithm to improve the result:
12047 //
12048 // y0 = rsq(x)
12049 // g0 = x * y0
12050 // h0 = 0.5 * y0
12051 //
12052 // r0 = 0.5 - h0 * g0
12053 // g1 = g0 * r0 + g0
12054 // h1 = h0 * r0 + h0
12055 //
12056 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12057 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12058 // h2 = h1 * r1 + h1
12059 //
12060 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12061 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12062 //
12063 // sqrt(x) = g3
12064
12065 SDNodeFlags Flags = Op->getFlags();
12066
12067 SDLoc DL(Op);
12068
12069 SDValue X = Op.getOperand(0);
12070 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12071
12072 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12073
12074 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12075
12076 // Scale up input if it is too small.
12077 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12078 SDValue ScaleUp =
12079 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12080 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12081
12082 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12083
12084 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12085
12086 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12087 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12088
12089 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12090 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12091
12092 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12093
12094 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12095
12096 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12097 SDValue SqrtD0 =
12098 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12099
12100 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12101
12102 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12103 SDValue SqrtD1 =
12104 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12105
12106 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12107
12108 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12109 SDValue ScaleDown =
12110 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12111 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12112
12113 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12114 // with finite only or nsz because rsq(+/-0) = +/-inf
12115
12116 // TODO: Check for DAZ and expand to subnormals
12117 SDValue IsZeroOrInf =
12118 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12119 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12120
12121 // If x is +INF, +0, or -0, use its original value
12122 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12123 Flags);
12124}
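// For illustration: the Goldschmidt recurrence from the comment above, written
// as plain double-precision C++ (assuming <cmath>; the helper name is made
// up). 1.0 / std::sqrt(x) stands in for the V_RSQ_F64 estimate, and the
// ldexp-based scaling and the zero/inf select are omitted.
static inline double sqrt_f64_goldschmidt_sketch(double x) {
  double y0 = 1.0 / std::sqrt(x);       // rsq estimate
  double g = x * y0;                    // g0 ~= sqrt(x)
  double h = 0.5 * y0;                  // h0 ~= 0.5 / sqrt(x)
  double r = std::fma(-h, g, 0.5);      // r0 = 0.5 - h0 * g0
  double g1 = std::fma(g, r, g);        // g1 = g0 * r0 + g0
  double h1 = std::fma(h, r, h);        // h1 = h0 * r0 + h0
  double d0 = std::fma(-g1, g1, x);     // d0 = x - g1 * g1
  double g2 = std::fma(d0, h1, g1);     // g2 = d0 * h1 + g1
  double d1 = std::fma(-g2, g2, x);     // d1 = x - g2 * g2
  return std::fma(d1, h1, g2);          // g3 = d1 * h1 + g2 ~= sqrt(x)
}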
12125
12126SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12127 SDLoc DL(Op);
12128 EVT VT = Op.getValueType();
12129 SDValue Arg = Op.getOperand(0);
12130 SDValue TrigVal;
12131
12132 // Propagate fast-math flags so that the multiply we introduce can be folded
12133 // if Arg is already the result of a multiply by constant.
12134 auto Flags = Op->getFlags();
12135
12136 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12137
12138 if (Subtarget->hasTrigReducedRange()) {
12139 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12140 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12141 } else {
12142 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12143 }
12144
12145 switch (Op.getOpcode()) {
12146 case ISD::FCOS:
12147 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12148 case ISD::FSIN:
12149 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12150 default:
12151 llvm_unreachable("Wrong trig opcode");
12152 }
12153}
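// For illustration: the same range reduction as scalar C++ (assuming <cmath>;
// the helper name is made up). std::sin(2*pi*t) stands in for V_SIN_F32,
// which consumes its operand in revolutions (units of 2*pi) rather than
// radians; on subtargets with a reduced trig range the fract keeps the
// operand in [0, 1).
static inline float sin_via_hw_sketch(float x, bool TrigReducedRange) {
  const float OneOver2Pi = 0.15915494f;     // 0.5 * (1 / pi)
  float t = x * OneOver2Pi;                 // radians -> revolutions
  if (TrigReducedRange)
    t -= std::floor(t);                     // AMDGPUISD::FRACT equivalent
  return std::sin(t * 6.2831853f);          // stand-in for SIN_HW(t)
}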
12154
12155SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12156 SelectionDAG &DAG) const {
12157 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12158 assert(AtomicNode->isCompareAndSwap());
12159 unsigned AS = AtomicNode->getAddressSpace();
12160
12161 // No custom lowering required for local address space
12163 return Op;
12164
12165 // Non-local address space requires custom lowering for atomic compare
12166 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12167 SDLoc DL(Op);
12168 SDValue ChainIn = Op.getOperand(0);
12169 SDValue Addr = Op.getOperand(1);
12170 SDValue Old = Op.getOperand(2);
12171 SDValue New = Op.getOperand(3);
12172 EVT VT = Op.getValueType();
12173 MVT SimpleVT = VT.getSimpleVT();
12174 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12175
12176 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12177 SDValue Ops[] = {ChainIn, Addr, NewOld};
12178
12179 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12180 Op->getVTList(), Ops, VT,
12181 AtomicNode->getMemOperand());
12182}
12183
12184//===----------------------------------------------------------------------===//
12185// Custom DAG optimizations
12186//===----------------------------------------------------------------------===//
12187
12188SDValue
12189SITargetLowering::performUCharToFloatCombine(SDNode *N,
12190 DAGCombinerInfo &DCI) const {
12191 EVT VT = N->getValueType(0);
12192 EVT ScalarVT = VT.getScalarType();
12193 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12194 return SDValue();
12195
12196 SelectionDAG &DAG = DCI.DAG;
12197 SDLoc DL(N);
12198
12199 SDValue Src = N->getOperand(0);
12200 EVT SrcVT = Src.getValueType();
12201
12202 // TODO: We could try to match extracting the higher bytes, which would be
12203 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12204 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12205 // about in practice.
12206 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12207 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12208 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12209 DCI.AddToWorklist(Cvt.getNode());
12210
12211 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12212 if (ScalarVT != MVT::f32) {
12213 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12214 DAG.getTargetConstant(0, DL, MVT::i32));
12215 }
12216 return Cvt;
12217 }
12218 }
12219
12220 return SDValue();
12221}
12222
12223SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12224 DAGCombinerInfo &DCI) const {
12225 SDValue MagnitudeOp = N->getOperand(0);
12226 SDValue SignOp = N->getOperand(1);
12227
12228 // The generic combine for fcopysign + fp cast is too conservative with
12229 // vectors, and also gets confused by the splitting we will perform here, so
12230 // peek through FP casts.
12231 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12232 SignOp.getOpcode() == ISD::FP_ROUND)
12233 SignOp = SignOp.getOperand(0);
12234
12235 SelectionDAG &DAG = DCI.DAG;
12236 SDLoc DL(N);
12237 EVT SignVT = SignOp.getValueType();
12238
12239 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12240 // lower half with a copy.
12241 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12242 EVT MagVT = MagnitudeOp.getValueType();
12243
12244 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12245
12246 if (MagVT.getScalarType() == MVT::f64) {
12247 EVT F32VT = MagVT.isVector()
12248 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12249 : MVT::v2f32;
12250
12251 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12252
12253 SmallVector<SDValue, 8> NewElts;
12254 for (unsigned I = 0; I != NumElts; ++I) {
12255 SDValue MagLo =
12256 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12257 DAG.getConstant(2 * I, DL, MVT::i32));
12258 SDValue MagHi =
12259 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12260 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12261
12262 SDValue SignOpElt =
12263 MagVT.isVector()
12265 SignOp, DAG.getConstant(I, DL, MVT::i32))
12266 : SignOp;
12267
12268 SDValue HiOp =
12269 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12270
12271 SDValue Vector =
12272 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12273
12274 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12275 NewElts.push_back(NewElt);
12276 }
12277
12278 if (NewElts.size() == 1)
12279 return NewElts[0];
12280
12281 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12282 }
12283
12284 if (SignVT.getScalarType() != MVT::f64)
12285 return SDValue();
12286
12287 // Reduce the width of the sign operand; we only need the highest bit.
12288 //
12289 // fcopysign f64:x, f64:y ->
12290 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12291 // TODO: In some cases it might make sense to go all the way to f16.
12292
12293 EVT F32VT = MagVT.isVector()
12294 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12295 : MVT::v2f32;
12296
12297 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12298
12299 SmallVector<SDValue, 8> F32Signs;
12300 for (unsigned I = 0; I != NumElts; ++I) {
12301 // Take sign from odd elements of cast vector
12302 SDValue SignAsF32 =
12303 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12304 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12305 F32Signs.push_back(SignAsF32);
12306 }
12307
12308 SDValue NewSign =
12309 NumElts == 1
12310 ? F32Signs.back()
12311 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12312 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12313 F32Signs);
12314
12315 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12316 NewSign);
12317}
12318
12319// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12320// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12321// bits
12322
12323// This is a variant of
12324// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12325//
12326// The normal DAG combiner will do this, but only if the add has one use since
12327// that would increase the number of instructions.
12328//
12329// This prevents us from seeing a constant offset that can be folded into a
12330// memory instruction's addressing mode. If we know the resulting add offset of
12331// a pointer can be folded into an addressing offset, we can replace the pointer
12332// operand with the add of new constant offset. This eliminates one of the uses,
12333// and may allow the remaining use to also be simplified.
12334//
12335SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12336 EVT MemVT,
12337 DAGCombinerInfo &DCI) const {
12338 SDValue N0 = N->getOperand(0);
12339 SDValue N1 = N->getOperand(1);
12340
12341 // We only do this to handle cases where it's profitable when there are
12342 // multiple uses of the add, so defer to the standard combine.
12343 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12344 N0->hasOneUse())
12345 return SDValue();
12346
12347 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12348 if (!CN1)
12349 return SDValue();
12350
12351 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12352 if (!CAdd)
12353 return SDValue();
12354
12355 SelectionDAG &DAG = DCI.DAG;
12356
12357 if (N0->getOpcode() == ISD::OR &&
12358 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12359 return SDValue();
12360
12361 // If the resulting offset is too large, we can't fold it into the
12362 // addressing mode offset.
12363 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12364 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12365
12366 AddrMode AM;
12367 AM.HasBaseReg = true;
12368 AM.BaseOffs = Offset.getSExtValue();
12369 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12370 return SDValue();
12371
12372 SDLoc SL(N);
12373 EVT VT = N->getValueType(0);
12374
12375 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12376 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12377
12378 SDNodeFlags Flags;
12379 Flags.setNoUnsignedWrap(
12380 N->getFlags().hasNoUnsignedWrap() &&
12381 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12382
12383 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12384}
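// A worked example of the combine above: an address computed as
// (shl (add x, 16), 2) becomes (add (shl x, 2), 64). If isLegalAddressingMode
// accepts a 64-byte offset for the address space in question, the constant
// can then be folded into the memory instruction's immediate offset field
// (e.g. an offset:64 on a DS or buffer access), even though the original add
// had multiple uses.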
12385
12386 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12387 /// is offset by the chain and intrinsic ID. Theoretically we would also need to check the
12388/// specific intrinsic, but they all place the pointer operand first.
12389static unsigned getBasePtrIndex(const MemSDNode *N) {
12390 switch (N->getOpcode()) {
12391 case ISD::STORE:
12392 case ISD::INTRINSIC_W_CHAIN:
12393 case ISD::INTRINSIC_VOID:
12394 return 2;
12395 default:
12396 return 1;
12397 }
12398}
12399
12400SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12401 DAGCombinerInfo &DCI) const {
12402 SelectionDAG &DAG = DCI.DAG;
12403
12404 unsigned PtrIdx = getBasePtrIndex(N);
12405 SDValue Ptr = N->getOperand(PtrIdx);
12406
12407 // TODO: We could also do this for multiplies.
12408 if (Ptr.getOpcode() == ISD::SHL) {
12409 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12410 N->getMemoryVT(), DCI);
12411 if (NewPtr) {
12412 SmallVector<SDValue, 8> NewOps(N->ops());
12413
12414 NewOps[PtrIdx] = NewPtr;
12415 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12416 }
12417 }
12418
12419 return SDValue();
12420}
12421
12422static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12423 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12424 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12425 (Opc == ISD::XOR && Val == 0);
12426}
12427
12428// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
12429// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12430// integer combine opportunities since most 64-bit operations are decomposed
12431// this way. TODO: We won't want this for SALU especially if it is an inline
12432// immediate.
12433SDValue SITargetLowering::splitBinaryBitConstantOp(
12434 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12435 const ConstantSDNode *CRHS) const {
12436 uint64_t Val = CRHS->getZExtValue();
12437 uint32_t ValLo = Lo_32(Val);
12438 uint32_t ValHi = Hi_32(Val);
12439 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12440
12441 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12442 bitOpWithConstantIsReducible(Opc, ValHi)) ||
12443 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12444 // We have 64-bit scalar and/or/xor, but do not have vector forms.
12445 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12446 !CRHS->user_begin()->isDivergent())
12447 return SDValue();
12448
12449 // If we need to materialize a 64-bit immediate, it will be split up later
12450 // anyway. Avoid creating the harder to understand 64-bit immediate
12451 // materialization.
12452 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12453 }
12454
12455 return SDValue();
12456}
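// A worked example of the split above: (and i64:x, 0xffffffff00000000) has a
// reducible low half (AND with 0 gives 0), so it is rewritten as a pair of
// 32-bit operations,
//   lo = and x.lo, 0x00000000  ->  0
//   hi = and x.hi, 0xffffffff  ->  x.hi
// which matches how the VALU would have to decompose the 64-bit op anyway and
// exposes further 32-bit combines.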
12457
12458 bool llvm::isBoolSGPR(SDValue V) {
12459 if (V.getValueType() != MVT::i1)
12460 return false;
12461 switch (V.getOpcode()) {
12462 default:
12463 break;
12464 case ISD::SETCC:
12465 case ISD::IS_FPCLASS:
12466 case AMDGPUISD::FP_CLASS:
12467 return true;
12468 case ISD::AND:
12469 case ISD::OR:
12470 case ISD::XOR:
12471 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12472 case ISD::SADDO:
12473 case ISD::UADDO:
12474 case ISD::SSUBO:
12475 case ISD::USUBO:
12476 case ISD::SMULO:
12477 case ISD::UMULO:
12478 return V.getResNo() == 1;
12479 case ISD::INTRINSIC_WO_CHAIN: {
12480 unsigned IntrinsicID = V.getConstantOperandVal(0);
12481 switch (IntrinsicID) {
12482 case Intrinsic::amdgcn_is_shared:
12483 case Intrinsic::amdgcn_is_private:
12484 return true;
12485 default:
12486 return false;
12487 }
12488
12489 return false;
12490 }
12491 }
12492 return false;
12493}
12494
12495// If a constant has all zeroes or all ones within each byte return it.
12496// Otherwise return 0.
12497 static uint32_t getConstantPermuteMask(uint32_t C) {
12498 // 0xff for any zero byte in the mask
12499 uint32_t ZeroByteMask = 0;
12500 if (!(C & 0x000000ff))
12501 ZeroByteMask |= 0x000000ff;
12502 if (!(C & 0x0000ff00))
12503 ZeroByteMask |= 0x0000ff00;
12504 if (!(C & 0x00ff0000))
12505 ZeroByteMask |= 0x00ff0000;
12506 if (!(C & 0xff000000))
12507 ZeroByteMask |= 0xff000000;
12508 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12509 if ((NonZeroByteMask & C) != NonZeroByteMask)
12510 return 0; // Partial bytes selected.
12511 return C;
12512}
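// Worked examples for the helper above: C = 0x00ff00ff has every byte either
// all-zeros or all-ones, so it is returned as-is; C = 0x00f0ffff only
// partially selects byte 2 (0xf0), so the helper returns 0.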
12513
12514// Check if a node selects whole bytes from its operand 0 starting at a byte
12515 // boundary while masking the rest. Returns the select mask as used by
12516 // v_perm_b32, or -1 if it does not.
12517// Note byte select encoding:
12518// value 0-3 selects corresponding source byte;
12519// value 0xc selects zero;
12520// value 0xff selects 0xff.
12521 static uint32_t getPermuteMask(SDValue V) {
12522 assert(V.getValueSizeInBits() == 32);
12523
12524 if (V.getNumOperands() != 2)
12525 return ~0;
12526
12527 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12528 if (!N1)
12529 return ~0;
12530
12531 uint32_t C = N1->getZExtValue();
12532
12533 switch (V.getOpcode()) {
12534 default:
12535 break;
12536 case ISD::AND:
12537 if (uint32_t ConstMask = getConstantPermuteMask(C))
12538 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12539 break;
12540
12541 case ISD::OR:
12542 if (uint32_t ConstMask = getConstantPermuteMask(C))
12543 return (0x03020100 & ~ConstMask) | ConstMask;
12544 break;
12545
12546 case ISD::SHL:
12547 if (C % 8)
12548 return ~0;
12549
12550 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12551
12552 case ISD::SRL:
12553 if (C % 8)
12554 return ~0;
12555
12556 return uint32_t(0x0c0c0c0c03020100ull >> C);
12557 }
12558
12559 return ~0;
12560}
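// Worked examples for the helper above (byte selects are little-endian and
// 0x0c means "pick zero"):
//   (and x, 0x0000ffff) -> 0x0c0c0100 : bytes 0-1 from x, bytes 2-3 are zero
//   (shl x, 16)         -> 0x01000c0c : low bytes zero, x's bytes 0-1 move up
//   (srl x, 8)          -> 0x0c030201 : bytes 1-3 of x shift down, top is zero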
12561
12562SDValue SITargetLowering::performAndCombine(SDNode *N,
12563 DAGCombinerInfo &DCI) const {
12564 if (DCI.isBeforeLegalize())
12565 return SDValue();
12566
12567 SelectionDAG &DAG = DCI.DAG;
12568 EVT VT = N->getValueType(0);
12569 SDValue LHS = N->getOperand(0);
12570 SDValue RHS = N->getOperand(1);
12571
12572 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12573 if (VT == MVT::i64 && CRHS) {
12574 if (SDValue Split =
12575 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12576 return Split;
12577 }
12578
12579 if (CRHS && VT == MVT::i32) {
12580 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12581 // nb = number of trailing zeroes in mask
12582 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12583 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
12584 uint64_t Mask = CRHS->getZExtValue();
12585 unsigned Bits = llvm::popcount(Mask);
12586 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12587 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12588 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12589 unsigned Shift = CShift->getZExtValue();
12590 unsigned NB = CRHS->getAPIntValue().countr_zero();
12591 unsigned Offset = NB + Shift;
12592 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12593 SDLoc SL(N);
12594 SDValue BFE =
12595 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12596 DAG.getConstant(Offset, SL, MVT::i32),
12597 DAG.getConstant(Bits, SL, MVT::i32));
12598 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12599 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12600 DAG.getValueType(NarrowVT));
12601 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12602 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12603 return Shl;
12604 }
12605 }
12606 }
12607
12608 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12609 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12610 isa<ConstantSDNode>(LHS.getOperand(2))) {
12611 uint32_t Sel = getConstantPermuteMask(Mask);
12612 if (!Sel)
12613 return SDValue();
12614
12615 // Select 0xc for all zero bytes
12616 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12617 SDLoc DL(N);
12618 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12619 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12620 }
12621 }
12622
12623 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12624 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12625 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12626 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12627 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12628
12629 SDValue X = LHS.getOperand(0);
12630 SDValue Y = RHS.getOperand(0);
12631 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12632 !isTypeLegal(X.getValueType()))
12633 return SDValue();
12634
12635 if (LCC == ISD::SETO) {
12636 if (X != LHS.getOperand(1))
12637 return SDValue();
12638
12639 if (RCC == ISD::SETUNE) {
12640 const ConstantFPSDNode *C1 =
12641 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12642 if (!C1 || !C1->isInfinity() || C1->isNegative())
12643 return SDValue();
12644
12649
12650 static_assert(
12653 0x3ff) == Mask,
12654 "mask not equal");
12655
12656 SDLoc DL(N);
12657 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
12658 DAG.getConstant(Mask, DL, MVT::i32));
12659 }
12660 }
12661 }
12662
12663 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12664 std::swap(LHS, RHS);
12665
12666 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12667 RHS.hasOneUse()) {
12668 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12669 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12670 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12671 // | n_nan)
12672 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12673 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12674 (RHS.getOperand(0) == LHS.getOperand(0) &&
12675 LHS.getOperand(0) == LHS.getOperand(1))) {
12676 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12677 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12678 : Mask->getZExtValue() & OrdMask;
12679
12680 SDLoc DL(N);
12681 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
12682 DAG.getConstant(NewMask, DL, MVT::i32));
12683 }
12684 }
12685
12686 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12687 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12688 // and x, (sext cc from i1) => select cc, x, 0
12689 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12690 std::swap(LHS, RHS);
12691 if (isBoolSGPR(RHS.getOperand(0)))
12692 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
12693 DAG.getConstant(0, SDLoc(N), MVT::i32));
12694 }
12695
12696 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12697 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12698 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12699 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12700 uint32_t LHSMask = getPermuteMask(LHS);
12701 uint32_t RHSMask = getPermuteMask(RHS);
12702 if (LHSMask != ~0u && RHSMask != ~0u) {
12703 // Canonicalize the expression in an attempt to have fewer unique masks
12704 // and therefore fewer registers used to hold the masks.
12705 if (LHSMask > RHSMask) {
12706 std::swap(LHSMask, RHSMask);
12707 std::swap(LHS, RHS);
12708 }
12709
12710 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12711 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12712 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12713 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12714
12715 // Check if we need to combine values from two sources within a byte.
12716 if (!(LHSUsedLanes & RHSUsedLanes) &&
12717 // If we select high and lower word keep it for SDWA.
12718 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12719 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12720 // Each byte in each mask is either a selector mask 0-3, or has higher
12721 // bits set in either of the masks, which can be 0xff for 0xff or 0x0c for
12722 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise the
12723 // mask which is not 0xff wins. By ANDing both masks we get a correct
12724 // result, except that 0x0c must be corrected to give 0x0c only.
12725 uint32_t Mask = LHSMask & RHSMask;
12726 for (unsigned I = 0; I < 32; I += 8) {
12727 uint32_t ByteSel = 0xff << I;
12728 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12729 Mask &= (0x0c << I) & 0xffffffff;
12730 }
12731
12732 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12733 // or 0x0c.
12734 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12735 SDLoc DL(N);
12736
12737 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12738 RHS.getOperand(0),
12739 DAG.getConstant(Sel, DL, MVT::i32));
12740 }
12741 }
12742 }
12743
12744 return SDValue();
12745}
12746
12747// A key component of v_perm is a mapping between byte position of the src
12748// operands, and the byte position of the dest. To provide such, we need: 1. the
12749 // node that provides byte x of the dest of the OR, and 2. the byte of the node
12750// used to provide that x byte. calculateByteProvider finds which node provides
12751// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
12752 // and finds an ultimate src and byte position. For example: the supported
12753// LoadCombine pattern for vector loads is as follows
12754// t1
12755// or
12756// / \
12757// t2 t3
12758// zext shl
12759// | | \
12760// t4 t5 16
12761// or anyext
12762// / \ |
12763// t6 t7 t8
12764// srl shl or
12765// / | / \ / \
12766// t9 t10 t11 t12 t13 t14
12767// trunc* 8 trunc* 8 and and
12768// | | / | | \
12769// t15 t16 t17 t18 t19 t20
12770// trunc* 255 srl -256
12771// | / \
12772// t15 t15 16
12773//
12774// *In this example, the truncs are from i32->i16
12775//
12776// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12777// respectively. calculateSrcByte would find (given node) -> ultimate src &
12778// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12779// After finding the mapping, we can combine the tree into vperm t15, t16,
12780// 0x05000407
12781
12782// Find the source and byte position from a node.
12783// \p DestByte is the byte position of the dest of the or that the src
12784// ultimately provides. \p SrcIndex is the byte of the src that maps to this
12785// dest of the or byte. \p Depth tracks how many recursive iterations we have
12786// performed.
12787static const std::optional<ByteProvider<SDValue>>
12788calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12789 unsigned Depth = 0) {
12790 // We may need to recursively traverse a series of SRLs
12791 if (Depth >= 6)
12792 return std::nullopt;
12793
12794 if (Op.getValueSizeInBits() < 8)
12795 return std::nullopt;
12796
12797 if (Op.getValueType().isVector())
12798 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12799
12800 switch (Op->getOpcode()) {
12801 case ISD::TRUNCATE: {
12802 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12803 }
12804
12805 case ISD::SIGN_EXTEND:
12806 case ISD::ZERO_EXTEND:
12807 case ISD::SIGN_EXTEND_INREG: {
12808 SDValue NarrowOp = Op->getOperand(0);
12809 auto NarrowVT = NarrowOp.getValueType();
12810 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12811 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12812 NarrowVT = VTSign->getVT();
12813 }
12814 if (!NarrowVT.isByteSized())
12815 return std::nullopt;
12816 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12817
12818 if (SrcIndex >= NarrowByteWidth)
12819 return std::nullopt;
12820 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12821 }
12822
12823 case ISD::SRA:
12824 case ISD::SRL: {
12825 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12826 if (!ShiftOp)
12827 return std::nullopt;
12828
12829 uint64_t BitShift = ShiftOp->getZExtValue();
12830
12831 if (BitShift % 8 != 0)
12832 return std::nullopt;
12833
12834 SrcIndex += BitShift / 8;
12835
12836 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12837 }
12838
12839 default: {
12840 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12841 }
12842 }
12843 llvm_unreachable("fully handled switch");
12844}
12845
12846// For a byte position in the result of an Or, traverse the tree and find the
12847// node (and the byte of the node) which ultimately provides this {Or,
12848// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12849// the byte position of the Op that corresponds with the originally requested
12850// byte of the Or \p Depth tracks how many recursive iterations we have
12851// performed. \p StartingIndex is the originally requested byte of the Or
12852static const std::optional<ByteProvider<SDValue>>
12853calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12854 unsigned StartingIndex = 0) {
12855 // Finding Src tree of RHS of or typically requires at least 1 additional
12856 // depth
12857 if (Depth > 6)
12858 return std::nullopt;
12859
12860 unsigned BitWidth = Op.getScalarValueSizeInBits();
12861 if (BitWidth % 8 != 0)
12862 return std::nullopt;
12863 if (Index > BitWidth / 8 - 1)
12864 return std::nullopt;
12865
12866 bool IsVec = Op.getValueType().isVector();
12867 switch (Op.getOpcode()) {
12868 case ISD::OR: {
12869 if (IsVec)
12870 return std::nullopt;
12871
12872 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12873 StartingIndex);
12874 if (!RHS)
12875 return std::nullopt;
12876 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12877 StartingIndex);
12878 if (!LHS)
12879 return std::nullopt;
12880 // A well formed Or will have two ByteProviders for each byte, one of which
12881 // is constant zero
12882 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12883 return std::nullopt;
12884 if (!LHS || LHS->isConstantZero())
12885 return RHS;
12886 if (!RHS || RHS->isConstantZero())
12887 return LHS;
12888 return std::nullopt;
12889 }
12890
12891 case ISD::AND: {
12892 if (IsVec)
12893 return std::nullopt;
12894
12895 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12896 if (!BitMaskOp)
12897 return std::nullopt;
12898
12899 uint32_t BitMask = BitMaskOp->getZExtValue();
12900 // Bits we expect for our StartingIndex
12901 uint32_t IndexMask = 0xFF << (Index * 8);
12902
12903 if ((IndexMask & BitMask) != IndexMask) {
12904 // If the result of the and partially provides the byte, then it
12905 // is not well formatted
12906 if (IndexMask & BitMask)
12907 return std::nullopt;
12908 return ByteProvider<SDValue>::getConstantZero();
12909 }
12910
12911 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12912 }
12913
12914 case ISD::FSHR: {
12915 if (IsVec)
12916 return std::nullopt;
12917
12918 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12919 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12920 if (!ShiftOp || Op.getValueType().isVector())
12921 return std::nullopt;
12922
12923 uint64_t BitsProvided = Op.getValueSizeInBits();
12924 if (BitsProvided % 8 != 0)
12925 return std::nullopt;
12926
12927 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12928 if (BitShift % 8)
12929 return std::nullopt;
12930
12931 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12932 uint64_t ByteShift = BitShift / 8;
12933
12934 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12935 uint64_t BytesProvided = BitsProvided / 8;
12936 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12937 NewIndex %= BytesProvided;
12938 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12939 }
12940
12941 case ISD::SRA:
12942 case ISD::SRL: {
12943 if (IsVec)
12944 return std::nullopt;
12945
12946 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12947 if (!ShiftOp)
12948 return std::nullopt;
12949
12950 uint64_t BitShift = ShiftOp->getZExtValue();
12951 if (BitShift % 8)
12952 return std::nullopt;
12953
12954 auto BitsProvided = Op.getScalarValueSizeInBits();
12955 if (BitsProvided % 8 != 0)
12956 return std::nullopt;
12957
12958 uint64_t BytesProvided = BitsProvided / 8;
12959 uint64_t ByteShift = BitShift / 8;
12960 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12961 // If the byte we are trying to provide (as tracked by index) falls in this
12962 // range, then the SRL provides the byte. The byte of interest of the src of
12963 // the SRL is Index + ByteShift
12964 return BytesProvided - ByteShift > Index
12965 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12966 Index + ByteShift)
12967 : ByteProvider<SDValue>::getConstantZero();
12968 }
12969
12970 case ISD::SHL: {
12971 if (IsVec)
12972 return std::nullopt;
12973
12974 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12975 if (!ShiftOp)
12976 return std::nullopt;
12977
12978 uint64_t BitShift = ShiftOp->getZExtValue();
12979 if (BitShift % 8 != 0)
12980 return std::nullopt;
12981 uint64_t ByteShift = BitShift / 8;
12982
12983 // If we are shifting by an amount greater than (or equal to)
12984 // the index we are trying to provide, then it provides 0s. If not,
12985 // then these bytes are not definitively 0s, and the corresponding byte
12986 // of interest is Index - ByteShift of the src
12987 return Index < ByteShift
12988 ? ByteProvider<SDValue>::getConstantZero()
12989 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12990 Depth + 1, StartingIndex);
12991 }
12992 case ISD::ANY_EXTEND:
12993 case ISD::SIGN_EXTEND:
12994 case ISD::ZERO_EXTEND:
12995 case ISD::SIGN_EXTEND_INREG:
12996 case ISD::AssertZext:
12997 case ISD::AssertSext: {
12998 if (IsVec)
12999 return std::nullopt;
13000
13001 SDValue NarrowOp = Op->getOperand(0);
13002 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13003 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13004 Op->getOpcode() == ISD::AssertZext ||
13005 Op->getOpcode() == ISD::AssertSext) {
13006 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13007 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13008 }
13009 if (NarrowBitWidth % 8 != 0)
13010 return std::nullopt;
13011 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13012
13013 if (Index >= NarrowByteWidth)
13014 return Op.getOpcode() == ISD::ZERO_EXTEND
13015 ? std::optional<ByteProvider<SDValue>>(
13016 ByteProvider<SDValue>::getConstantZero())
13017 : std::nullopt;
13018 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13019 }
13020
13021 case ISD::TRUNCATE: {
13022 if (IsVec)
13023 return std::nullopt;
13024
13025 uint64_t NarrowByteWidth = BitWidth / 8;
13026
13027 if (NarrowByteWidth >= Index) {
13028 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13029 StartingIndex);
13030 }
13031
13032 return std::nullopt;
13033 }
13034
13035 case ISD::CopyFromReg: {
13036 if (BitWidth / 8 > Index)
13037 return calculateSrcByte(Op, StartingIndex, Index);
13038
13039 return std::nullopt;
13040 }
13041
13042 case ISD::LOAD: {
13043 auto *L = cast<LoadSDNode>(Op.getNode());
13044
13045 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13046 if (NarrowBitWidth % 8 != 0)
13047 return std::nullopt;
13048 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13049
13050 // If the width of the load does not reach the byte we are trying to provide,
13051 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13052 // question
13053 if (Index >= NarrowByteWidth) {
13054 return L->getExtensionType() == ISD::ZEXTLOAD
13055 ? std::optional<ByteProvider<SDValue>>(
13056 ByteProvider<SDValue>::getConstantZero())
13057 : std::nullopt;
13058 }
13059
13060 if (NarrowByteWidth > Index) {
13061 return calculateSrcByte(Op, StartingIndex, Index);
13062 }
13063
13064 return std::nullopt;
13065 }
13066
13067 case ISD::BSWAP: {
13068 if (IsVec)
13069 return std::nullopt;
13070
13071 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13072 Depth + 1, StartingIndex);
13073 }
13074
13075 case ISD::EXTRACT_VECTOR_ELT: {
13076 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13077 if (!IdxOp)
13078 return std::nullopt;
13079 auto VecIdx = IdxOp->getZExtValue();
13080 auto ScalarSize = Op.getScalarValueSizeInBits();
13081 if (ScalarSize < 32)
13082 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13083 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13084 StartingIndex, Index);
13085 }
13086
13087 case AMDGPUISD::PERM: {
13088 if (IsVec)
13089 return std::nullopt;
13090
13091 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13092 if (!PermMask)
13093 return std::nullopt;
13094
13095 auto IdxMask =
13096 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13097 if (IdxMask > 0x07 && IdxMask != 0x0c)
13098 return std::nullopt;
13099
13100 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13101 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13102
13103 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13104 : ByteProvider<SDValue>(
13105 ByteProvider<SDValue>::getConstantZero());
13106 }
13107
13108 default: {
13109 return std::nullopt;
13110 }
13111 }
13112
13113 llvm_unreachable("fully handled switch");
13114}
13115
13116// Returns true if the Operand is a scalar and is 16 bits
13117static bool isExtendedFrom16Bits(SDValue &Operand) {
13118
13119 switch (Operand.getOpcode()) {
13120 case ISD::ANY_EXTEND:
13121 case ISD::SIGN_EXTEND:
13122 case ISD::ZERO_EXTEND: {
13123 auto OpVT = Operand.getOperand(0).getValueType();
13124 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13125 }
13126 case ISD::LOAD: {
13127 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13128 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13129 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13130 ExtType == ISD::EXTLOAD) {
13131 auto MemVT = L->getMemoryVT();
13132 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13133 }
13134 return L->getMemoryVT().getSizeInBits() == 16;
13135 }
13136 default:
13137 return false;
13138 }
13139}
13140
13141// Returns true if the mask matches consecutive bytes, and the first byte
13142 // begins at an even (16-bit aligned) byte offset from the 0th byte.
13143static bool addresses16Bits(int Mask) {
13144 int Low8 = Mask & 0xff;
13145 int Hi8 = (Mask & 0xff00) >> 8;
13146
13147 assert(Low8 < 8 && Hi8 < 8);
13148 // Are the bytes contiguous in the order of increasing addresses.
13149 bool IsConsecutive = (Hi8 - Low8 == 1);
13150 // Is the first byte at a location that is aligned for 16 bit instructions?
13151 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
13152 // In this case, we still need code to extract the 16 bit operand, so it
13153 // is better to use i8 v_perm.
13154 bool Is16Aligned = !(Low8 % 2);
13155
13156 return IsConsecutive && Is16Aligned;
13157}
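// Worked examples for the check above: Mask = 0x0100 (source bytes 0 and 1,
// in increasing order, starting at an even byte) addresses a clean 16-bit
// half, so it returns true; Mask = 0x0201 is consecutive but starts at byte 1,
// so extracting it would still need shifts and the check returns false.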
13158
13159// Do not lower into v_perm if the operands are actually 16 bit
13160// and the selected bits (based on PermMask) correspond with two
13161// easily addressable 16 bit operands.
13162 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13163 SDValue &OtherOp) {
13164 int Low16 = PermMask & 0xffff;
13165 int Hi16 = (PermMask & 0xffff0000) >> 16;
13166
13167 auto TempOp = peekThroughBitcasts(Op);
13168 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13169
13170 auto OpIs16Bit =
13171 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13172 if (!OpIs16Bit)
13173 return true;
13174
13175 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13176 isExtendedFrom16Bits(TempOtherOp);
13177 if (!OtherOpIs16Bit)
13178 return true;
13179
13180 // Do we cleanly address both
13181 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13182}
13183
13184 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13185 unsigned DWordOffset) {
13186 SDValue Ret;
13187
13188 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13189 // ByteProvider must be at least 8 bits
13190 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13191
13192 if (TypeSize <= 32)
13193 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13194
13195 if (Src.getValueType().isVector()) {
13196 auto ScalarTySize = Src.getScalarValueSizeInBits();
13197 auto ScalarTy = Src.getValueType().getScalarType();
13198 if (ScalarTySize == 32) {
13199 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13200 DAG.getConstant(DWordOffset, SL, MVT::i32));
13201 }
13202 if (ScalarTySize > 32) {
13203 Ret = DAG.getNode(
13204 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13205 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13206 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13207 if (ShiftVal)
13208 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13209 DAG.getConstant(ShiftVal, SL, MVT::i32));
13210 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13211 }
13212
13213 assert(ScalarTySize < 32);
13214 auto NumElements = TypeSize / ScalarTySize;
13215 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13216 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13217 auto NumElementsIn32 = 32 / ScalarTySize;
13218 auto NumAvailElements = DWordOffset < Trunc32Elements
13219 ? NumElementsIn32
13220 : NumElements - NormalizedTrunc;
13221
13222 SmallVector<SDValue, 4> VecSrcs;
13223 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13224 NumAvailElements);
13225
13226 Ret = DAG.getBuildVector(
13227 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13228 VecSrcs);
13229 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13230 }
13231
13232 /// Scalar Type
13233 auto ShiftVal = 32 * DWordOffset;
13234 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13235 DAG.getConstant(ShiftVal, SL, MVT::i32));
13236 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13237}
13238
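// Try to match an i32 OR tree whose four result bytes each come from at most
// two 32-bit sources, and rebuild it as a single AMDGPUISD::PERM (v_perm_b32)
// with the corresponding byte-select mask.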
13239 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13240 SelectionDAG &DAG = DCI.DAG;
13241 [[maybe_unused]] EVT VT = N->getValueType(0);
13242 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13243
13244 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13245 assert(VT == MVT::i32);
13246 for (int i = 0; i < 4; i++) {
13247 // Find the ByteProvider that provides the ith byte of the result of OR
13248 std::optional<ByteProvider<SDValue>> P =
13249 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13250 // TODO support constantZero
13251 if (!P || P->isConstantZero())
13252 return SDValue();
13253
13254 PermNodes.push_back(*P);
13255 }
13256 if (PermNodes.size() != 4)
13257 return SDValue();
13258
13259 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13260 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13261 uint64_t PermMask = 0x00000000;
13262 for (size_t i = 0; i < PermNodes.size(); i++) {
13263 auto PermOp = PermNodes[i];
13264 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13265 // by sizeof(Src2) = 4
13266 int SrcByteAdjust = 4;
13267
13268 // If the Src uses a byte from a different DWORD, then it corresponds
13269 // with a different source
13270 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13271 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13272 if (SecondSrc)
13273 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13274 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13275 return SDValue();
13276
13277 // Set the index of the second distinct Src node
13278 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13279 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13280 SrcByteAdjust = 0;
13281 }
13282 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13284 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13285 }
13286 SDLoc DL(N);
13287 SDValue Op = *PermNodes[FirstSrc.first].Src;
13288 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13289 assert(Op.getValueSizeInBits() == 32);
13290
13291 // Check that we are not just extracting the bytes in order from an op
13292 if (!SecondSrc) {
13293 int Low16 = PermMask & 0xffff;
13294 int Hi16 = (PermMask & 0xffff0000) >> 16;
13295
13296 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13297 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13298
13299 // The perm op would really just produce Op. So combine into Op
13300 if (WellFormedLow && WellFormedHi)
13301 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13302 }
13303
13304 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13305
13306 if (SecondSrc) {
13307 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13308 assert(OtherOp.getValueSizeInBits() == 32);
13309 }
13310
13311 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13312
13313 assert(Op.getValueType().isByteSized() &&
13314 OtherOp.getValueType().isByteSized());
13315
13316 // If the ultimate src is less than 32 bits, then we will only be
13317 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13318 // CalculateByteProvider would not have returned Op as source if we
13319 // used a byte that is outside its ValueType. Thus, we are free to
13320 // ANY_EXTEND as the extended bits are dont-cares.
13321 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13322 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13323
13324 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13325 DAG.getConstant(PermMask, DL, MVT::i32));
13326 }
13327 return SDValue();
13328}
13329
13330SDValue SITargetLowering::performOrCombine(SDNode *N,
13331 DAGCombinerInfo &DCI) const {
13332 SelectionDAG &DAG = DCI.DAG;
13333 SDValue LHS = N->getOperand(0);
13334 SDValue RHS = N->getOperand(1);
13335
13336 EVT VT = N->getValueType(0);
13337 if (VT == MVT::i1) {
13338 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13339 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13340 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13341 SDValue Src = LHS.getOperand(0);
13342 if (Src != RHS.getOperand(0))
13343 return SDValue();
13344
13345 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13346 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13347 if (!CLHS || !CRHS)
13348 return SDValue();
13349
13350 // Only 10 bits are used.
13351 static const uint32_t MaxMask = 0x3ff;
13352
13353 uint32_t NewMask =
13354 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13355 SDLoc DL(N);
13356 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13357 DAG.getConstant(NewMask, DL, MVT::i32));
13358 }
13359
13360 return SDValue();
13361 }
13362
13363 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13364 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13365 LHS.getOpcode() == AMDGPUISD::PERM &&
13366 isa<ConstantSDNode>(LHS.getOperand(2))) {
13367 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13368 if (!Sel)
13369 return SDValue();
13370
13371 Sel |= LHS.getConstantOperandVal(2);
13372 SDLoc DL(N);
13373 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13374 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13375 }
13376
13377 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13378 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13379 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13380 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13381
13382 // If all the uses of an or need to extract the individual elements, do not
13383 // attempt to lower into v_perm
13384 auto usesCombinedOperand = [](SDNode *OrUse) {
13385 // If we have any non-vectorized use, then it is a candidate for v_perm
13386 if (OrUse->getOpcode() != ISD::BITCAST ||
13387 !OrUse->getValueType(0).isVector())
13388 return true;
13389
13390 // If the bitcast has any non-vectorized user, it is still a candidate for v_perm
13391 for (auto *VUser : OrUse->users()) {
13392 if (!VUser->getValueType(0).isVector())
13393 return true;
13394
13395 // If the use of a vector is a store, then combining via a v_perm
13396 // is beneficial.
13397 // TODO -- whitelist more uses
13398 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13399 if (VUser->getOpcode() == VectorwiseOp)
13400 return true;
13401 }
13402 return false;
13403 };
13404
13405 if (!any_of(N->users(), usesCombinedOperand))
13406 return SDValue();
13407
13408 uint32_t LHSMask = getPermuteMask(LHS);
13409 uint32_t RHSMask = getPermuteMask(RHS);
13410
13411 if (LHSMask != ~0u && RHSMask != ~0u) {
13412 // Canonicalize the expression in an attempt to have fewer unique masks
13413 // and therefore fewer registers used to hold the masks.
13414 if (LHSMask > RHSMask) {
13415 std::swap(LHSMask, RHSMask);
13416 std::swap(LHS, RHS);
13417 }
13418
13419 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13420 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13421 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13422 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13423
13424 // Check if we need to combine values from two sources within a byte.
13425 if (!(LHSUsedLanes & RHSUsedLanes) &&
13426 // If we select high and lower word keep it for SDWA.
13427 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13428 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13429 // Kill zero bytes selected by other mask. Zero value is 0xc.
13430 LHSMask &= ~RHSUsedLanes;
13431 RHSMask &= ~LHSUsedLanes;
13432 // Add 4 to each active LHS lane
13433 LHSMask |= LHSUsedLanes & 0x04040404;
13434 // Combine masks
13435 uint32_t Sel = LHSMask | RHSMask;
13436 SDLoc DL(N);
13437
13438 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13439 RHS.getOperand(0),
13440 DAG.getConstant(Sel, DL, MVT::i32));
13441 }
13442 }
13443 if (LHSMask == ~0u || RHSMask == ~0u) {
13444 if (SDValue Perm = matchPERM(N, DCI))
13445 return Perm;
13446 }
13447 }
13448
13449 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13450 return SDValue();
13451
13452 // TODO: This could be a generic combine with a predicate for extracting the
13453 // high half of an integer being free.
13454
13455 // (or i64:x, (zero_extend i32:y)) ->
13456 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13457 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13458 RHS.getOpcode() != ISD::ZERO_EXTEND)
13459 std::swap(LHS, RHS);
13460
13461 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13462 SDValue ExtSrc = RHS.getOperand(0);
13463 EVT SrcVT = ExtSrc.getValueType();
13464 if (SrcVT == MVT::i32) {
13465 SDLoc SL(N);
13466 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
13467 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
13468
13469 DCI.AddToWorklist(LowOr.getNode());
13470 DCI.AddToWorklist(HiBits.getNode());
13471
13472 SDValue Vec =
13473 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
13474 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13475 }
13476 }
13477
13478 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13479 if (CRHS) {
13480 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
13481 N->getOperand(0), CRHS))
13482 return Split;
13483 }
13484
13485 return SDValue();
13486}
13487
13488SDValue SITargetLowering::performXorCombine(SDNode *N,
13489 DAGCombinerInfo &DCI) const {
13490 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13491 return RV;
13492
13493 SDValue LHS = N->getOperand(0);
13494 SDValue RHS = N->getOperand(1);
13495
13496 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13497 SelectionDAG &DAG = DCI.DAG;
13498
13499 EVT VT = N->getValueType(0);
13500 if (CRHS && VT == MVT::i64) {
13501 if (SDValue Split =
13502 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13503 return Split;
13504 }
13505
13506 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13507 // fneg-like xors into 64-bit select.
13508 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13509 // This looks like an fneg, try to fold as a source modifier.
13510 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13511 shouldFoldFNegIntoSrc(N, LHS)) {
13512 // xor (select c, a, b), 0x80000000 ->
13513 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13514 SDLoc DL(N);
13515 SDValue CastLHS =
13516 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13517 SDValue CastRHS =
13518 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13519 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13520 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13521 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13522 LHS->getOperand(0), FNegLHS, FNegRHS);
13523 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13524 }
13525 }
13526
13527 return SDValue();
13528}
13529
13530SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13531 DAGCombinerInfo &DCI) const {
13532 if (!Subtarget->has16BitInsts() ||
13533 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13534 return SDValue();
13535
13536 EVT VT = N->getValueType(0);
13537 if (VT != MVT::i32)
13538 return SDValue();
13539
13540 SDValue Src = N->getOperand(0);
13541 if (Src.getValueType() != MVT::i16)
13542 return SDValue();
13543
13544 return SDValue();
13545}
13546
13547SDValue
13548SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13549 DAGCombinerInfo &DCI) const {
13550 SDValue Src = N->getOperand(0);
13551 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13552
13553 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13554 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13555 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13556 VTSign->getVT() == MVT::i8) ||
13557 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13558 VTSign->getVT() == MVT::i16))) {
13559 assert(Subtarget->hasScalarSubwordLoads() &&
13560 "s_buffer_load_{u8, i8} are supported "
13561 "in GFX12 (or newer) architectures.");
13562 EVT VT = Src.getValueType();
13563 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13564 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13565 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13566 SDLoc DL(N);
13567 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13568 SDValue Ops[] = {
13569 Src.getOperand(0), // source register
13570 Src.getOperand(1), // offset
13571 Src.getOperand(2) // cachePolicy
13572 };
13573 auto *M = cast<MemSDNode>(Src);
13574 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13575 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13576 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13577 return LoadVal;
13578 }
13579 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13580 VTSign->getVT() == MVT::i8) ||
13581 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13582 VTSign->getVT() == MVT::i16)) &&
13583 Src.hasOneUse()) {
13584 auto *M = cast<MemSDNode>(Src);
13585 SDValue Ops[] = {Src.getOperand(0), // Chain
13586 Src.getOperand(1), // rsrc
13587 Src.getOperand(2), // vindex
13588 Src.getOperand(3), // voffset
13589 Src.getOperand(4), // soffset
13590 Src.getOperand(5), // offset
13591 Src.getOperand(6), Src.getOperand(7)};
13592 // replace with BUFFER_LOAD_BYTE/SHORT
13593 SDVTList ResList =
13594 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13595 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13596 ? AMDGPUISD::BUFFER_LOAD_BYTE
13597 : AMDGPUISD::BUFFER_LOAD_SHORT;
13598 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13599 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13600 return DCI.DAG.getMergeValues(
13601 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13602 }
13603 return SDValue();
13604}
13605
13606SDValue SITargetLowering::performClassCombine(SDNode *N,
13607 DAGCombinerInfo &DCI) const {
13608 SelectionDAG &DAG = DCI.DAG;
13609 SDValue Mask = N->getOperand(1);
13610
13611 // fp_class x, 0 -> false
13612 if (isNullConstant(Mask))
13613 return DAG.getConstant(0, SDLoc(N), MVT::i1);
13614
13615 if (N->getOperand(0).isUndef())
13616 return DAG.getUNDEF(MVT::i1);
13617
13618 return SDValue();
13619}
13620
13621SDValue SITargetLowering::performRcpCombine(SDNode *N,
13622 DAGCombinerInfo &DCI) const {
13623 EVT VT = N->getValueType(0);
13624 SDValue N0 = N->getOperand(0);
13625
13626 if (N0.isUndef()) {
13627 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13628 SDLoc(N), VT);
13629 }
13630
13631 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13632 N0.getOpcode() == ISD::SINT_TO_FP)) {
13633 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13634 N->getFlags());
13635 }
13636
13637 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13638 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13639 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13640 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13641 N->getFlags());
13642 }
13643
13644 return SDValue();
13645}
13646
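// Conservatively returns true if the value computed by Op is already in
// canonical floating-point form (signaling NaNs quieted, denormals consistent
// with the current denormal mode), recursing through the operands up to
// MaxDepth for operations that merely forward their inputs.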
13647 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13648 unsigned MaxDepth) const {
13649 unsigned Opcode = Op.getOpcode();
13650 if (Opcode == ISD::FCANONICALIZE)
13651 return true;
13652
13653 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13654 const auto &F = CFP->getValueAPF();
13655 if (F.isNaN() && F.isSignaling())
13656 return false;
13657 if (!F.isDenormal())
13658 return true;
13659
13660 DenormalMode Mode =
13661 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
13662 return Mode == DenormalMode::getIEEE();
13663 }
13664
13665 // If source is a result of another standard FP operation it is already in
13666 // canonical form.
13667 if (MaxDepth == 0)
13668 return false;
13669
13670 switch (Opcode) {
13671 // These will flush denorms if required.
13672 case ISD::FADD:
13673 case ISD::FSUB:
13674 case ISD::FMUL:
13675 case ISD::FCEIL:
13676 case ISD::FFLOOR:
13677 case ISD::FMA:
13678 case ISD::FMAD:
13679 case ISD::FSQRT:
13680 case ISD::FDIV:
13681 case ISD::FREM:
13682 case ISD::FP_ROUND:
13683 case ISD::FP_EXTEND:
13684 case ISD::FP16_TO_FP:
13685 case ISD::FP_TO_FP16:
13686 case ISD::BF16_TO_FP:
13687 case ISD::FP_TO_BF16:
13688 case ISD::FLDEXP:
13691 case AMDGPUISD::RCP:
13692 case AMDGPUISD::RSQ:
13696 case AMDGPUISD::LOG:
13697 case AMDGPUISD::EXP:
13701 case AMDGPUISD::FRACT:
13708 case AMDGPUISD::SIN_HW:
13709 case AMDGPUISD::COS_HW:
13710 return true;
13711
13712 // It can/will be lowered or combined as a bit operation.
13713 // Need to check their input recursively to handle.
13714 case ISD::FNEG:
13715 case ISD::FABS:
13716 case ISD::FCOPYSIGN:
13717 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13718
13719 case ISD::AND:
13720 if (Op.getValueType() == MVT::i32) {
13721 // Be careful as we only know it is a bitcast floating point type. It
13722 // could be f32, v2f16, we have no way of knowing. Luckily the constant
13723 // value that we optimize for, which comes up in fp32 to bf16 conversions,
13724 // is valid to optimize for all types.
13725 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13726 if (RHS->getZExtValue() == 0xffff0000) {
13727 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13728 }
13729 }
13730 }
13731 break;
13732
13733 case ISD::FSIN:
13734 case ISD::FCOS:
13735 case ISD::FSINCOS:
13736 return Op.getValueType().getScalarType() != MVT::f16;
13737
13738 case ISD::FMINNUM:
13739 case ISD::FMAXNUM:
13740 case ISD::FMINNUM_IEEE:
13741 case ISD::FMAXNUM_IEEE:
13742 case ISD::FMINIMUM:
13743 case ISD::FMAXIMUM:
13744 case ISD::FMINIMUMNUM:
13745 case ISD::FMAXIMUMNUM:
13746 case AMDGPUISD::CLAMP:
13747 case AMDGPUISD::FMED3:
13748 case AMDGPUISD::FMAX3:
13749 case AMDGPUISD::FMIN3:
13750 case AMDGPUISD::FMAXIMUM3:
13751 case AMDGPUISD::FMINIMUM3: {
13752 // FIXME: Shouldn't treat the generic operations differently based on these.
13753 // However, we aren't really required to flush the result from
13754 // minnum/maxnum..
13755
13756 // snans will be quieted, so we only need to worry about denormals.
13757 if (Subtarget->supportsMinMaxDenormModes() ||
13758 // FIXME: denormalsEnabledForType is broken for dynamic
13759 denormalsEnabledForType(DAG, Op.getValueType()))
13760 return true;
13761
13762 // Flushing may be required.
13763 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
13764 // targets need to check their input recursively.
13765
13766 // FIXME: Does this apply with clamp? It's implemented with max.
13767 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13768 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13769 return false;
13770 }
13771
13772 return true;
13773 }
13774 case ISD::SELECT: {
13775 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13776 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13777 }
13778 case ISD::BUILD_VECTOR: {
13779 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13780 SDValue SrcOp = Op.getOperand(i);
13781 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13782 return false;
13783 }
13784
13785 return true;
13786 }
13789 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13790 }
13792 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13793 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13794 }
13795 case ISD::UNDEF:
13796 // Could be anything.
13797 return false;
13798
13799 case ISD::BITCAST:
13800 // TODO: This is incorrect as it loses track of the operand's type. We may
13801 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13802 // same bits that are canonicalized in one type need not be in the other.
13803 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13804 case ISD::TRUNCATE: {
13805 // Hack round the mess we make when legalizing extract_vector_elt
13806 if (Op.getValueType() == MVT::i16) {
13807 SDValue TruncSrc = Op.getOperand(0);
13808 if (TruncSrc.getValueType() == MVT::i32 &&
13809 TruncSrc.getOpcode() == ISD::BITCAST &&
13810 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13811 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13812 }
13813 }
13814 return false;
13815 }
13816 case ISD::INTRINSIC_WO_CHAIN: {
13817 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13818 // TODO: Handle more intrinsics
13819 switch (IntrinsicID) {
13820 case Intrinsic::amdgcn_cvt_pkrtz:
13821 case Intrinsic::amdgcn_cubeid:
13822 case Intrinsic::amdgcn_frexp_mant:
13823 case Intrinsic::amdgcn_fdot2:
13824 case Intrinsic::amdgcn_rcp:
13825 case Intrinsic::amdgcn_rsq:
13826 case Intrinsic::amdgcn_rsq_clamp:
13827 case Intrinsic::amdgcn_rcp_legacy:
13828 case Intrinsic::amdgcn_rsq_legacy:
13829 case Intrinsic::amdgcn_trig_preop:
13830 case Intrinsic::amdgcn_tanh:
13831 case Intrinsic::amdgcn_log:
13832 case Intrinsic::amdgcn_exp2:
13833 case Intrinsic::amdgcn_sqrt:
13834 return true;
13835 default:
13836 break;
13837 }
13838
13839 break;
13840 }
13841 default:
13842 break;
13843 }
13844
13845 // FIXME: denormalsEnabledForType is broken for dynamic
13846 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13847 DAG.isKnownNeverSNaN(Op);
13848}
13849
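// GlobalISel counterpart of the query above: returns true if the value held in
// Reg is known to be canonical, walking the defining instructions up to
// MaxDepth.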
13850 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13851 unsigned MaxDepth) const {
13852 const MachineRegisterInfo &MRI = MF.getRegInfo();
13853 MachineInstr *MI = MRI.getVRegDef(Reg);
13854 unsigned Opcode = MI->getOpcode();
13855
13856 if (Opcode == AMDGPU::G_FCANONICALIZE)
13857 return true;
13858
13859 std::optional<FPValueAndVReg> FCR;
13860 // Constant splat (can be padded with undef) or scalar constant.
13861 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13862 if (FCR->Value.isSignaling())
13863 return false;
13864 if (!FCR->Value.isDenormal())
13865 return true;
13866
13867 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13868 return Mode == DenormalMode::getIEEE();
13869 }
13870
13871 if (MaxDepth == 0)
13872 return false;
13873
13874 switch (Opcode) {
13875 case AMDGPU::G_FADD:
13876 case AMDGPU::G_FSUB:
13877 case AMDGPU::G_FMUL:
13878 case AMDGPU::G_FCEIL:
13879 case AMDGPU::G_FFLOOR:
13880 case AMDGPU::G_FRINT:
13881 case AMDGPU::G_FNEARBYINT:
13882 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13883 case AMDGPU::G_INTRINSIC_TRUNC:
13884 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13885 case AMDGPU::G_FMA:
13886 case AMDGPU::G_FMAD:
13887 case AMDGPU::G_FSQRT:
13888 case AMDGPU::G_FDIV:
13889 case AMDGPU::G_FREM:
13890 case AMDGPU::G_FPOW:
13891 case AMDGPU::G_FPEXT:
13892 case AMDGPU::G_FLOG:
13893 case AMDGPU::G_FLOG2:
13894 case AMDGPU::G_FLOG10:
13895 case AMDGPU::G_FPTRUNC:
13896 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13897 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13898 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13899 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13900 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13901 return true;
13902 case AMDGPU::G_FNEG:
13903 case AMDGPU::G_FABS:
13904 case AMDGPU::G_FCOPYSIGN:
13905 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13906 case AMDGPU::G_FMINNUM:
13907 case AMDGPU::G_FMAXNUM:
13908 case AMDGPU::G_FMINNUM_IEEE:
13909 case AMDGPU::G_FMAXNUM_IEEE:
13910 case AMDGPU::G_FMINIMUM:
13911 case AMDGPU::G_FMAXIMUM:
13912 case AMDGPU::G_FMINIMUMNUM:
13913 case AMDGPU::G_FMAXIMUMNUM: {
13914 if (Subtarget->supportsMinMaxDenormModes() ||
13915 // FIXME: denormalsEnabledForType is broken for dynamic
13916 denormalsEnabledForType(MRI.getType(Reg), MF))
13917 return true;
13918
13919 [[fallthrough]];
13920 }
13921 case AMDGPU::G_BUILD_VECTOR:
13922 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13923 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13924 return false;
13925 return true;
13926 case AMDGPU::G_INTRINSIC:
13927 case AMDGPU::G_INTRINSIC_CONVERGENT:
13928 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13929 case Intrinsic::amdgcn_fmul_legacy:
13930 case Intrinsic::amdgcn_fmad_ftz:
13931 case Intrinsic::amdgcn_sqrt:
13932 case Intrinsic::amdgcn_fmed3:
13933 case Intrinsic::amdgcn_sin:
13934 case Intrinsic::amdgcn_cos:
13935 case Intrinsic::amdgcn_log:
13936 case Intrinsic::amdgcn_exp2:
13937 case Intrinsic::amdgcn_log_clamp:
13938 case Intrinsic::amdgcn_rcp:
13939 case Intrinsic::amdgcn_rcp_legacy:
13940 case Intrinsic::amdgcn_rsq:
13941 case Intrinsic::amdgcn_rsq_clamp:
13942 case Intrinsic::amdgcn_rsq_legacy:
13943 case Intrinsic::amdgcn_div_scale:
13944 case Intrinsic::amdgcn_div_fmas:
13945 case Intrinsic::amdgcn_div_fixup:
13946 case Intrinsic::amdgcn_fract:
13947 case Intrinsic::amdgcn_cvt_pkrtz:
13948 case Intrinsic::amdgcn_cubeid:
13949 case Intrinsic::amdgcn_cubema:
13950 case Intrinsic::amdgcn_cubesc:
13951 case Intrinsic::amdgcn_cubetc:
13952 case Intrinsic::amdgcn_frexp_mant:
13953 case Intrinsic::amdgcn_fdot2:
13954 case Intrinsic::amdgcn_trig_preop:
13955 case Intrinsic::amdgcn_tanh:
13956 return true;
13957 default:
13958 break;
13959 }
13960
13961 [[fallthrough]];
13962 default:
13963 return false;
13964 }
13965
13966 llvm_unreachable("invalid operation");
13967}
13968
13969// Constant fold canonicalize.
13970SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13971 const SDLoc &SL, EVT VT,
13972 const APFloat &C) const {
13973 // Flush denormals to 0 if not enabled.
13974 if (C.isDenormal()) {
13975 DenormalMode Mode =
13976 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13977 if (Mode == DenormalMode::getPreserveSign()) {
13978 return DAG.getConstantFP(
13979 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13980 }
13981
13982 if (Mode != DenormalMode::getIEEE())
13983 return SDValue();
13984 }
13985
13986 if (C.isNaN()) {
13987 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13988 if (C.isSignaling()) {
13989 // Quiet a signaling NaN.
13990 // FIXME: Is this supposed to preserve payload bits?
13991 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13992 }
13993
13994 // Make sure it is the canonical NaN bitpattern.
13995 //
13996 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13997 // immediate?
13998 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13999 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14000 }
14001
14002 // Already canonical.
14003 return DAG.getConstantFP(C, SL, VT);
14004}
14005
14007 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14008}
14009
14010SDValue
14011SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14012 DAGCombinerInfo &DCI) const {
14013 SelectionDAG &DAG = DCI.DAG;
14014 SDValue N0 = N->getOperand(0);
14015 EVT VT = N->getValueType(0);
14016
14017 // fcanonicalize undef -> qnan
14018 if (N0.isUndef()) {
14019 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14020 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14021 }
14022
14023 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14024 EVT VT = N->getValueType(0);
14025 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14026 }
14027
14028 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14029 // (fcanonicalize k)
14030 //
14031 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14032
14033 // TODO: This could be better with wider vectors that will be split to v2f16,
14034 // and to consider uses since there aren't that many packed operations.
14035 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14036 isTypeLegal(MVT::v2f16)) {
14037 SDLoc SL(N);
14038 SDValue NewElts[2];
14039 SDValue Lo = N0.getOperand(0);
14040 SDValue Hi = N0.getOperand(1);
14041 EVT EltVT = Lo.getValueType();
14042
14044 for (unsigned I = 0; I != 2; ++I) {
14045 SDValue Op = N0.getOperand(I);
14046 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14047 NewElts[I] =
14048 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14049 } else if (Op.isUndef()) {
14050 // Handled below based on what the other operand is.
14051 NewElts[I] = Op;
14052 } else {
14053 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14054 }
14055 }
14056
14057 // If one half is undef, and one is constant, prefer a splat vector rather
14058 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14059 // cheaper to use and may be free with a packed operation.
14060 if (NewElts[0].isUndef()) {
14062 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14063 ? NewElts[1]
14064 : DAG.getConstantFP(0.0f, SL, EltVT);
14065 }
14066
14067 if (NewElts[1].isUndef()) {
14068 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14069 ? NewElts[0]
14070 : DAG.getConstantFP(0.0f, SL, EltVT);
14071 }
14072
14073 return DAG.getBuildVector(VT, SL, NewElts);
14074 }
14075 }
14076
14077 return SDValue();
14078}
14079
14080static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14081 switch (Opc) {
14082 case ISD::FMAXNUM:
14083 case ISD::FMAXNUM_IEEE:
14084 case ISD::FMAXIMUMNUM:
14085 return AMDGPUISD::FMAX3;
14086 case ISD::FMAXIMUM:
14087 return AMDGPUISD::FMAXIMUM3;
14088 case ISD::SMAX:
14089 return AMDGPUISD::SMAX3;
14090 case ISD::UMAX:
14091 return AMDGPUISD::UMAX3;
14092 case ISD::FMINNUM:
14093 case ISD::FMINNUM_IEEE:
14094 case ISD::FMINIMUMNUM:
14095 return AMDGPUISD::FMIN3;
14096 case ISD::FMINIMUM:
14097 return AMDGPUISD::FMINIMUM3;
14098 case ISD::SMIN:
14099 return AMDGPUISD::SMIN3;
14100 case ISD::UMIN:
14101 return AMDGPUISD::UMIN3;
14102 default:
14103 llvm_unreachable("Not a min/max opcode");
14104 }
14105}
14106
14107SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14108 const SDLoc &SL, SDValue Src,
14109 SDValue MinVal,
14110 SDValue MaxVal,
14111 bool Signed) const {
14112
14113 // med3 comes from
14114 // min(max(x, K0), K1), K0 < K1
14115 // max(min(x, K0), K1), K1 < K0
14116 //
14117 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14118 // min/max op.
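// For example (signed case): smin(smax(x, -3), 7) reaches here with
// MinVal == 7 and MaxVal == -3 and is rebuilt below as smed3(x, -3, 7).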
14119 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14120 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14121
14122 if (!MinK || !MaxK)
14123 return SDValue();
14124
14125 if (Signed) {
14126 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14127 return SDValue();
14128 } else {
14129 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14130 return SDValue();
14131 }
14132
14133 EVT VT = MinK->getValueType(0);
14134 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14135 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14136 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14137
14138 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14139 // not available, but this is unlikely to be profitable as constants
14140 // will often need to be materialized & extended, especially on
14141 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14142 return SDValue();
14143}
14144
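// Returns Op itself if it is a ConstantFPSDNode, or the splat constant of a
// constant BUILD_VECTOR; otherwise returns null.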
14145 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14146 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14147 return C;
14148
14149 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14150 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14151 return C;
14152 }
14153
14154 return nullptr;
14155}
14156
14157SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14158 const SDLoc &SL, SDValue Op0,
14159 SDValue Op1) const {
14160 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14161 if (!K1)
14162 return SDValue();
14163
14164 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14165 if (!K0)
14166 return SDValue();
14167
14168 // Ordered >= (although NaN inputs should have folded away by now).
14169 if (K0->getValueAPF() > K1->getValueAPF())
14170 return SDValue();
14171
14172 // med3 with a nan input acts like
14173 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14174 //
14175 // So the result depends on whether the IEEE mode bit is enabled or not with a
14176 // signaling nan input.
14177 // ieee=1
14178 // s0 snan: yields s2
14179 // s1 snan: yields s2
14180 // s2 snan: qnan
14181
14182 // s0 qnan: min(s1, s2)
14183 // s1 qnan: min(s0, s2)
14184 // s2 qnan: min(s0, s1)
14185
14186 // ieee=0
14187 // s0 snan: min(s1, s2)
14188 // s1 snan: min(s0, s2)
14189 // s2 snan: qnan
14190
14191 // s0 qnan: min(s1, s2)
14192 // s1 qnan: min(s0, s2)
14193 // s2 qnan: min(s0, s1)
14194 const MachineFunction &MF = DAG.getMachineFunction();
14195 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14196
14197 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14198 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14199 // can only form if op0 is fmaxnum_ieee if IEEE=1.
14200 EVT VT = Op0.getValueType();
14201 if (Info->getMode().DX10Clamp) {
14202 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14203 // hardware fmed3 behavior converting to a min.
14204 // FIXME: Should this be allowing -0.0?
14205 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14206 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14207 }
14208
14209 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14210 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14211 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14212 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14213 // then give the other result, which is different from med3 with a NaN
14214 // input.
14215 SDValue Var = Op0.getOperand(0);
14216 if (!DAG.isKnownNeverSNaN(Var))
14217 return SDValue();
14218
14219 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14220
14221 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14222 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14223 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14224 SDValue(K0, 0), SDValue(K1, 0));
14225 }
14226 }
14227
14228 return SDValue();
14229}
14230
14231/// \return true if the subtarget supports minimum3 and maximum3 with the given
14232/// base min/max opcode \p Opc for type \p VT.
14233static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14234 EVT VT) {
14235 switch (Opc) {
14236 case ISD::FMINNUM:
14237 case ISD::FMAXNUM:
14238 case ISD::FMINNUM_IEEE:
14239 case ISD::FMAXNUM_IEEE:
14240 case ISD::FMINIMUMNUM:
14241 case ISD::FMAXIMUMNUM:
14242 case AMDGPUISD::FMIN_LEGACY:
14243 case AMDGPUISD::FMAX_LEGACY:
14244 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14245 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14246 case ISD::FMINIMUM:
14247 case ISD::FMAXIMUM:
14248 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14249 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14250 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14251 case ISD::SMAX:
14252 case ISD::SMIN:
14253 case ISD::UMAX:
14254 case ISD::UMIN:
14255 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14256 default:
14257 return false;
14258 }
14259
14260 llvm_unreachable("not a min/max opcode");
14261}
14262
14263SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14264 DAGCombinerInfo &DCI) const {
14265 SelectionDAG &DAG = DCI.DAG;
14266
14267 EVT VT = N->getValueType(0);
14268 unsigned Opc = N->getOpcode();
14269 SDValue Op0 = N->getOperand(0);
14270 SDValue Op1 = N->getOperand(1);
14271
14272 // Only do this if the inner op has one use since this will just increases
14273 // register pressure for no benefit.
14274
14275 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14276 // max(max(a, b), c) -> max3(a, b, c)
14277 // min(min(a, b), c) -> min3(a, b, c)
14278 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14279 SDLoc DL(N);
14280 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14281 Op0.getOperand(0), Op0.getOperand(1), Op1);
14282 }
14283
14284 // Try commuted.
14285 // max(a, max(b, c)) -> max3(a, b, c)
14286 // min(a, min(b, c)) -> min3(a, b, c)
14287 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14288 SDLoc DL(N);
14289 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14290 Op0, Op1.getOperand(0), Op1.getOperand(1));
14291 }
14292 }
14293
14294 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14295 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14296 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14297 if (SDValue Med3 = performIntMed3ImmCombine(
14298 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14299 return Med3;
14300 }
14301 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14302 if (SDValue Med3 = performIntMed3ImmCombine(
14303 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14304 return Med3;
14305 }
14306
14307 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14308 if (SDValue Med3 = performIntMed3ImmCombine(
14309 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14310 return Med3;
14311 }
14312 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14313 if (SDValue Med3 = performIntMed3ImmCombine(
14314 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14315 return Med3;
14316 }
14317
14318 // if !is_snan(x):
14319 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14320 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14321 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14322 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14323 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14324 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14325 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14326 (Opc == AMDGPUISD::FMIN_LEGACY &&
14327 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14328 (VT == MVT::f32 || VT == MVT::f64 ||
14329 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14330 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14331 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14332 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14333 Op0.hasOneUse()) {
14334 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14335 return Res;
14336 }
14337
14338 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14339 // for some types, but at a higher cost since it's implemented with a 3
14340 // operand form.
14341 const SDNodeFlags Flags = N->getFlags();
14342 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14343 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14344 unsigned NewOpc =
14345 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14346 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14347 }
14348
14349 return SDValue();
14350}
14351
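// Returns true if (A, B) is the constant pair (0.0, 1.0) in either order,
// i.e. the bounds of a clamp-to-[0, 1] pattern.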
14352 static bool isClampZeroToOne(SDValue A, SDValue B) {
14353 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14354 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14355 // FIXME: Should this be allowing -0.0?
14356 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14357 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14358 }
14359 }
14360
14361 return false;
14362}
14363
14364// FIXME: Should only worry about snans for version with chain.
14365SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14366 DAGCombinerInfo &DCI) const {
14367 EVT VT = N->getValueType(0);
14368 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14369 // NaNs. With a NaN input, the order of the operands may change the result.
14370
14371 SelectionDAG &DAG = DCI.DAG;
14372 SDLoc SL(N);
14373
14374 SDValue Src0 = N->getOperand(0);
14375 SDValue Src1 = N->getOperand(1);
14376 SDValue Src2 = N->getOperand(2);
14377
14378 if (isClampZeroToOne(Src0, Src1)) {
14379 // const_a, const_b, x -> clamp is safe in all cases including signaling
14380 // nans.
14381 // FIXME: Should this be allowing -0.0?
14382 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14383 }
14384
14385 const MachineFunction &MF = DAG.getMachineFunction();
14386 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14387
14388 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14389 // handling no dx10-clamp?
14390 if (Info->getMode().DX10Clamp) {
14391 // If NaNs is clamped to 0, we are free to reorder the inputs.
14392
14393 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14394 std::swap(Src0, Src1);
14395
14396 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14397 std::swap(Src1, Src2);
14398
14399 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14400 std::swap(Src0, Src1);
14401
14402 if (isClampZeroToOne(Src1, Src2))
14403 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14404 }
14405
14406 return SDValue();
14407}
14408
14409SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14410 DAGCombinerInfo &DCI) const {
14411 SDValue Src0 = N->getOperand(0);
14412 SDValue Src1 = N->getOperand(1);
14413 if (Src0.isUndef() && Src1.isUndef())
14414 return DCI.DAG.getUNDEF(N->getValueType(0));
14415 return SDValue();
14416}
14417
14418// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14419// expanded into a set of cmp/select instructions.
14420 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14421 unsigned NumElem,
14422 bool IsDivergentIdx,
14423 const GCNSubtarget *Subtarget) {
14424 if (UseDivergentRegisterIndexing)
14425 return false;
14426
14427 unsigned VecSize = EltSize * NumElem;
14428
14429 // Sub-dword vectors of size 2 dword or less have better implementation.
14430 if (VecSize <= 64 && EltSize < 32)
14431 return false;
14432
14433 // Always expand the rest of sub-dword instructions, otherwise it will be
14434 // lowered via memory.
14435 if (EltSize < 32)
14436 return true;
14437
14438 // Always do this if var-idx is divergent, otherwise it will become a loop.
14439 if (IsDivergentIdx)
14440 return true;
14441
14442 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14443 unsigned NumInsts = NumElem /* Number of compares */ +
14444 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
14445
14446 // On some architectures (GFX9) movrel is not available and it's better
14447 // to expand.
14448 if (Subtarget->useVGPRIndexMode())
14449 return NumInsts <= 16;
14450
14451 // If movrel is available, use it instead of expanding for vector of 8
14452 // elements.
14453 if (Subtarget->hasMovrel())
14454 return NumInsts <= 15;
14455
14456 return true;
14457}
14458
14459 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14460 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14461 if (isa<ConstantSDNode>(Idx))
14462 return false;
14463
14464 SDValue Vec = N->getOperand(0);
14465 EVT VecVT = Vec.getValueType();
14466 EVT EltVT = VecVT.getVectorElementType();
14467 unsigned EltSize = EltVT.getSizeInBits();
14468 unsigned NumElem = VecVT.getVectorNumElements();
14469
14470 return SITargetLowering::shouldExpandVectorDynExt(
14471 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14472}
14473
14474SDValue
14475SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14476 DAGCombinerInfo &DCI) const {
14477 SDValue Vec = N->getOperand(0);
14478 SelectionDAG &DAG = DCI.DAG;
14479
14480 EVT VecVT = Vec.getValueType();
14481 EVT VecEltVT = VecVT.getVectorElementType();
14482 EVT ResVT = N->getValueType(0);
14483
14484 unsigned VecSize = VecVT.getSizeInBits();
14485 unsigned VecEltSize = VecEltVT.getSizeInBits();
14486
14487 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14488 allUsesHaveSourceMods(N)) {
14489 SDLoc SL(N);
14490 SDValue Idx = N->getOperand(1);
14491 SDValue Elt =
14492 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14493 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14494 }
14495
14496 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14497 // =>
14498 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14499 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14500 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14501 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14502 SDLoc SL(N);
14503 SDValue Idx = N->getOperand(1);
14504 unsigned Opc = Vec.getOpcode();
14505
14506 switch (Opc) {
14507 default:
14508 break;
14509 // TODO: Support other binary operations.
14510 case ISD::FADD:
14511 case ISD::FSUB:
14512 case ISD::FMUL:
14513 case ISD::ADD:
14514 case ISD::UMIN:
14515 case ISD::UMAX:
14516 case ISD::SMIN:
14517 case ISD::SMAX:
14518 case ISD::FMAXNUM:
14519 case ISD::FMINNUM:
14520 case ISD::FMAXNUM_IEEE:
14521 case ISD::FMINNUM_IEEE:
14522 case ISD::FMAXIMUM:
14523 case ISD::FMINIMUM: {
14524 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14525 Vec.getOperand(0), Idx);
14526 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14527 Vec.getOperand(1), Idx);
14528
14529 DCI.AddToWorklist(Elt0.getNode());
14530 DCI.AddToWorklist(Elt1.getNode());
14531 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14532 }
14533 }
14534 }
14535
14536 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14537 if (shouldExpandVectorDynExt(N)) {
14538 SDLoc SL(N);
14539 SDValue Idx = N->getOperand(1);
14540 SDValue V;
14541 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14542 SDValue IC = DAG.getVectorIdxConstant(I, SL);
14543 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14544 if (I == 0)
14545 V = Elt;
14546 else
14547 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14548 }
14549 return V;
14550 }
14551
14552 if (!DCI.isBeforeLegalize())
14553 return SDValue();
14554
14555 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14556 // elements. This exposes more load reduction opportunities by replacing
14557 // multiple small extract_vector_elements with a single 32-bit extract.
14558 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14559 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14560 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14561 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14562
14563 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14564 unsigned EltIdx = BitIndex / 32;
14565 unsigned LeftoverBitIdx = BitIndex % 32;
14566 SDLoc SL(N);
14567
14568 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14569 DCI.AddToWorklist(Cast.getNode());
14570
14571 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14572 DAG.getConstant(EltIdx, SL, MVT::i32));
14573 DCI.AddToWorklist(Elt.getNode());
14574 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14575 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14576 DCI.AddToWorklist(Srl.getNode());
14577
14578 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14579 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14580 DCI.AddToWorklist(Trunc.getNode());
14581
14582 if (VecEltVT == ResVT) {
14583 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14584 }
14585
14586 assert(ResVT.isScalarInteger());
14587 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14588 }
14589
14590 return SDValue();
14591}
14592
14593SDValue
14594SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14595 DAGCombinerInfo &DCI) const {
14596 SDValue Vec = N->getOperand(0);
14597 SDValue Idx = N->getOperand(2);
14598 EVT VecVT = Vec.getValueType();
14599 EVT EltVT = VecVT.getVectorElementType();
14600
14601 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14602 // => BUILD_VECTOR n x select (e, const-idx)
14603 if (!shouldExpandVectorDynExt(N))
14604 return SDValue();
14605
14606 SelectionDAG &DAG = DCI.DAG;
14607 SDLoc SL(N);
14608 SDValue Ins = N->getOperand(1);
14609 EVT IdxVT = Idx.getValueType();
14610
14611 SmallVector<SDValue, 16> Ops;
14612 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14613 SDValue IC = DAG.getConstant(I, SL, IdxVT);
14614 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14615 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14616 Ops.push_back(V);
14617 }
14618
14619 return DAG.getBuildVector(VecVT, SL, Ops);
14620}
14621
14622/// Return the source of an fp_extend from f16 to f32, or a converted FP
14623/// constant.
14624 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14625 if (Src.getOpcode() == ISD::FP_EXTEND &&
14626 Src.getOperand(0).getValueType() == MVT::f16) {
14627 return Src.getOperand(0);
14628 }
14629
14630 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14631 APFloat Val = CFP->getValueAPF();
14632 bool LosesInfo = true;
14633 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14634 if (!LosesInfo)
14635 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14636 }
14637
14638 return SDValue();
14639}
14640
14641SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14642 DAGCombinerInfo &DCI) const {
14643 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14644 "combine only useful on gfx8");
14645
14646 SDValue TruncSrc = N->getOperand(0);
14647 EVT VT = N->getValueType(0);
14648 if (VT != MVT::f16)
14649 return SDValue();
14650
14651 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14652 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14653 return SDValue();
14654
14655 SelectionDAG &DAG = DCI.DAG;
14656 SDLoc SL(N);
14657
14658 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14659 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14660 // casting back.
14661
14662 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14663 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14664 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
14665 if (!A)
14666 return SDValue();
14667
14668 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
14669 if (!B)
14670 return SDValue();
14671
14672 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
14673 if (!C)
14674 return SDValue();
14675
14676 // This changes signaling nan behavior. If an input is a signaling nan, it
14677 // would have been quieted by the fpext originally. We don't care because
14678 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14679 // we would be worse off than just doing the promotion.
14680 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
14681 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
14682 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
14683 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14684}
14685
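// Decide how to contract an fmul feeding into an fadd: prefer ISD::FMAD when
// denormal handling permits it, fall back to ISD::FMA when contraction is
// allowed, and return 0 to leave the nodes untouched.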
14686unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14687 const SDNode *N0,
14688 const SDNode *N1) const {
14689 EVT VT = N0->getValueType(0);
14690
14691 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14692 // support denormals ever.
14693 if (((VT == MVT::f32 &&
14694 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
14695 (VT == MVT::f16 && Subtarget->hasMadF16() &&
14696 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
14697 isOperationLegal(ISD::FMAD, VT))
14698 return ISD::FMAD;
14699
14700 const TargetOptions &Options = DAG.getTarget().Options;
14701 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
14702 (N0->getFlags().hasAllowContract() &&
14703 N1->getFlags().hasAllowContract())) &&
14704 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
14705 return ISD::FMA;
14706 }
14707
14708 return 0;
14709}
14710
14711// For a reassociatable opcode perform:
14712// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
14713SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14714 SelectionDAG &DAG) const {
14715 EVT VT = N->getValueType(0);
14716 if (VT != MVT::i32 && VT != MVT::i64)
14717 return SDValue();
14718
14719 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
14720 return SDValue();
14721
14722 unsigned Opc = N->getOpcode();
14723 SDValue Op0 = N->getOperand(0);
14724 SDValue Op1 = N->getOperand(1);
14725
14726 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14727 return SDValue();
14728
14729 if (Op0->isDivergent())
14730 std::swap(Op0, Op1);
14731
14732 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14733 return SDValue();
14734
14735 SDValue Op2 = Op1.getOperand(1);
14736 Op1 = Op1.getOperand(0);
14737 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14738 return SDValue();
14739
14740 if (Op1->isDivergent())
14741 std::swap(Op1, Op2);
14742
14743 SDLoc SL(N);
14744 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
14745 return DAG.getNode(Opc, SL, VT, Add1, Op2);
14746}
14747
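// Helper: build an AMDGPUISD::MAD_I64_I32 / MAD_U64_U32 node computing
// N0 * N1 + N2 in 64 bits (the i1 carry result is unused) and truncate the
// result to VT.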
14748static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14749 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14750 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14751 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
14752 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
14753 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
14754}
14755
14756// Fold
14757// y = lshr i64 x, 32
14758// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
14759// with Const.hi == -1
14760// To
14761 // res = mad_u64_u32 y.lo, Const.lo, x.lo
14762 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14763 SDValue MulLHS, SDValue MulRHS,
14764 SDValue AddRHS) {
14765 if (MulRHS.getOpcode() == ISD::SRL)
14766 std::swap(MulLHS, MulRHS);
14767
14768 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14769 return SDValue();
14770
14771 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
14772 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14773 MulLHS.getOperand(0) != AddRHS)
14774 return SDValue();
14775
14776 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
14777 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
14778 return SDValue();
14779
14780 SDValue ConstMul =
14781 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
14782 return getMad64_32(DAG, SL, MVT::i64,
14783 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
14784 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
14785}
14786
14787// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14788// multiplies, if any.
14789//
14790// Full 64-bit multiplies that feed into an addition are lowered here instead
14791// of using the generic expansion. The generic expansion ends up with
14792// a tree of ADD nodes that prevents us from using the "add" part of the
14793// MAD instruction. The expansion produced here results in a chain of ADDs
14794// instead of a tree.
14795SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14796 DAGCombinerInfo &DCI) const {
14797 assert(N->isAnyAdd());
14798
14799 SelectionDAG &DAG = DCI.DAG;
14800 EVT VT = N->getValueType(0);
14801 SDLoc SL(N);
14802 SDValue LHS = N->getOperand(0);
14803 SDValue RHS = N->getOperand(1);
14804
14805 if (VT.isVector())
14806 return SDValue();
14807
14808 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14809 // result in scalar registers for uniform values.
14810 if (!N->isDivergent() && Subtarget->hasSMulHi())
14811 return SDValue();
14812
14813 unsigned NumBits = VT.getScalarSizeInBits();
14814 if (NumBits <= 32 || NumBits > 64)
14815 return SDValue();
14816
14817 if (LHS.getOpcode() != ISD::MUL) {
14818 assert(RHS.getOpcode() == ISD::MUL);
14819 std::swap(LHS, RHS);
14820 }
14821
14822 // Avoid the fold if it would unduly increase the number of multiplies due to
14823 // multiple uses, except on hardware with full-rate multiply-add (which is
14824 // part of full-rate 64-bit ops).
14825 if (!Subtarget->hasFullRate64Ops()) {
14826 unsigned NumUsers = 0;
14827 for (SDNode *User : LHS->users()) {
14828 // There is a use that does not feed into addition, so the multiply can't
14829 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14830 if (!User->isAnyAdd())
14831 return SDValue();
14832
14833 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14834 // MUL + 3xADD + 3xADDC over 3xMAD.
14835 ++NumUsers;
14836 if (NumUsers >= 3)
14837 return SDValue();
14838 }
14839 }
14840
14841 SDValue MulLHS = LHS.getOperand(0);
14842 SDValue MulRHS = LHS.getOperand(1);
14843 SDValue AddRHS = RHS;
14844
14845 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14846 return FoldedMAD;
14847
14848 // Always check whether operands are small unsigned values, since that
14849 // knowledge is useful in more cases. Check for small signed values only if
14850 // doing so can unlock a shorter code sequence.
14851 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14852 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14853
14854 bool MulSignedLo = false;
14855 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14856 MulSignedLo =
14857 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14858 }
14859
14860 // The operands and final result all have the same number of bits. If
14861 // operands need to be extended, they can be extended with garbage. The
14862 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14863 // truncated away in the end.
14864 if (VT != MVT::i64) {
14865 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14866 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14867 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14868 }
14869
14870 // The basic code generated is conceptually straightforward. Pseudo code:
14871 //
14872 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14873 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14874 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14875 //
14876 // The second and third lines are optional, depending on whether the factors
14877 // are {sign,zero}-extended or not.
14878 //
14879 // The actual DAG is noisier than the pseudo code, but only due to
14880 // instructions that disassemble values into low and high parts, and
14881 // assemble the final result.
14882 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14883
14884 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14885 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14886 SDValue Accum =
14887 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14888
14889 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14890 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14891
14892 if (!MulLHSUnsigned32) {
14893 auto MulLHSHi =
14894 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14895 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14896 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14897 }
14898
14899 if (!MulRHSUnsigned32) {
14900 auto MulRHSHi =
14901 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14902 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14903 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14904 }
14905
14906 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14907 Accum = DAG.getBitcast(MVT::i64, Accum);
14908 }
14909
14910 if (VT != MVT::i64)
14911 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14912 return Accum;
14913}
14914
14915SDValue
14916SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14917 DAGCombinerInfo &DCI) const {
14918 SDValue RHS = N->getOperand(1);
14919 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14920 if (!CRHS)
14921 return SDValue();
14922
14923 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14924 // common.
14925 uint64_t Val = CRHS->getZExtValue();
14926 if (countr_zero(Val) >= 32) {
14927 SelectionDAG &DAG = DCI.DAG;
14928 SDLoc SL(N);
14929 SDValue LHS = N->getOperand(0);
14930
14931 // Avoid carry machinery if we know the low half of the add does not
14932 // contribute to the final result.
14933 //
14934 // add i64:x, K if computeTrailingZeros(K) >= 32
14935 // => build_pair (add x.hi, K.hi), x.lo
14936
14937 // Breaking the 64-bit add here with this strange constant is unlikely
14938 // to interfere with addressing mode patterns.
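// A worked instance of the rewrite above (constant chosen for illustration):
//   add i64 %x, 0x500000000   ; the low 32 bits of the constant are zero
//   => build_pair (add x.hi, 5), x.lo
// so only a single 32-bit add of the high halves survives.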
14939
14940 SDValue Hi = getHiHalf64(LHS, DAG);
14941 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14942 unsigned Opcode = N->getOpcode();
14943 if (Opcode == ISD::PTRADD)
14944 Opcode = ISD::ADD;
14945 SDValue AddHi =
14946 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14947
14948 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14949 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14950 }
14951
14952 return SDValue();
14953}
14954
14955 // Collect the ultimate src of each of the mul node's operands, and confirm
14956 // each operand carries no more than 8 bits of data.
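// For instance, an operand such as (zext i8 %b to i32) qualifies: byte 0
// traces back to %b while byte 1 is a known zero. An operand whose second
// byte can carry real data is rejected.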
14957static std::optional<ByteProvider<SDValue>>
14958handleMulOperand(const SDValue &MulOperand) {
14959 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14960 if (!Byte0 || Byte0->isConstantZero()) {
14961 return std::nullopt;
14962 }
14963 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14964 if (Byte1 && !Byte1->isConstantZero()) {
14965 return std::nullopt;
14966 }
14967 return Byte0;
14968}
14969
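// A worked example of the helper below (mask values picked for illustration):
// addPermMasks(0x0c0c0c00, 0x0c0c010c) == 0x0c0c0100. Each byte lane keeps
// whichever input supplies a real selector, and stays 0x0c (the v_perm
// "constant zero" selector) only where both inputs were 0x0c.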
14970static unsigned addPermMasks(unsigned First, unsigned Second) {
14971 unsigned FirstCs = First & 0x0c0c0c0c;
14972 unsigned SecondCs = Second & 0x0c0c0c0c;
14973 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14974 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14975
14976 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14977 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14978 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14979 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14980
14981 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14982}
14983
14984 struct DotSrc {
14985 SDValue SrcOp;
14986 int64_t PermMask;
14987 int64_t DWordOffset;
14988};
14989
14990 static void placeSources(ByteProvider<SDValue> &Src0,
14991 ByteProvider<SDValue> &Src1,
14992 SmallVectorImpl<DotSrc> &Src0s,
14993 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14994
14995 assert(Src0.Src.has_value() && Src1.Src.has_value());
14996 // Src0s and Src1s are empty, just place arbitrarily.
14997 if (Step == 0) {
14998 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14999 Src0.SrcOffset / 4});
15000 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15001 Src1.SrcOffset / 4});
15002 return;
15003 }
15004
15005 for (int BPI = 0; BPI < 2; BPI++) {
15006 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15007 if (BPI == 1) {
15008 BPP = {Src1, Src0};
15009 }
15010 unsigned ZeroMask = 0x0c0c0c0c;
15011 unsigned FMask = 0xFF << (8 * (3 - Step));
15012
15013 unsigned FirstMask =
15014 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15015 unsigned SecondMask =
15016 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15017 // Attempt to find a Src vector which contains our SDValue; if so, add our
15018 // perm mask to the existing one. If we are unable to find a match for the
15019 // first SDValue, attempt to find a match for the second.
15020 int FirstGroup = -1;
15021 for (int I = 0; I < 2; I++) {
15022 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15023 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15024 return IterElt.SrcOp == *BPP.first.Src &&
15025 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15026 };
15027
15028 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15029 if (Match != Srcs.end()) {
15030 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15031 FirstGroup = I;
15032 break;
15033 }
15034 }
15035 if (FirstGroup != -1) {
15036 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15037 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15038 return IterElt.SrcOp == *BPP.second.Src &&
15039 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15040 };
15041 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15042 if (Match != Srcs.end()) {
15043 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15044 } else
15045 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15046 return;
15047 }
15048 }
15049
15050 // If we have made it here, then we could not find a match in Src0s or Src1s
15051 // for either Src0 or Src1, so just place them arbitrarily.
15052
15053 unsigned ZeroMask = 0x0c0c0c0c;
15054 unsigned FMask = 0xFF << (8 * (3 - Step));
15055
15056 Src0s.push_back(
15057 {*Src0.Src,
15058 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15059 Src0.SrcOffset / 4});
15060 Src1s.push_back(
15061 {*Src1.Src,
15062 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15063 Src1.SrcOffset / 4});
15064}
15065
15066 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15067 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15068 bool IsAny) {
15069
15070 // If we have only one source, just permute it accordingly.
15071 if (Srcs.size() == 1) {
15072 auto *Elt = Srcs.begin();
15073 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15074
15075 // v_perm will produce the original value
15076 if (Elt->PermMask == 0x3020100)
15077 return EltOp;
15078
15079 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15080 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15081 }
15082
15083 auto *FirstElt = Srcs.begin();
15084 auto *SecondElt = std::next(FirstElt);
15085
15086 SmallVector<SDValue, 2> Perms;
15087
15088 // If we have multiple sources in the chain, combine them via perms (using
15089 // calculated perm mask) and Ors.
15090 while (true) {
15091 auto FirstMask = FirstElt->PermMask;
15092 auto SecondMask = SecondElt->PermMask;
15093
15094 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15095 unsigned FirstPlusFour = FirstMask | 0x04040404;
15096 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15097 // original 0x0C.
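// For example, a selector byte 0x02 becomes 0x06, i.e. it now addresses the
// same byte of the other v_perm source operand, while a 0x0c zero-selector
// round-trips: 0x0c | 0x04 = 0x10, masking with 0x0F gives 0x00, and or-ing
// FirstCs back in restores 0x0c.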
15098 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15099
15100 auto PermMask = addPermMasks(FirstMask, SecondMask);
15101 auto FirstVal =
15102 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15103 auto SecondVal =
15104 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15105
15106 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15107 SecondVal,
15108 DAG.getConstant(PermMask, SL, MVT::i32)));
15109
15110 FirstElt = std::next(SecondElt);
15111 if (FirstElt == Srcs.end())
15112 break;
15113
15114 SecondElt = std::next(FirstElt);
15115 // If we only have a FirstElt, then just combine that into the cumulative
15116 // source node.
15117 if (SecondElt == Srcs.end()) {
15118 auto EltOp =
15119 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15120
15121 Perms.push_back(
15122 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15123 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15124 break;
15125 }
15126 }
15127
15128 assert(Perms.size() == 1 || Perms.size() == 2);
15129 return Perms.size() == 2
15130 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15131 : Perms[0];
15132}
15133
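// Worked trace of fixMasks below (values picked for illustration): with
// ChainLength == 2, a mask of 0x01000c0c (selectors 0x01 and 0x00 in the two
// high byte lanes) is shifted right to 0x00000100 and then topped up with
// 0x0c0c0000, giving 0x0c0c0100, so the two unused lanes read as zero.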
15134static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15135 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15136 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15137 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15138 EntryMask += ZeroMask;
15139 }
15140}
15141
15142static bool isMul(const SDValue Op) {
15143 auto Opcode = Op.getOpcode();
15144
15145 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15146 Opcode == AMDGPUISD::MUL_I24);
15147}
15148
15149static std::optional<bool>
15150 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15151 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15152 const SDValue &S1Op, const SelectionDAG &DAG) {
15153 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15154 // of the dot4 is irrelevant.
15155 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15156 return false;
15157
15158 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15159 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15160 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15161 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15162 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15163 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15164
15165 assert(!(S0IsUnsigned && S0IsSigned));
15166 assert(!(S1IsUnsigned && S1IsSigned));
15167
15168 // There are 9 possible permutations of
15169 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15170
15171 // In two permutations, the sign bits are known to be the same for both Ops,
15172 // so simply return Signed / Unsigned corresponding to the MSB
15173
15174 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15175 return S0IsSigned;
15176
15177 // In another two permutations, the sign bits are known to be opposite. In
15178 // this case return std::nullopt to indicate a bad match.
15179
15180 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15181 return std::nullopt;
15182
15183 // In the remaining five permutations, we don't know the value of the sign
15184 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15185 // the upper bits must be extension bits. Thus, the only ways for the sign
15186 // bit to be unknown are if it was sign extended from an unknown value, or if
15187 // it was any extended. In either case, it is correct to use the signed
15188 // version of dot4's signedness semantics.
15189
15190 // In two such permutations, we know the sign bit is set for
15191 // one op, and the other is unknown. It is okay to use the signed version of
15192 // dot4.
15193 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15194 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15195 return true;
15196
15197 // In one such permutation, we don't know either of the sign bits. It is okay
15198 // to use the signed version of dot4.
15199 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15200 return true;
15201
15202 // In two such permutations, we know the sign bit is unset for
15203 // one op, and the other is unknown. Return std::nullopt to indicate a
15204 // bad match.
15205 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15206 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15207 return std::nullopt;
15208
15209 llvm_unreachable("Fully covered condition");
15210}
15211
15212SDValue SITargetLowering::performAddCombine(SDNode *N,
15213 DAGCombinerInfo &DCI) const {
15214 SelectionDAG &DAG = DCI.DAG;
15215 EVT VT = N->getValueType(0);
15216 SDLoc SL(N);
15217 SDValue LHS = N->getOperand(0);
15218 SDValue RHS = N->getOperand(1);
15219
15220 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15221 if (Subtarget->hasMad64_32()) {
15222 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15223 return Folded;
15224 }
15225 }
15226
15227 if (SDValue V = reassociateScalarOps(N, DAG)) {
15228 return V;
15229 }
15230
15231 if (VT == MVT::i64) {
15232 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15233 return Folded;
15234 }
15235
15236 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15237 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15238 SDValue TempNode(N, 0);
15238 std::optional<bool> IsSigned;
15239 SmallVector<DotSrc, 4> Src0s;
15240 SmallVector<DotSrc, 4> Src1s;
15241 SmallVector<SDValue, 4> Src2s;
15243
15244 // Match the v_dot4 tree, while collecting src nodes.
15245 int ChainLength = 0;
15246 for (int I = 0; I < 4; I++) {
15247 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15248 if (MulIdx == -1)
15249 break;
15250 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15251 if (!Src0)
15252 break;
15253 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15254 if (!Src1)
15255 break;
15256
15257 auto IterIsSigned = checkDot4MulSignedness(
15258 TempNode->getOperand(MulIdx), *Src0, *Src1,
15259 TempNode->getOperand(MulIdx)->getOperand(0),
15260 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15261 if (!IterIsSigned)
15262 break;
15263 if (!IsSigned)
15264 IsSigned = *IterIsSigned;
15265 if (*IterIsSigned != *IsSigned)
15266 break;
15267 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15268 auto AddIdx = 1 - MulIdx;
15269 // Allow the special case where add (add (mul24, 0), mul24) was folded into
15270 // add (mul24, mul24).
15271 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15272 Src2s.push_back(TempNode->getOperand(AddIdx));
15273 auto Src0 =
15274 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15275 if (!Src0)
15276 break;
15277 auto Src1 =
15278 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15279 if (!Src1)
15280 break;
15281 auto IterIsSigned = checkDot4MulSignedness(
15282 TempNode->getOperand(AddIdx), *Src0, *Src1,
15283 TempNode->getOperand(AddIdx)->getOperand(0),
15284 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15285 if (!IterIsSigned)
15286 break;
15287 assert(IsSigned);
15288 if (*IterIsSigned != *IsSigned)
15289 break;
15290 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15291 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15292 ChainLength = I + 2;
15293 break;
15294 }
15295
15296 TempNode = TempNode->getOperand(AddIdx);
15297 Src2s.push_back(TempNode);
15298 ChainLength = I + 1;
15299 if (TempNode->getNumOperands() < 2)
15300 break;
15301 LHS = TempNode->getOperand(0);
15302 RHS = TempNode->getOperand(1);
15303 }
15304
15305 if (ChainLength < 2)
15306 return SDValue();
15307
15308 // Masks were constructed with the assumption that we would find a chain of
15309 // length 4. If not, then we need to zero out the unused high bytes (via a
15310 // perm mask of 0x0c) so they do not affect the dot calculation.
15311 if (ChainLength < 4) {
15312 fixMasks(Src0s, ChainLength);
15313 fixMasks(Src1s, ChainLength);
15314 }
15315
15316 SDValue Src0, Src1;
15317
15318 // If we are just using a single source for both, and have permuted the
15319 // bytes consistently, we can just use the sources without permuting
15320 // (commutation).
15321 bool UseOriginalSrc = false;
15322 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15323 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15324 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15325 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15326 SmallVector<unsigned, 4> SrcBytes;
15327 auto Src0Mask = Src0s.begin()->PermMask;
15328 SrcBytes.push_back(Src0Mask & 0xFF000000);
15329 bool UniqueEntries = true;
15330 for (auto I = 1; I < 4; I++) {
15331 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15332
15333 if (is_contained(SrcBytes, NextByte)) {
15334 UniqueEntries = false;
15335 break;
15336 }
15337 SrcBytes.push_back(NextByte);
15338 }
15339
15340 if (UniqueEntries) {
15341 UseOriginalSrc = true;
15342
15343 auto *FirstElt = Src0s.begin();
15344 auto FirstEltOp =
15345 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15346
15347 auto *SecondElt = Src1s.begin();
15348 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15349 SecondElt->DWordOffset);
15350
15351 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15352 MVT::getIntegerVT(32));
15353 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15354 MVT::getIntegerVT(32));
15355 }
15356 }
15357
15358 if (!UseOriginalSrc) {
15359 Src0 = resolveSources(DAG, SL, Src0s, false, true);
15360 Src1 = resolveSources(DAG, SL, Src1s, false, true);
15361 }
15362
15363 assert(IsSigned);
15364 SDValue Src2 =
15365 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15366
15367 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15368 : Intrinsic::amdgcn_udot4,
15369 SL, MVT::i64);
15370
15371 assert(!VT.isVector());
15372 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15373 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15374
15375 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15376 }
15377
15378 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15379 return SDValue();
15380
15381 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15382 // add x, sext (setcc) => usubo_carry x, 0, setcc
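// (A set i1 zero-extends to +1, i.e. a carry-in of 1, while it sign-extends
// to -1, i.e. a borrow of 1; that is why the sext form maps onto usubo_carry.)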
15383 unsigned Opc = LHS.getOpcode();
15384 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15385 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15386 std::swap(RHS, LHS);
15387
15388 Opc = RHS.getOpcode();
15389 switch (Opc) {
15390 default:
15391 break;
15392 case ISD::ZERO_EXTEND:
15393 case ISD::SIGN_EXTEND:
15394 case ISD::ANY_EXTEND: {
15395 auto Cond = RHS.getOperand(0);
15396 // If this won't be a real VOPC output, we would still need to insert an
15397 // extra instruction anyway.
15398 if (!isBoolSGPR(Cond))
15399 break;
15400 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15401 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15402 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15403 return DAG.getNode(Opc, SL, VTList, Args);
15404 }
15405 case ISD::UADDO_CARRY: {
15406 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15407 if (!isNullConstant(RHS.getOperand(1)))
15408 break;
15409 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15410 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15411 }
15412 }
15413 return SDValue();
15414}
15415
15416SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15417 DAGCombinerInfo &DCI) const {
15418 SelectionDAG &DAG = DCI.DAG;
15419 SDLoc DL(N);
15420 EVT VT = N->getValueType(0);
15421 SDValue N0 = N->getOperand(0);
15422 SDValue N1 = N->getOperand(1);
15423
15424 // The following folds transform PTRADDs into regular arithmetic in cases
15425 // where the PTRADD wouldn't be folded as an immediate offset into memory
15426 // instructions anyway. They are target-specific in that other targets might
15427 // prefer to not lose information about the pointer arithmetic.
15428
15429 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
15430 // Adapted from DAGCombiner::visitADDLikeCommutative.
15431 SDValue V, K;
15432 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
15433 SDNodeFlags ShlFlags = N1->getFlags();
15434 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
15435 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
15436 // preserved.
15437 SDNodeFlags NewShlFlags =
15438 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
15439 ? SDNodeFlags::NoSignedWrap
15440 : SDNodeFlags();
15441 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
15442 DCI.AddToWorklist(Inner.getNode());
15443 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
15444 }
15445
15446 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
15447 // performAddCombine.
15448 if (N1.getOpcode() == ISD::MUL) {
15449 if (Subtarget->hasMad64_32()) {
15450 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15451 return Folded;
15452 }
15453 }
15454
15455 // If the 32 low bits of the constant are all zero, there is nothing to fold
15456 // into an immediate offset, so it's better to eliminate the unnecessary
15457 // addition for the lower 32 bits than to preserve the PTRADD.
15458 // Analogous to a fold in performAddCombine.
15459 if (VT == MVT::i64) {
15460 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15461 return Folded;
15462 }
15463
15464 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15465 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with
15466 // global address GA and constant c, such that c can be folded into GA.
15467 SDValue GAValue = N0.getOperand(0);
15468 if (const GlobalAddressSDNode *GA =
15469 dyn_cast<GlobalAddressSDNode>(GAValue)) {
15470 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15471 // If both additions in the original were NUW, reassociation preserves
15472 // that.
15473 SDNodeFlags Flags =
15474 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15475 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15476 DCI.AddToWorklist(Inner.getNode());
15477 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15478 }
15479 }
15480 }
15481
15482 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15483 return SDValue();
15484
15485 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15486 // y is not, and (add y, z) is used only once.
15487 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15488 // z is not, and (add y, z) is used only once.
15489 // The goal is to move constant offsets to the outermost ptradd, to create
15490 // more opportunities to fold offsets into memory instructions.
15491 // Together with the generic combines in DAGCombiner.cpp, this also
15492 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
15493 //
15494 // This transform is here instead of in the general DAGCombiner as it can
15495 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15496 // AArch64's CPA.
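// For example (illustrative values): (ptradd %p, (add %idx, 16)) with a
// non-constant %idx becomes (ptradd (ptradd %p, %idx), 16), leaving the
// constant 16 outermost where it can later fold into an immediate offset.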
15497 SDValue X = N0;
15498 SDValue Y = N1.getOperand(0);
15499 SDValue Z = N1.getOperand(1);
15500 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15501 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15502
15503 // If both additions in the original were NUW, reassociation preserves that.
15504 SDNodeFlags ReassocFlags =
15505 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15506
15507 if (ZIsConstant != YIsConstant) {
15508 if (YIsConstant)
15509 std::swap(Y, Z);
15510 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15511 DCI.AddToWorklist(Inner.getNode());
15512 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15513 }
15514
15515 // If one of Y and Z is constant, they have been handled above. If both were
15516 // constant, the addition would have been folded in SelectionDAG::getNode
15517 // already. This ensures that the generic DAG combines won't undo the
15518 // following reassociation.
15519 assert(!YIsConstant && !ZIsConstant);
15520
15521 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15522 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15523 // y are uniform and z isn't.
15524 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15525 // z are uniform and y isn't.
15526 // The goal is to push uniform operands up in the computation, so that they
15527 // can be handled with scalar operations. We can't use reassociateScalarOps
15528 // for this since it requires two identical commutative operations to
15529 // reassociate.
15530 if (Y->isDivergent())
15531 std::swap(Y, Z);
15532 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15533 DCI.AddToWorklist(UniformInner.getNode());
15534 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15535 }
15536
15537 return SDValue();
15538}
15539
15540SDValue SITargetLowering::performSubCombine(SDNode *N,
15541 DAGCombinerInfo &DCI) const {
15542 SelectionDAG &DAG = DCI.DAG;
15543 EVT VT = N->getValueType(0);
15544
15545 if (VT == MVT::i64) {
15546 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15547 return Folded;
15548 }
15549
15550 if (VT != MVT::i32)
15551 return SDValue();
15552
15553 SDLoc SL(N);
15554 SDValue LHS = N->getOperand(0);
15555 SDValue RHS = N->getOperand(1);
15556
15557 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15558 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15559 unsigned Opc = RHS.getOpcode();
15560 switch (Opc) {
15561 default:
15562 break;
15563 case ISD::ZERO_EXTEND:
15564 case ISD::SIGN_EXTEND:
15565 case ISD::ANY_EXTEND: {
15566 auto Cond = RHS.getOperand(0);
15567 // If this won't be a real VOPC output, we would still need to insert an
15568 // extra instruction anyway.
15569 if (!isBoolSGPR(Cond))
15570 break;
15571 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15572 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15573 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15574 return DAG.getNode(Opc, SL, VTList, Args);
15575 }
15576 }
15577
15578 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15579 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15580 if (!isNullConstant(LHS.getOperand(1)))
15581 return SDValue();
15582 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15583 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15584 }
15585 return SDValue();
15586}
15587
15588SDValue
15589SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15590 DAGCombinerInfo &DCI) const {
15591
15592 if (N->getValueType(0) != MVT::i32)
15593 return SDValue();
15594
15595 if (!isNullConstant(N->getOperand(1)))
15596 return SDValue();
15597
15598 SelectionDAG &DAG = DCI.DAG;
15599 SDValue LHS = N->getOperand(0);
15600
15601 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15602 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15603 unsigned LHSOpc = LHS.getOpcode();
15604 unsigned Opc = N->getOpcode();
15605 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15606 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15607 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15608 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15609 }
15610 return SDValue();
15611}
15612
15613SDValue SITargetLowering::performFAddCombine(SDNode *N,
15614 DAGCombinerInfo &DCI) const {
15615 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15616 return SDValue();
15617
15618 SelectionDAG &DAG = DCI.DAG;
15619 EVT VT = N->getValueType(0);
15620
15621 SDLoc SL(N);
15622 SDValue LHS = N->getOperand(0);
15623 SDValue RHS = N->getOperand(1);
15624
15625 // These should really be instruction patterns, but writing patterns with
15626 // source modifiers is a pain.
15627
15628 // fadd (fadd (a, a), b) -> mad 2.0, a, b
15629 if (LHS.getOpcode() == ISD::FADD) {
15630 SDValue A = LHS.getOperand(0);
15631 if (A == LHS.getOperand(1)) {
15632 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15633 if (FusedOp != 0) {
15634 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15635 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15636 }
15637 }
15638 }
15639
15640 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15641 if (RHS.getOpcode() == ISD::FADD) {
15642 SDValue A = RHS.getOperand(0);
15643 if (A == RHS.getOperand(1)) {
15644 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15645 if (FusedOp != 0) {
15646 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15647 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15648 }
15649 }
15650 }
15651
15652 return SDValue();
15653}
15654
15655SDValue SITargetLowering::performFSubCombine(SDNode *N,
15656 DAGCombinerInfo &DCI) const {
15657 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15658 return SDValue();
15659
15660 SelectionDAG &DAG = DCI.DAG;
15661 SDLoc SL(N);
15662 EVT VT = N->getValueType(0);
15663 assert(!VT.isVector());
15664
15665 // Try to get the fneg to fold into the source modifier. This undoes generic
15666 // DAG combines and folds them into the mad.
15667 //
15668 // Only do this if we are not trying to support denormals. v_mad_f32 does
15669 // not support denormals ever.
15670 SDValue LHS = N->getOperand(0);
15671 SDValue RHS = N->getOperand(1);
15672 if (LHS.getOpcode() == ISD::FADD) {
15673 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15674 SDValue A = LHS.getOperand(0);
15675 if (A == LHS.getOperand(1)) {
15676 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15677 if (FusedOp != 0) {
15678 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15679 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
15680
15681 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15682 }
15683 }
15684 }
15685
15686 if (RHS.getOpcode() == ISD::FADD) {
15687 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15688
15689 SDValue A = RHS.getOperand(0);
15690 if (A == RHS.getOperand(1)) {
15691 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15692 if (FusedOp != 0) {
15693 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
15694 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15695 }
15696 }
15697 }
15698
15699 return SDValue();
15700}
15701
15702SDValue SITargetLowering::performFDivCombine(SDNode *N,
15703 DAGCombinerInfo &DCI) const {
15704 SelectionDAG &DAG = DCI.DAG;
15705 SDLoc SL(N);
15706 EVT VT = N->getValueType(0);
15707 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
15708 return SDValue();
15709
15710 SDValue LHS = N->getOperand(0);
15711 SDValue RHS = N->getOperand(1);
15712
15713 SDNodeFlags Flags = N->getFlags();
15714 SDNodeFlags RHSFlags = RHS->getFlags();
15715 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15716 !RHS->hasOneUse())
15717 return SDValue();
15718
15719 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
15720 bool IsNegative = false;
15721 if (CLHS->isExactlyValue(1.0) ||
15722 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15723 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15724 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15725 if (RHS.getOpcode() == ISD::FSQRT) {
15726 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15727 SDValue Rsq =
15728 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
15729 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
15730 }
15731 }
15732 }
15733
15734 return SDValue();
15735}
15736
15737SDValue SITargetLowering::performFMulCombine(SDNode *N,
15738 DAGCombinerInfo &DCI) const {
15739 SelectionDAG &DAG = DCI.DAG;
15740 EVT VT = N->getValueType(0);
15741 EVT ScalarVT = VT.getScalarType();
15742 EVT IntVT = VT.changeElementType(MVT::i32);
15743
15744 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15745 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15746 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15747 return SDValue();
15748 }
15749
15750 SDValue LHS = N->getOperand(0);
15751 SDValue RHS = N->getOperand(1);
15752
15753 // It is cheaper to realize i32 inline constants than to materialize
15754 // f16 or f64 (or even non-inline f32) values; this is possible via
15755 // ldexp usage, as shown below:
15756 //
15757 // Given : A = 2^a & B = 2^b ; where a and b are integers.
15758 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
15759 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
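// A concrete instance (values picked for illustration), e.g. for f16:
//   fmul x, (select y, 8.0, 0.5)   -> ldexp(x, (select i32 y, 3, -1))
//   fmul x, (select y, -8.0, -0.5) -> ldexp((fneg x), (select i32 y, 3, -1))
// since 8.0 = 2^3 and 0.5 = 2^-1.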
15760 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15761 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15762 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
15763 if (!TrueNode)
15764 return SDValue();
15765 const ConstantFPSDNode *FalseNode =
15766 isConstOrConstSplatFP(RHS.getOperand(2));
15767 if (!FalseNode)
15768 return SDValue();
15769
15770 if (TrueNode->isNegative() != FalseNode->isNegative())
15771 return SDValue();
15772
15773 // For f32, only non-inline constants should be transformed.
15774 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15775 if (ScalarVT == MVT::f32 &&
15776 TII->isInlineConstant(TrueNode->getValueAPF()) &&
15777 TII->isInlineConstant(FalseNode->getValueAPF()))
15778 return SDValue();
15779
15780 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15781 if (TrueNodeExpVal == INT_MIN)
15782 return SDValue();
15783 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15784 if (FalseNodeExpVal == INT_MIN)
15785 return SDValue();
15786
15787 SDLoc SL(N);
15788 SDValue SelectNode =
15789 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
15790 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
15791 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
15792
15793 LHS = TrueNode->isNegative()
15794 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
15795 : LHS;
15796
15797 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
15798 }
15799
15800 return SDValue();
15801}
15802
15803SDValue SITargetLowering::performFMACombine(SDNode *N,
15804 DAGCombinerInfo &DCI) const {
15805 SelectionDAG &DAG = DCI.DAG;
15806 EVT VT = N->getValueType(0);
15807 SDLoc SL(N);
15808
15809 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15810 return SDValue();
15811
15812 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15813 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
15814 SDValue Op1 = N->getOperand(0);
15815 SDValue Op2 = N->getOperand(1);
15816 SDValue FMA = N->getOperand(2);
15817
15818 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15819 Op2.getOpcode() != ISD::FP_EXTEND)
15820 return SDValue();
15821
15822 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
15823 // regardless of the denorm mode setting. Therefore,
15824 // fp-contract is sufficient to allow generating fdot2.
15825 const TargetOptions &Options = DAG.getTarget().Options;
15826 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15827 (N->getFlags().hasAllowContract() &&
15828 FMA->getFlags().hasAllowContract())) {
15829 Op1 = Op1.getOperand(0);
15830 Op2 = Op2.getOperand(0);
15831 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15832 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15833 return SDValue();
15834
15835 SDValue Vec1 = Op1.getOperand(0);
15836 SDValue Idx1 = Op1.getOperand(1);
15837 SDValue Vec2 = Op2.getOperand(0);
15838
15839 SDValue FMAOp1 = FMA.getOperand(0);
15840 SDValue FMAOp2 = FMA.getOperand(1);
15841 SDValue FMAAcc = FMA.getOperand(2);
15842
15843 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15844 FMAOp2.getOpcode() != ISD::FP_EXTEND)
15845 return SDValue();
15846
15847 FMAOp1 = FMAOp1.getOperand(0);
15848 FMAOp2 = FMAOp2.getOperand(0);
15849 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15850 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15851 return SDValue();
15852
15853 SDValue Vec3 = FMAOp1.getOperand(0);
15854 SDValue Vec4 = FMAOp2.getOperand(0);
15855 SDValue Idx2 = FMAOp1.getOperand(1);
15856
15857 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
15858 // Idx1 and Idx2 cannot be the same.
15859 Idx1 == Idx2)
15860 return SDValue();
15861
15862 if (Vec1 == Vec2 || Vec3 == Vec4)
15863 return SDValue();
15864
15865 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15866 return SDValue();
15867
15868 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15869 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
15870 DAG.getTargetConstant(0, SL, MVT::i1));
15871 }
15872 }
15873 return SDValue();
15874}
15875
15876SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15877 DAGCombinerInfo &DCI) const {
15878 SelectionDAG &DAG = DCI.DAG;
15879 SDLoc SL(N);
15880
15881 SDValue LHS = N->getOperand(0);
15882 SDValue RHS = N->getOperand(1);
15883 EVT VT = LHS.getValueType();
15884 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15885
15886 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15887 if (!CRHS) {
15888 CRHS = dyn_cast<ConstantSDNode>(LHS);
15889 if (CRHS) {
15890 std::swap(LHS, RHS);
15891 CC = getSetCCSwappedOperands(CC);
15892 }
15893 }
15894
15895 if (CRHS) {
15896 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15897 isBoolSGPR(LHS.getOperand(0))) {
15898 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15899 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15900 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
15901 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
15902 if ((CRHS->isAllOnes() &&
15903 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
15904 (CRHS->isZero() &&
15905 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
15906 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15907 DAG.getAllOnesConstant(SL, MVT::i1));
15908 if ((CRHS->isAllOnes() &&
15909 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
15910 (CRHS->isZero() &&
15911 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
15912 return LHS.getOperand(0);
15913 }
15914
15915 const APInt &CRHSVal = CRHS->getAPIntValue();
15916 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15917 LHS.getOpcode() == ISD::SELECT &&
15918 isa<ConstantSDNode>(LHS.getOperand(1)) &&
15919 isa<ConstantSDNode>(LHS.getOperand(2)) &&
15920 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15921 isBoolSGPR(LHS.getOperand(0))) {
15922 // Given CT != FT:
15923 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15924 // setcc (select cc, CT, CF), CF, ne => cc
15925 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15926 // setcc (select cc, CT, CF), CT, eq => cc
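// For instance, with CT = 7 and CF = 12, (setcc (select cc, 7, 12), 12, eq)
// is true exactly when cc is false, so it folds to xor cc, -1.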
15927 const APInt &CT = LHS.getConstantOperandAPInt(1);
15928 const APInt &CF = LHS.getConstantOperandAPInt(2);
15929
15930 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15931 (CT == CRHSVal && CC == ISD::SETNE))
15932 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15933 DAG.getAllOnesConstant(SL, MVT::i1));
15934 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15935 (CT == CRHSVal && CC == ISD::SETEQ))
15936 return LHS.getOperand(0);
15937 }
15938 }
15939
15940 if (VT != MVT::f32 && VT != MVT::f64 &&
15941 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15942 return SDValue();
15943
15944 // Match isinf/isfinite pattern
15945 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15946 // (fcmp one (fabs x), inf) -> (fp_class x,
15947 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15948 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15949 LHS.getOpcode() == ISD::FABS) {
15950 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15951 if (!CRHS)
15952 return SDValue();
15953
15954 const APFloat &APF = CRHS->getValueAPF();
15955 if (APF.isInfinity() && !APF.isNegative()) {
15956 const unsigned IsInfMask =
15957 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15958 const unsigned IsFiniteMask =
15959 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15960 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15961 SIInstrFlags::P_SUBNORMAL;
15962 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15963 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15964 DAG.getConstant(Mask, SL, MVT::i32));
15965 }
15966 }
15967
15968 return SDValue();
15969}
15970
15971SDValue
15972SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15973 DAGCombinerInfo &DCI) const {
15974 SelectionDAG &DAG = DCI.DAG;
15975 SDLoc SL(N);
15976 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15977
15978 SDValue Src = N->getOperand(0);
15979 SDValue Shift = N->getOperand(0);
15980
15981 // TODO: Extend type shouldn't matter (assuming legal types).
15982 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15983 Shift = Shift.getOperand(0);
15984
15985 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15986 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15987 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15988 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15989 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15990 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
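// E.g. cvt_f32_ubyte1 of (srl x, 16): the opcode contributes a byte offset of
// 8 and the right shift adds 16 more, giving 24, i.e. byte 3 of x; hence
// cvt_f32_ubyte3 x.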
15991 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15992 SDValue Shifted = DAG.getZExtOrTrunc(
15993 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
15994
15995 unsigned ShiftOffset = 8 * Offset;
15996 if (Shift.getOpcode() == ISD::SHL)
15997 ShiftOffset -= C->getZExtValue();
15998 else
15999 ShiftOffset += C->getZExtValue();
16000
16001 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16002 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16003 MVT::f32, Shifted);
16004 }
16005 }
16006 }
16007
16008 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16009 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16010 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16011 // We simplified Src. If this node is not dead, visit it again so it is
16012 // folded properly.
16013 if (N->getOpcode() != ISD::DELETED_NODE)
16014 DCI.AddToWorklist(N);
16015 return SDValue(N, 0);
16016 }
16017
16018 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16019 if (SDValue DemandedSrc =
16020 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16021 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16022
16023 return SDValue();
16024}
16025
16026SDValue SITargetLowering::performClampCombine(SDNode *N,
16027 DAGCombinerInfo &DCI) const {
16028 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16029 if (!CSrc)
16030 return SDValue();
16031
16032 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16033 const APFloat &F = CSrc->getValueAPF();
16034 APFloat Zero = APFloat::getZero(F.getSemantics());
16035 if (F < Zero ||
16036 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16037 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16038 }
16039
16040 APFloat One(F.getSemantics(), "1.0");
16041 if (F > One)
16042 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16043
16044 return SDValue(CSrc, 0);
16045}
16046
16047SDValue SITargetLowering::performSelectCombine(SDNode *N,
16048 DAGCombinerInfo &DCI) const {
16049
16050 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16051 // integer).
16052 // Detect when CMP and SELECT use the same constant and fold them to avoid
16053 // loading the constant twice. Specifically handles patterns like:
16054 // %cmp = icmp eq i32 %val, 4242
16055 // %sel = select i1 %cmp, i32 4242, i32 %other
16056 // It can be optimized to reuse %val instead of 4242 in select.
16057 SDValue Cond = N->getOperand(0);
16058 SDValue TrueVal = N->getOperand(1);
16059 SDValue FalseVal = N->getOperand(2);
16060
16061 // Check if condition is a comparison.
16062 if (Cond.getOpcode() != ISD::SETCC)
16063 return SDValue();
16064
16065 SDValue LHS = Cond.getOperand(0);
16066 SDValue RHS = Cond.getOperand(1);
16067 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16068
16069 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16070 bool isInteger = LHS.getValueType().isInteger();
16071
16072 // Handle simple floating-point and integer types only.
16073 if (!isFloatingPoint && !isInteger)
16074 return SDValue();
16075
16076 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16077 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16078 if (!isEquality && !isNonEquality)
16079 return SDValue();
16080
16081 SDValue ArgVal, ConstVal;
16082 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16083 (isInteger && isa<ConstantSDNode>(RHS))) {
16084 ConstVal = RHS;
16085 ArgVal = LHS;
16086 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16087 (isInteger && isa<ConstantSDNode>(LHS))) {
16088 ConstVal = LHS;
16089 ArgVal = RHS;
16090 } else {
16091 return SDValue();
16092 }
16093
16094 // Skip optimization for inlinable immediates.
16095 if (isFloatingPoint) {
16096 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16097 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16098 return SDValue();
16099 } else {
16100 if (AMDGPU::isInlinableIntLiteral(
16101 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16102 return SDValue();
16103 }
16104
16105 // For equality and non-equality comparisons, patterns:
16106 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16107 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16108 if (!(isEquality && TrueVal == ConstVal) &&
16109 !(isNonEquality && FalseVal == ConstVal))
16110 return SDValue();
16111
16112 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16113 SDValue SelectRHS =
16114 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16115 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16116 SelectLHS, SelectRHS);
16117}
16118
16119 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16120 DAGCombinerInfo &DCI) const {
16121 switch (N->getOpcode()) {
16122 case ISD::ADD:
16123 case ISD::SUB:
16124 case ISD::SHL:
16125 case ISD::SRL:
16126 case ISD::SRA:
16127 case ISD::AND:
16128 case ISD::OR:
16129 case ISD::XOR:
16130 case ISD::MUL:
16131 case ISD::SETCC:
16132 case ISD::SELECT:
16133 case ISD::SMIN:
16134 case ISD::SMAX:
16135 case ISD::UMIN:
16136 case ISD::UMAX:
16137 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16138 return Res;
16139 break;
16140 default:
16141 break;
16142 }
16143
16144 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16145 return SDValue();
16146
16147 switch (N->getOpcode()) {
16148 case ISD::ADD:
16149 return performAddCombine(N, DCI);
16150 case ISD::PTRADD:
16151 return performPtrAddCombine(N, DCI);
16152 case ISD::SUB:
16153 return performSubCombine(N, DCI);
16154 case ISD::UADDO_CARRY:
16155 case ISD::USUBO_CARRY:
16156 return performAddCarrySubCarryCombine(N, DCI);
16157 case ISD::FADD:
16158 return performFAddCombine(N, DCI);
16159 case ISD::FSUB:
16160 return performFSubCombine(N, DCI);
16161 case ISD::FDIV:
16162 return performFDivCombine(N, DCI);
16163 case ISD::FMUL:
16164 return performFMulCombine(N, DCI);
16165 case ISD::SETCC:
16166 return performSetCCCombine(N, DCI);
16167 case ISD::SELECT:
16168 if (auto Res = performSelectCombine(N, DCI))
16169 return Res;
16170 break;
16171 case ISD::FMAXNUM:
16172 case ISD::FMINNUM:
16173 case ISD::FMAXNUM_IEEE:
16174 case ISD::FMINNUM_IEEE:
16175 case ISD::FMAXIMUM:
16176 case ISD::FMINIMUM:
16177 case ISD::FMAXIMUMNUM:
16178 case ISD::FMINIMUMNUM:
16179 case ISD::SMAX:
16180 case ISD::SMIN:
16181 case ISD::UMAX:
16182 case ISD::UMIN:
16185 return performMinMaxCombine(N, DCI);
16186 case ISD::FMA:
16187 return performFMACombine(N, DCI);
16188 case ISD::AND:
16189 return performAndCombine(N, DCI);
16190 case ISD::OR:
16191 return performOrCombine(N, DCI);
16192 case ISD::FSHR: {
16193 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16194 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16195 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16196 return matchPERM(N, DCI);
16197 }
16198 break;
16199 }
16200 case ISD::XOR:
16201 return performXorCombine(N, DCI);
16202 case ISD::ZERO_EXTEND:
16203 return performZeroExtendCombine(N, DCI);
16204 case ISD::SIGN_EXTEND_INREG:
16205 return performSignExtendInRegCombine(N, DCI);
16206 case AMDGPUISD::FP_CLASS:
16207 return performClassCombine(N, DCI);
16208 case ISD::FCANONICALIZE:
16209 return performFCanonicalizeCombine(N, DCI);
16210 case AMDGPUISD::RCP:
16211 return performRcpCombine(N, DCI);
16212 case ISD::FLDEXP:
16213 case AMDGPUISD::FRACT:
16214 case AMDGPUISD::RSQ:
16217 case AMDGPUISD::RSQ_CLAMP: {
16218 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16219 SDValue Src = N->getOperand(0);
16220 if (Src.isUndef())
16221 return Src;
16222 break;
16223 }
16224 case ISD::SINT_TO_FP:
16225 case ISD::UINT_TO_FP:
16226 return performUCharToFloatCombine(N, DCI);
16227 case ISD::FCOPYSIGN:
16228 return performFCopySignCombine(N, DCI);
16229 case AMDGPUISD::CVT_F32_UBYTE0:
16230 case AMDGPUISD::CVT_F32_UBYTE1:
16231 case AMDGPUISD::CVT_F32_UBYTE2:
16232 case AMDGPUISD::CVT_F32_UBYTE3:
16233 return performCvtF32UByteNCombine(N, DCI);
16234 case AMDGPUISD::FMED3:
16235 return performFMed3Combine(N, DCI);
16236 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16237 return performCvtPkRTZCombine(N, DCI);
16238 case AMDGPUISD::CLAMP:
16239 return performClampCombine(N, DCI);
16240 case ISD::SCALAR_TO_VECTOR: {
16241 SelectionDAG &DAG = DCI.DAG;
16242 EVT VT = N->getValueType(0);
16243
16244 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16245 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16246 SDLoc SL(N);
16247 SDValue Src = N->getOperand(0);
16248 EVT EltVT = Src.getValueType();
16249 if (EltVT != MVT::i16)
16250 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16251
16252 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16253 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16254 }
16255
16256 break;
16257 }
16258 case ISD::EXTRACT_VECTOR_ELT:
16259 return performExtractVectorEltCombine(N, DCI);
16260 case ISD::INSERT_VECTOR_ELT:
16261 return performInsertVectorEltCombine(N, DCI);
16262 case ISD::FP_ROUND:
16263 return performFPRoundCombine(N, DCI);
16264 case ISD::LOAD: {
16265 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16266 return Widened;
16267 [[fallthrough]];
16268 }
16269 default: {
16270 if (!DCI.isBeforeLegalize()) {
16271 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16272 return performMemSDNodeCombine(MemNode, DCI);
16273 }
16274
16275 break;
16276 }
16277 }
16278
16279 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16280}
16281
16282/// Helper function for adjustWritemask
16283static unsigned SubIdx2Lane(unsigned Idx) {
16284 switch (Idx) {
16285 default:
16286 return ~0u;
16287 case AMDGPU::sub0:
16288 return 0;
16289 case AMDGPU::sub1:
16290 return 1;
16291 case AMDGPU::sub2:
16292 return 2;
16293 case AMDGPU::sub3:
16294 return 3;
16295 case AMDGPU::sub4:
16296 return 4; // Possible with TFE/LWE
16297 }
16298}
16299
16300/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16301SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16302 SelectionDAG &DAG) const {
16303 unsigned Opcode = Node->getMachineOpcode();
16304
16305 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16306 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16307 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16308 return Node; // not implemented for D16
16309
16310 SDNode *Users[5] = {nullptr};
16311 unsigned Lane = 0;
16312 unsigned DmaskIdx =
16313 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16314 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16315 unsigned NewDmask = 0;
16316 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16317 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16318 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16319 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16320 unsigned TFCLane = 0;
16321 bool HasChain = Node->getNumValues() > 1;
16322
16323 if (OldDmask == 0) {
16324 // These are folded out, but on the off chance it happens, don't assert.
16325 return Node;
16326 }
16327
16328 unsigned OldBitsSet = llvm::popcount(OldDmask);
16329 // Work out which is the TFE/LWE lane if that is enabled.
16330 if (UsesTFC) {
16331 TFCLane = OldBitsSet;
16332 }
16333
16334 // Try to figure out the used register components
16335 for (SDUse &Use : Node->uses()) {
16336
16337 // Don't look at users of the chain.
16338 if (Use.getResNo() != 0)
16339 continue;
16340
16341 SDNode *User = Use.getUser();
16342
16343 // Abort if we can't understand the usage
16344 if (!User->isMachineOpcode() ||
16345 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16346 return Node;
16347
16348 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16349 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16350 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16351 // set, etc.
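// Worked example (dmask value picked for illustration): with OldDmask =
// 0b1011 and extract_subreg users of only sub0 and sub2, lane 0 maps to
// component 0 and lane 2 maps to component 3, so NewDmask becomes 0b1001 and
// the two surviving users are retargeted to sub0 and sub1 of the narrower
// result.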
16352 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16353 if (Lane == ~0u)
16354 return Node;
16355
16356 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16357 if (UsesTFC && Lane == TFCLane) {
16358 Users[Lane] = User;
16359 } else {
16360 // Set which texture component corresponds to the lane.
16361 unsigned Comp;
16362 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16363 Comp = llvm::countr_zero(Dmask);
16364 Dmask &= ~(1 << Comp);
16365 }
16366
16367 // Abort if we have more than one user per component.
16368 if (Users[Lane])
16369 return Node;
16370
16371 Users[Lane] = User;
16372 NewDmask |= 1 << Comp;
16373 }
16374 }
16375
16376 // Don't allow 0 dmask, as hardware assumes one channel enabled.
16377 bool NoChannels = !NewDmask;
16378 if (NoChannels) {
16379 if (!UsesTFC) {
16380 // No uses of the result and not using TFC. Then do nothing.
16381 return Node;
16382 }
16383 // If the original dmask has one channel - then nothing to do
16384 if (OldBitsSet == 1)
16385 return Node;
16386 // Use an arbitrary dmask - required for the instruction to work
16387 NewDmask = 1;
16388 }
16389 // Abort if there's no change
16390 if (NewDmask == OldDmask)
16391 return Node;
16392
16393 unsigned BitsSet = llvm::popcount(NewDmask);
16394
16395 // Check for TFE or LWE - increase the number of channels by one to account
16396 // for the extra return value
16397 // This will need adjustment for D16 if this is also included in
16398 // adjustWriteMask (this function) but at present D16 is excluded.
16399 unsigned NewChannels = BitsSet + UsesTFC;
16400
16401 int NewOpcode =
16402 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
16403 assert(NewOpcode != -1 &&
16404 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16405 "failed to find equivalent MIMG op");
16406
16407 // Adjust the writemask in the node
16408 SmallVector<SDValue, 12> Ops;
16409 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
16410 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
16411 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
16412
16413 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16414
16415 MVT ResultVT = NewChannels == 1
16416 ? SVT
16417 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
16418 : NewChannels == 5 ? 8
16419 : NewChannels);
16420 SDVTList NewVTList =
16421 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
16422
16423 MachineSDNode *NewNode =
16424 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
16425
16426 if (HasChain) {
16427 // Update chain.
16428 DAG.setNodeMemRefs(NewNode, Node->memoperands());
16429 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
16430 }
16431
16432 if (NewChannels == 1) {
16433 assert(Node->hasNUsesOfValue(1, 0));
16434 SDNode *Copy =
16435 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
16436 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
16437 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
16438 return nullptr;
16439 }
16440
16441 // Update the users of the node with the new indices
16442 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16443 SDNode *User = Users[i];
16444 if (!User) {
16445 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
16446 // Users[0] is still nullptr because channel 0 doesn't really have a use.
16447 if (i || !NoChannels)
16448 continue;
16449 } else {
16450 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
16451 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
16452 if (NewUser != User) {
16453 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
16454 DAG.RemoveDeadNode(User);
16455 }
16456 }
16457
16458 switch (Idx) {
16459 default:
16460 break;
16461 case AMDGPU::sub0:
16462 Idx = AMDGPU::sub1;
16463 break;
16464 case AMDGPU::sub1:
16465 Idx = AMDGPU::sub2;
16466 break;
16467 case AMDGPU::sub2:
16468 Idx = AMDGPU::sub3;
16469 break;
16470 case AMDGPU::sub3:
16471 Idx = AMDGPU::sub4;
16472 break;
16473 }
16474 }
16475
16476 DAG.RemoveDeadNode(Node);
16477 return nullptr;
16478}
16479
16480 static bool isFrameIndexOp(SDValue Op) {
16481 if (Op.getOpcode() == ISD::AssertZext)
16482 Op = Op.getOperand(0);
16483
16484 return isa<FrameIndexSDNode>(Op);
16485}
16486
16487/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16488/// with frame index operands.
16489 /// LLVM assumes that inputs to these instructions are registers.
16490SDNode *
16492 SelectionDAG &DAG) const {
16493 if (Node->getOpcode() == ISD::CopyToReg) {
16494 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16495 SDValue SrcVal = Node->getOperand(2);
16496
16497 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16498 // to try understanding copies to physical registers.
16499 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16500 SDLoc SL(Node);
16502 SDValue VReg = DAG.getRegister(
16503 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16504
16505 SDNode *Glued = Node->getGluedNode();
16506 SDValue ToVReg = DAG.getCopyToReg(
16507 Node->getOperand(0), SL, VReg, SrcVal,
16508 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16509 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16510 VReg, ToVReg.getValue(1));
16511 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16512 DAG.RemoveDeadNode(Node);
16513 return ToResultReg.getNode();
16514 }
16515 }
16516
16518 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16519 if (!isFrameIndexOp(Node->getOperand(i))) {
16520 Ops.push_back(Node->getOperand(i));
16521 continue;
16522 }
16523
16524 SDLoc DL(Node);
16525 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16526 Node->getOperand(i).getValueType(),
16527 Node->getOperand(i)),
16528 0));
16529 }
16530
16531 return DAG.UpdateNodeOperands(Node, Ops);
16532}
16533
16534/// Fold the instructions after selecting them.
16535/// Returns null if users were already updated.
16537 SelectionDAG &DAG) const {
16539 unsigned Opcode = Node->getMachineOpcode();
16540
16541 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16542 !TII->isGather4(Opcode) &&
16543 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16544 return adjustWritemask(Node, DAG);
16545 }
16546
16547 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16549 return Node;
16550 }
16551
16552 switch (Opcode) {
16553 case AMDGPU::V_DIV_SCALE_F32_e64:
16554 case AMDGPU::V_DIV_SCALE_F64_e64: {
16555 // Satisfy the operand register constraint when one of the inputs is
16556 // undefined. Ordinarily each undef value will have its own implicit_def of
16557 // a vreg, so force these to use a single register.
16558 SDValue Src0 = Node->getOperand(1);
16559 SDValue Src1 = Node->getOperand(3);
16560 SDValue Src2 = Node->getOperand(5);
16561
16562 if ((Src0.isMachineOpcode() &&
16563 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16564 (Src0 == Src1 || Src0 == Src2))
16565 break;
16566
16567 MVT VT = Src0.getValueType().getSimpleVT();
16568 const TargetRegisterClass *RC =
16569 getRegClassFor(VT, Src0.getNode()->isDivergent());
16570
16572 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16573
16574 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16575 Src0, SDValue());
16576
16577 // src0 must be the same register as src1 or src2, even if the value is
16578 // undefined, so make sure we don't violate this constraint.
16579 if (Src0.isMachineOpcode() &&
16580 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16581 if (Src1.isMachineOpcode() &&
16582 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16583 Src0 = Src1;
16584 else if (Src2.isMachineOpcode() &&
16585 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16586 Src0 = Src2;
16587 else {
16588 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16589 Src0 = UndefReg;
16590 Src1 = UndefReg;
16591 }
16592 } else
16593 break;
16594
16595 SmallVector<SDValue, 9> Ops(Node->ops());
16596 Ops[1] = Src0;
16597 Ops[3] = Src1;
16598 Ops[5] = Src2;
16599 Ops.push_back(ImpDef.getValue(1));
16600 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16601 }
16602 default:
16603 break;
16604 }
16605
16606 return Node;
16607}
16608
16609 // Any MIMG instruction that uses tfe or lwe requires an initialization of the
16610 // result register that will be written in the case of a memory access failure.
16611 // Code is also added to tie this initialization to the result of the
16612 // image instruction.
16615 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16616 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16617 MachineBasicBlock &MBB = *MI.getParent();
16618
16619 int DstIdx =
16620 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16621 unsigned InitIdx = 0;
16622
16623 if (TII->isImage(MI)) {
16624 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16625 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16626 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16627
16628 if (!TFE && !LWE) // intersect_ray
16629 return;
16630
16631 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16632 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16633 unsigned D16Val = D16 ? D16->getImm() : 0;
16634
16635 if (!TFEVal && !LWEVal)
16636 return;
16637
16638 // At least one of TFE or LWE is non-zero.
16639 // We have to insert a suitable initialization of the result value and
16640 // tie this to the dest of the image instruction.
16641
16642 // Calculate which dword we have to initialize to 0.
16643 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16644
16645 // Check that the dmask operand is present.
16646 assert(MO_Dmask && "Expected dmask operand in instruction");
16647
16648 unsigned dmask = MO_Dmask->getImm();
16649 // Determine the number of active lanes taking into account the
16650 // Gather4 special case
16651 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
16652
16653 bool Packed = !Subtarget->hasUnpackedD16VMem();
16654
16655 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
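// For example, with dmask 0b0111 (three active lanes), TFE set and no D16,
// InitIdx is 4; with packed D16 it is ((3 + 1) >> 1) + 1 = 3.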
16656
16657 // Abandon the attempt if the dst size isn't large enough
16658 // - this is in fact an error, but it is picked up elsewhere and
16659 // reported correctly.
16660 uint32_t DstSize =
16661 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16662 if (DstSize < InitIdx)
16663 return;
16664 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
16665 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16666 } else {
16667 return;
16668 }
16669
16670 const DebugLoc &DL = MI.getDebugLoc();
16671
16672 // Create a register for the initialization value.
16673 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16674 unsigned NewDst = 0; // Final initialized value will be in here
16675
16676 // If PRTStrictNull feature is enabled (the default) then initialize
16677 // all the result registers to 0, otherwise just the error indication
16678 // register (VGPRn+1)
16679 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16680 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16681
16682 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
16683 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16684 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16685 // Initialize dword
16686 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
16687 // clang-format off
16688 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
16689 .addImm(0);
16690 // clang-format on
16691 // Insert into the super-reg
16692 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
16693 .addReg(PrevDst)
16694 .addReg(SubReg)
16695 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
16696
16697 PrevDst = NewDst;
16698 }
16699
16700 // Add as an implicit operand
16701 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
16702
16703 // Tie the just added implicit operand to the dst
16704 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
16705}
16706
16707/// Assign the register class depending on the number of
16708/// bits set in the writemask
16710 SDNode *Node) const {
16712
16713 MachineFunction *MF = MI.getParent()->getParent();
16716
16717 if (TII->isVOP3(MI.getOpcode())) {
16718 // Make sure constant bus requirements are respected.
16719 TII->legalizeOperandsVOP3(MRI, MI);
16720
16721 // Prefer VGPRs over AGPRs in mAI instructions where possible.
16722 // This saves a chain-copy of registers and better balances register
16723 // use between vgpr and agpr as agpr tuples tend to be big.
16724 if (!MI.getDesc().operands().empty()) {
16725 unsigned Opc = MI.getOpcode();
16726 bool HasAGPRs = Info->mayNeedAGPRs();
16727 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16728 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
16729 for (auto I :
16730 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
16731 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
16732 if (I == -1)
16733 break;
16734 if ((I == Src2Idx) && (HasAGPRs))
16735 break;
16736 MachineOperand &Op = MI.getOperand(I);
16737 if (!Op.isReg() || !Op.getReg().isVirtual())
16738 continue;
16739 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
16740 if (!TRI->hasAGPRs(RC))
16741 continue;
16742 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
16743 if (!Src || !Src->isCopy() ||
16744 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
16745 continue;
16746 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
16747 // All uses of agpr64 and agpr32 can also accept vgpr except for
16748 // v_accvgpr_read, but we do not produce agpr reads during selection,
16749 // so no use checks are needed.
16750 MRI.setRegClass(Op.getReg(), NewRC);
16751 }
16752
16753 if (TII->isMAI(MI)) {
16754 // The ordinary src0, src1, src2 were legalized above.
16755 //
16756 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
16757 // as a separate instruction.
16758 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16759 AMDGPU::OpName::scale_src0);
16760 if (Src0Idx != -1) {
16761 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16762 AMDGPU::OpName::scale_src1);
16763 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
16764 TII->usesConstantBus(MRI, MI, Src1Idx))
16765 TII->legalizeOpWithMove(MI, Src1Idx);
16766 }
16767 }
16768
16769 if (!HasAGPRs)
16770 return;
16771
16772 // Resolve the rest of AV operands to AGPRs.
16773 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
16774 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16775 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
16776 if (TRI->isVectorSuperClass(RC)) {
16777 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
16778 MRI.setRegClass(Src2->getReg(), NewRC);
16779 if (Src2->isTied())
16780 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
16781 }
16782 }
16783 }
16784 }
16785
16786 return;
16787 }
16788
16789 if (TII->isImage(MI))
16790 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
16791}
16792
16794 uint64_t Val) {
16795 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
16796 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
16797}
16798
16800 const SDLoc &DL,
16801 SDValue Ptr) const {
16803
16804 // Build the 64-bit half of the descriptor that holds the constants before
16805 // building the full 128-bit register. If we are building multiple resource
16806 // descriptors, this will allow CSEing of the 2-component register.
16807 const SDValue Ops0[] = {
16808 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
16809 buildSMovImm32(DAG, DL, 0),
16810 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16811 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
16812 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
16813
16814 SDValue SubRegHi = SDValue(
16815 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
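// The operand list above is a REG_SEQUENCE input of the form
// { RegClassID, value0, subreg0, value1, subreg1 }: sub0 holds 0 and sub1
// holds the high half of the default rsrc data format.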
16816
16817 // Combine the constants and the pointer.
16818 const SDValue Ops1[] = {
16819 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
16820 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
16821 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
16822
16823 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
16824}
16825
16826/// Return a resource descriptor with the 'Add TID' bit enabled
16827/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16828/// of the resource descriptor) to create an offset, which is added to
16829/// the resource pointer.
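// The resulting descriptor is laid out as: dword0 = pointer lo, dword1 =
// pointer hi (OR'ed with RsrcDword1 when it is non-zero), dword2 = the low
// 32 bits of RsrcDword2And3, dword3 = the high 32 bits.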
16831 SDValue Ptr, uint32_t RsrcDword1,
16832 uint64_t RsrcDword2And3) const {
16833 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
16834 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
16835 if (RsrcDword1) {
16836 PtrHi =
16837 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
16838 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
16839 0);
16840 }
16841
16842 SDValue DataLo =
16843 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16844 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
16845
16846 const SDValue Ops[] = {
16847 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
16848 PtrLo,
16849 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16850 PtrHi,
16851 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
16852 DataLo,
16853 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
16854 DataHi,
16855 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
16856
16857 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
16858}
16859
16860//===----------------------------------------------------------------------===//
16861// SI Inline Assembly Support
16862//===----------------------------------------------------------------------===//
16863
16864std::pair<unsigned, const TargetRegisterClass *>
16866 StringRef Constraint,
16867 MVT VT) const {
16868 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16869
16870 const TargetRegisterClass *RC = nullptr;
16871 if (Constraint.size() == 1) {
16872 // Check if we cannot determine the bit size of the given value type. This
16873 // can happen, for example, when we have an empty struct
16874 // (size 0): `call void asm "", "v"({} poison)`.
16875 if (VT == MVT::Other)
16876 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16877 const unsigned BitWidth = VT.getSizeInBits();
16878 switch (Constraint[0]) {
16879 default:
16880 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16881 case 's':
16882 case 'r':
16883 switch (BitWidth) {
16884 case 16:
16885 RC = &AMDGPU::SReg_32RegClass;
16886 break;
16887 case 64:
16888 RC = &AMDGPU::SGPR_64RegClass;
16889 break;
16890 default:
16891 RC = TRI->getSGPRClassForBitWidth(BitWidth);
16892 if (!RC)
16893 return std::pair(0U, nullptr);
16894 break;
16895 }
16896 break;
16897 case 'v':
16898 switch (BitWidth) {
16899 case 16:
16900 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16901 : &AMDGPU::VGPR_32RegClass;
16902 break;
16903 default:
16904 RC = TRI->getVGPRClassForBitWidth(BitWidth);
16905 if (!RC)
16906 return std::pair(0U, nullptr);
16907 break;
16908 }
16909 break;
16910 case 'a':
16911 if (!Subtarget->hasMAIInsts())
16912 break;
16913 switch (BitWidth) {
16914 case 16:
16915 RC = &AMDGPU::AGPR_32RegClass;
16916 break;
16917 default:
16918 RC = TRI->getAGPRClassForBitWidth(BitWidth);
16919 if (!RC)
16920 return std::pair(0U, nullptr);
16921 break;
16922 }
16923 break;
16924 }
16925 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
16926 const unsigned BitWidth = VT.getSizeInBits();
16927 switch (BitWidth) {
16928 case 16:
16929 RC = &AMDGPU::AV_32RegClass;
16930 break;
16931 default:
16932 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
16933 if (!RC)
16934 return std::pair(0U, nullptr);
16935 break;
16936 }
16937 }
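// For example, a 32-bit "v" operand maps to a 32-bit VGPR class and a
// 64-bit "s" operand to SGPR_64; roughly,
//   %r = call i32 asm "v_mov_b32 $0, $1", "=v,v"(i32 %x)
// constrains both the result and the input to 32-bit VGPRs.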
16938
16939 // We actually support i128, i16 and f16 as inline parameters
16940 // even if they are not reported as legal
16941 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
16942 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
16943 return std::pair(0U, RC);
16944
16945 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
16946 if (Kind != '\0') {
16947 if (Kind == 'v') {
16948 RC = &AMDGPU::VGPR_32RegClass;
16949 } else if (Kind == 's') {
16950 RC = &AMDGPU::SGPR_32RegClass;
16951 } else if (Kind == 'a') {
16952 RC = &AMDGPU::AGPR_32RegClass;
16953 }
16954
16955 if (RC) {
16956 if (NumRegs > 1) {
16957 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
16958 return std::pair(0U, nullptr);
16959
16960 uint32_t Width = NumRegs * 32;
16961 // Prohibit constraints for register ranges with a width that does not
16962 // match the required type.
16963 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
16964 return std::pair(0U, nullptr);
16965
16966 MCRegister Reg = RC->getRegister(Idx);
16968 RC = TRI->getVGPRClassForBitWidth(Width);
16969 else if (SIRegisterInfo::isSGPRClass(RC))
16970 RC = TRI->getSGPRClassForBitWidth(Width);
16971 else if (SIRegisterInfo::isAGPRClass(RC))
16972 RC = TRI->getAGPRClassForBitWidth(Width);
16973 if (RC) {
16974 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
16975 if (!Reg) {
16976 // The register class does not contain the requested register,
16977 // e.g., because it is an SGPR pair that would violate alignment
16978 // requirements.
16979 return std::pair(0U, nullptr);
16980 }
16981 return std::pair(Reg, RC);
16982 }
16983 }
16984
16985 // Check for lossy scalar/vector conversions.
16986 if (VT.isVector() && VT.getSizeInBits() != 32)
16987 return std::pair(0U, nullptr);
16988 if (Idx < RC->getNumRegs())
16989 return std::pair(RC->getRegister(Idx), RC);
16990 }
16991 }
16992
16993 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16994 if (Ret.first)
16995 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
16996
16997 return Ret;
16998}
16999
17000static bool isImmConstraint(StringRef Constraint) {
17001 if (Constraint.size() == 1) {
17002 switch (Constraint[0]) {
17003 default:
17004 break;
17005 case 'I':
17006 case 'J':
17007 case 'A':
17008 case 'B':
17009 case 'C':
17010 return true;
17011 }
17012 } else if (Constraint == "DA" || Constraint == "DB") {
17013 return true;
17014 }
17015 return false;
17016}
17017
17020 if (Constraint.size() == 1) {
17021 switch (Constraint[0]) {
17022 default:
17023 break;
17024 case 's':
17025 case 'v':
17026 case 'a':
17027 return C_RegisterClass;
17028 }
17029 } else if (Constraint.size() == 2) {
17030 if (Constraint == "VA")
17031 return C_RegisterClass;
17032 }
17033 if (isImmConstraint(Constraint)) {
17034 return C_Other;
17035 }
17036 return TargetLowering::getConstraintType(Constraint);
17037}
17038
17039static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17041 Val = Val & maskTrailingOnes<uint64_t>(Size);
17042 }
17043 return Val;
17044}
17045
17047 StringRef Constraint,
17048 std::vector<SDValue> &Ops,
17049 SelectionDAG &DAG) const {
17050 if (isImmConstraint(Constraint)) {
17051 uint64_t Val;
17052 if (getAsmOperandConstVal(Op, Val) &&
17053 checkAsmConstraintVal(Op, Constraint, Val)) {
17054 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17055 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17056 }
17057 } else {
17058 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17059 }
17060}
17061
17063 unsigned Size = Op.getScalarValueSizeInBits();
17064 if (Size > 64)
17065 return false;
17066
17067 if (Size == 16 && !Subtarget->has16BitInsts())
17068 return false;
17069
17070 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17071 Val = C->getSExtValue();
17072 return true;
17073 }
17074 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17075 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17076 return true;
17077 }
17078 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17079 if (Size != 16 || Op.getNumOperands() != 2)
17080 return false;
17081 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17082 return false;
17083 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17084 Val = C->getSExtValue();
17085 return true;
17086 }
17087 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17088 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17089 return true;
17090 }
17091 }
17092
17093 return false;
17094}
17095
17097 uint64_t Val) const {
17098 if (Constraint.size() == 1) {
17099 switch (Constraint[0]) {
17100 case 'I':
17102 case 'J':
17103 return isInt<16>(Val);
17104 case 'A':
17105 return checkAsmConstraintValA(Op, Val);
17106 case 'B':
17107 return isInt<32>(Val);
17108 case 'C':
17109 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17111 default:
17112 break;
17113 }
17114 } else if (Constraint.size() == 2) {
17115 if (Constraint == "DA") {
17116 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17117 int64_t LoBits = static_cast<int32_t>(Val);
17118 return checkAsmConstraintValA(Op, HiBits, 32) &&
17119 checkAsmConstraintValA(Op, LoBits, 32);
17120 }
17121 if (Constraint == "DB") {
17122 return true;
17123 }
17124 }
17125 llvm_unreachable("Invalid asm constraint");
17126}
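// For example, "J" accepts 0x7fff but rejects 0x8000 (not a signed 16-bit
// value), "B" accepts any signed 32-bit value, and "DA" requires both
// 32-bit halves of a 64-bit value to pass the inline-constant check done by
// checkAsmConstraintValA below.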
17127
17129 unsigned MaxSize) const {
17130 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17131 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17132 if (Size == 16) {
17133 MVT VT = Op.getSimpleValueType();
17134 switch (VT.SimpleTy) {
17135 default:
17136 return false;
17137 case MVT::i16:
17138 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17139 case MVT::f16:
17140 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17141 case MVT::bf16:
17142 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17143 case MVT::v2i16:
17144 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17145 case MVT::v2f16:
17146 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17147 case MVT::v2bf16:
17148 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17149 }
17150 }
17151 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17152 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17153 return true;
17154 return false;
17155}
17156
17157static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17158 switch (UnalignedClassID) {
17159 case AMDGPU::VReg_64RegClassID:
17160 return AMDGPU::VReg_64_Align2RegClassID;
17161 case AMDGPU::VReg_96RegClassID:
17162 return AMDGPU::VReg_96_Align2RegClassID;
17163 case AMDGPU::VReg_128RegClassID:
17164 return AMDGPU::VReg_128_Align2RegClassID;
17165 case AMDGPU::VReg_160RegClassID:
17166 return AMDGPU::VReg_160_Align2RegClassID;
17167 case AMDGPU::VReg_192RegClassID:
17168 return AMDGPU::VReg_192_Align2RegClassID;
17169 case AMDGPU::VReg_224RegClassID:
17170 return AMDGPU::VReg_224_Align2RegClassID;
17171 case AMDGPU::VReg_256RegClassID:
17172 return AMDGPU::VReg_256_Align2RegClassID;
17173 case AMDGPU::VReg_288RegClassID:
17174 return AMDGPU::VReg_288_Align2RegClassID;
17175 case AMDGPU::VReg_320RegClassID:
17176 return AMDGPU::VReg_320_Align2RegClassID;
17177 case AMDGPU::VReg_352RegClassID:
17178 return AMDGPU::VReg_352_Align2RegClassID;
17179 case AMDGPU::VReg_384RegClassID:
17180 return AMDGPU::VReg_384_Align2RegClassID;
17181 case AMDGPU::VReg_512RegClassID:
17182 return AMDGPU::VReg_512_Align2RegClassID;
17183 case AMDGPU::VReg_1024RegClassID:
17184 return AMDGPU::VReg_1024_Align2RegClassID;
17185 case AMDGPU::AReg_64RegClassID:
17186 return AMDGPU::AReg_64_Align2RegClassID;
17187 case AMDGPU::AReg_96RegClassID:
17188 return AMDGPU::AReg_96_Align2RegClassID;
17189 case AMDGPU::AReg_128RegClassID:
17190 return AMDGPU::AReg_128_Align2RegClassID;
17191 case AMDGPU::AReg_160RegClassID:
17192 return AMDGPU::AReg_160_Align2RegClassID;
17193 case AMDGPU::AReg_192RegClassID:
17194 return AMDGPU::AReg_192_Align2RegClassID;
17195 case AMDGPU::AReg_256RegClassID:
17196 return AMDGPU::AReg_256_Align2RegClassID;
17197 case AMDGPU::AReg_512RegClassID:
17198 return AMDGPU::AReg_512_Align2RegClassID;
17199 case AMDGPU::AReg_1024RegClassID:
17200 return AMDGPU::AReg_1024_Align2RegClassID;
17201 default:
17202 return -1;
17203 }
17204}
17205
17206// Figure out which registers should be reserved for stack access. Only after
17207// the function is legalized do we know all of the non-spill stack objects or if
17208// calls are present.
17212 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17213 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17214 const SIInstrInfo *TII = ST.getInstrInfo();
17215
17216 if (Info->isEntryFunction()) {
17217 // Callable functions have fixed registers used for stack access.
17219 }
17220
17221 // TODO: Move this logic to getReservedRegs()
17222 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17223 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17224 Register SReg = ST.isWave32()
17225 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17226 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17227 &AMDGPU::SGPR_64RegClass);
17228 Info->setSGPRForEXECCopy(SReg);
17229
17230 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17231 Info->getStackPtrOffsetReg()));
17232 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17233 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17234
17235 // We need to worry about replacing the default register with itself in case
17236 // of MIR testcases missing the MFI.
17237 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17238 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17239
17240 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17241 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17242
17243 Info->limitOccupancy(MF);
17244
17245 if (ST.isWave32() && !MF.empty()) {
17246 for (auto &MBB : MF) {
17247 for (auto &MI : MBB) {
17248 TII->fixImplicitOperands(MI);
17249 }
17250 }
17251 }
17252
17253 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17254 // classes if required. Ideally the register class constraints would differ
17255 // per-subtarget, but there's no easy way to achieve that right now. This is
17256 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17257 // from using them as the register class for legal types.
17258 if (ST.needsAlignedVGPRs()) {
17259 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17260 const Register Reg = Register::index2VirtReg(I);
17261 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17262 if (!RC)
17263 continue;
17264 int NewClassID = getAlignedAGPRClassID(RC->getID());
17265 if (NewClassID != -1)
17266 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17267 }
17268 }
17269
17271}
17272
17274 KnownBits &Known,
17275 const APInt &DemandedElts,
17276 const SelectionDAG &DAG,
17277 unsigned Depth) const {
17278 Known.resetAll();
17279 unsigned Opc = Op.getOpcode();
17280 switch (Opc) {
17281 case ISD::INTRINSIC_WO_CHAIN: {
17282 unsigned IID = Op.getConstantOperandVal(0);
17283 switch (IID) {
17284 case Intrinsic::amdgcn_mbcnt_lo:
17285 case Intrinsic::amdgcn_mbcnt_hi: {
17286 const GCNSubtarget &ST =
17288 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17289 // most 31 + src1.
17290 Known.Zero.setBitsFrom(
17291 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17292 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17293 Known = KnownBits::add(Known, Known2);
17294 return;
17295 }
17296 }
17297 break;
17298 }
17299 }
17301 Op, Known, DemandedElts, DAG, Depth);
17302}
17303
17305 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17307
17308 // Set the high bits to zero based on the maximum allowed scratch size per
17309 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17310 // calculation won't overflow, so assume the sign bit is never set.
17311 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17312}
17313
17315 GISelValueTracking &VT, KnownBits &Known,
17316 unsigned Dim) {
17317 unsigned MaxValue =
17318 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17319 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17320}
17321
17323 KnownBits &Known, const APInt &DemandedElts,
17324 unsigned BFEWidth, bool SExt, unsigned Depth) {
17326 const MachineOperand &Src1 = MI.getOperand(2);
17327
17328 unsigned Src1Cst = 0;
17329 if (Src1.isImm()) {
17330 Src1Cst = Src1.getImm();
17331 } else if (Src1.isReg()) {
17332 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17333 if (!Cst)
17334 return;
17335 Src1Cst = Cst->Value.getZExtValue();
17336 } else {
17337 return;
17338 }
17339
17340 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17341 // Width is always [22:16].
17342 const unsigned Offset =
17343 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17344 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
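// For example, for S_BFE_U32 with src1 = 0x0008000c, Offset is 12 and Width
// is 8, so the known bits of bits [19:12] of the source are extracted and
// then zero-extended back to 32 bits.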
17345
17346 if (Width >= BFEWidth) // Ill-formed.
17347 return;
17348
17349 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17350 Depth + 1);
17351
17352 Known = Known.extractBits(Width, Offset);
17353
17354 if (SExt)
17355 Known = Known.sext(BFEWidth);
17356 else
17357 Known = Known.zext(BFEWidth);
17358}
17359
17361 GISelValueTracking &VT, Register R, KnownBits &Known,
17362 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17363 unsigned Depth) const {
17364 Known.resetAll();
17365 const MachineInstr *MI = MRI.getVRegDef(R);
17366 switch (MI->getOpcode()) {
17367 case AMDGPU::S_BFE_I32:
17368 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17369 /*SExt=*/true, Depth);
17370 case AMDGPU::S_BFE_U32:
17371 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17372 /*SExt=*/false, Depth);
17373 case AMDGPU::S_BFE_I64:
17374 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17375 /*SExt=*/true, Depth);
17376 case AMDGPU::S_BFE_U64:
17377 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17378 /*SExt=*/false, Depth);
17379 case AMDGPU::G_INTRINSIC:
17380 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17381 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
17382 switch (IID) {
17383 case Intrinsic::amdgcn_workitem_id_x:
17384 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
17385 break;
17386 case Intrinsic::amdgcn_workitem_id_y:
17387 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
17388 break;
17389 case Intrinsic::amdgcn_workitem_id_z:
17390 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
17391 break;
17392 case Intrinsic::amdgcn_mbcnt_lo:
17393 case Intrinsic::amdgcn_mbcnt_hi: {
17394 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17395 // most 31 + src1.
17396 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
17397 ? getSubtarget()->getWavefrontSizeLog2()
17398 : 5);
17399 KnownBits Known2;
17400 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
17401 Depth + 1);
17402 Known = KnownBits::add(Known, Known2);
17403 break;
17404 }
17405 case Intrinsic::amdgcn_groupstaticsize: {
17406 // We can report everything over the maximum size as 0. We can't report
17407 // based on the actual size because we don't know if it's accurate or not
17408 // at any given point.
17409 Known.Zero.setHighBits(
17410 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
17411 break;
17412 }
17413 }
17414 break;
17415 }
17416 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17417 Known.Zero.setHighBits(24);
17418 break;
17419 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17420 Known.Zero.setHighBits(16);
17421 break;
17422 case AMDGPU::G_AMDGPU_SMED3:
17423 case AMDGPU::G_AMDGPU_UMED3: {
17424 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17425
17426 KnownBits Known2;
17427 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
17428 if (Known2.isUnknown())
17429 break;
17430
17431 KnownBits Known1;
17432 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
17433 if (Known1.isUnknown())
17434 break;
17435
17436 KnownBits Known0;
17437 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
17438 if (Known0.isUnknown())
17439 break;
17440
17441 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
17442 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
17443 Known.One = Known0.One & Known1.One & Known2.One;
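// For example, if bits [31:16] are known zero in all three inputs, they are
// also known zero in the med3 result, since the result is always one of the
// inputs.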
17444 break;
17445 }
17446 }
17447}
17448
17451 unsigned Depth) const {
17452 const MachineInstr *MI = MRI.getVRegDef(R);
17453 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
17454 // FIXME: Can this move to generic code? What about the case where the call
17455 // site specifies a lower alignment?
17456 Intrinsic::ID IID = GI->getIntrinsicID();
17458 AttributeList Attrs =
17459 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
17460 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17461 return *RetAlign;
17462 }
17463 return Align(1);
17464}
17465
17468 const Align CacheLineAlign = Align(64);
17469
17470 // Pre-GFX10 targets did not benefit from loop alignment
17471 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17472 getSubtarget()->hasInstFwdPrefetchBug())
17473 return PrefAlign;
17474
17475 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
17476 // By default the prefetcher keeps one cache line behind and reads two ahead.
17477 // We can modify it with S_INST_PREFETCH so that larger loops have two lines
17478 // behind and one ahead.
17479 // Therefore we can benefit from aligning loop headers if the loop fits in 192
17480 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
17481 // lines and does not need alignment.
17482 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
17483 // prefetch settings; if it is at most 192 bytes we need two lines behind.
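// For example, a 100-byte loop is simply aligned to the cache line, while a
// 180-byte loop additionally gets S_INST_PREFETCH instructions inserted in
// its preheader and exit block to switch to the two-lines-behind mode.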
17484
17486 const MachineBasicBlock *Header = ML->getHeader();
17487 if (Header->getAlignment() != PrefAlign)
17488 return Header->getAlignment(); // Already processed.
17489
17490 unsigned LoopSize = 0;
17491 for (const MachineBasicBlock *MBB : ML->blocks()) {
17492 // If an inner loop block is aligned, assume on average half of the alignment
17493 // size is added as nops.
17494 if (MBB != Header)
17495 LoopSize += MBB->getAlignment().value() / 2;
17496
17497 for (const MachineInstr &MI : *MBB) {
17498 LoopSize += TII->getInstSizeInBytes(MI);
17499 if (LoopSize > 192)
17500 return PrefAlign;
17501 }
17502 }
17503
17504 if (LoopSize <= 64)
17505 return PrefAlign;
17506
17507 if (LoopSize <= 128)
17508 return CacheLineAlign;
17509
17510 // If any of the parent loops is surrounded by prefetch instructions, do not
17511 // insert new ones for the inner loop; that would reset the parent's settings.
17512 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
17513 if (MachineBasicBlock *Exit = P->getExitBlock()) {
17514 auto I = Exit->getFirstNonDebugInstr();
17515 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17516 return CacheLineAlign;
17517 }
17518 }
17519
17520 MachineBasicBlock *Pre = ML->getLoopPreheader();
17521 MachineBasicBlock *Exit = ML->getExitBlock();
17522
17523 if (Pre && Exit) {
17524 auto PreTerm = Pre->getFirstTerminator();
17525 if (PreTerm == Pre->begin() ||
17526 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17527 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17528 .addImm(1); // prefetch 2 lines behind PC
17529
17530 auto ExitHead = Exit->getFirstNonDebugInstr();
17531 if (ExitHead == Exit->end() ||
17532 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17533 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17534 .addImm(2); // prefetch 1 line behind PC
17535 }
17536
17537 return CacheLineAlign;
17538}
17539
17541static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17542 assert(N->getOpcode() == ISD::CopyFromReg);
17543 do {
17544 // Follow the chain until we find an INLINEASM node.
17545 N = N->getOperand(0).getNode();
17546 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17547 return true;
17548 } while (N->getOpcode() == ISD::CopyFromReg);
17549 return false;
17550}
17551
17554 UniformityInfo *UA) const {
17555 switch (N->getOpcode()) {
17556 case ISD::CopyFromReg: {
17557 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17558 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17559 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17560 Register Reg = R->getReg();
17561
17562 // FIXME: Why does this need to consider isLiveIn?
17563 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17564 return !TRI->isSGPRReg(MRI, Reg);
17565
17566 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17567 return UA->isDivergent(V);
17568
17570 return !TRI->isSGPRReg(MRI, Reg);
17571 }
17572 case ISD::LOAD: {
17573 const LoadSDNode *L = cast<LoadSDNode>(N);
17574 unsigned AS = L->getAddressSpace();
17575 // A flat load may access private memory.
17577 }
17578 case ISD::CALLSEQ_END:
17579 return true;
17581 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17583 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17602 // Target-specific read-modify-write atomics are sources of divergence.
17603 return true;
17604 default:
17605 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17606 // Generic read-modify-write atomics are sources of divergence.
17607 return A->readMem() && A->writeMem();
17608 }
17609 return false;
17610 }
17611}
17612
17614 EVT VT) const {
17615 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17616 case MVT::f32:
17618 case MVT::f64:
17619 case MVT::f16:
17621 default:
17622 return false;
17623 }
17624}
17625
17627 LLT Ty, const MachineFunction &MF) const {
17628 switch (Ty.getScalarSizeInBits()) {
17629 case 32:
17630 return !denormalModeIsFlushAllF32(MF);
17631 case 64:
17632 case 16:
17633 return !denormalModeIsFlushAllF64F16(MF);
17634 default:
17635 return false;
17636 }
17637}
17638
17640 const APInt &DemandedElts,
17641 const SelectionDAG &DAG,
17642 bool SNaN,
17643 unsigned Depth) const {
17644 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17645 const MachineFunction &MF = DAG.getMachineFunction();
17647
17648 if (Info->getMode().DX10Clamp)
17649 return true; // Clamped to 0.
17650 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
17651 }
17652
17654 DAG, SNaN, Depth);
17655}
17656
17657 // On older subtargets, global FP atomic instructions have a hardcoded FP mode:
17658 // they do not support FP32 denormals and only support v2f16/f64 denormals.
17660 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17661 return true;
17662
17664 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
17665 if (DenormMode == DenormalMode::getPreserveSign())
17666 return true;
17667
17668 // TODO: Remove this.
17669 return RMW->getFunction()
17670 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
17671 .getValueAsBool();
17672}
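// For example, this returns true for an f32 atomicrmw fadd in a function
// whose f32 denormal mode is preserve-sign, or for one carrying the
// "amdgpu.ignore.denormal.mode" metadata, since flushing denormals is then
// acceptable.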
17673
17675 LLVMContext &Ctx = RMW->getContext();
17676 StringRef MemScope =
17677 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
17678
17679 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17680 << "Hardware instruction generated for atomic "
17681 << RMW->getOperationName(RMW->getOperation())
17682 << " operation at memory scope " << MemScope;
17683}
17684
17685static bool isV2F16OrV2BF16(Type *Ty) {
17686 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17687 Type *EltTy = VT->getElementType();
17688 return VT->getNumElements() == 2 &&
17689 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17690 }
17691
17692 return false;
17693}
17694
17695static bool isV2F16(Type *Ty) {
17696 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17697 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17698}
17699
17700static bool isV2BF16(Type *Ty) {
17701 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17702 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17703}
17704
17705/// \return true if atomicrmw integer ops work for the type.
17706static bool isAtomicRMWLegalIntTy(Type *Ty) {
17707 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
17708 unsigned BW = IT->getBitWidth();
17709 return BW == 32 || BW == 64;
17710 }
17711
17712 return false;
17713}
17714
17715/// \return true if this atomicrmw xchg type can be selected.
17716static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17717 Type *Ty = RMW->getType();
17718 if (isAtomicRMWLegalIntTy(Ty))
17719 return true;
17720
17721 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
17722 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17723 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
17724 return BW == 32 || BW == 64;
17725 }
17726
17727 if (Ty->isFloatTy() || Ty->isDoubleTy())
17728 return true;
17729
17730 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
17731 return VT->getNumElements() == 2 &&
17732 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17733 }
17734
17735 return false;
17736}
17737
17738/// \returns true if it's valid to emit a native instruction for \p RMW, based
17739/// on the properties of the target memory.
17740static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17741 const AtomicRMWInst *RMW,
17742 bool HasSystemScope) {
17743 // The remote/fine-grained access logic is different from the integer
17744 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17745 // fine-grained access does not work, even for a device local allocation.
17746 //
17747 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17748 // allocations work.
17749 if (HasSystemScope) {
17751 RMW->hasMetadata("amdgpu.no.remote.memory"))
17752 return true;
17753 if (Subtarget.hasEmulatedSystemScopeAtomics())
17754 return true;
17756 return true;
17757
17758 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
17759}
17760
17761/// \return Action to perform on AtomicRMWInsts for integer operations.
17764 return isAtomicRMWLegalIntTy(RMW->getType())
17767}
17768
17769/// Return if a flat address space atomicrmw can access private memory.
17771 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
17772 return !MD ||
17774}
17775
17778 unsigned AS = RMW->getPointerAddressSpace();
17779 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
17781
17782 // 64-bit flat atomics that dynamically reside in private memory will silently
17783 // be dropped.
17784 //
17785 // Note that we will emit a new copy of the original atomic in the expansion,
17786 // which will be incrementally relegalized.
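// For example, an atomicrmw on a 64-bit value through a flat pointer that is
// not known (e.g. via !noalias.addrspace metadata) to avoid private memory is
// sent down the expansion path that inserts an explicit private-address check
// (see the expansion described near the end of this file).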
17787 const DataLayout &DL = RMW->getFunction()->getDataLayout();
17788 if (AS == AMDGPUAS::FLAT_ADDRESS &&
17789 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
17792
17793 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
17795 ORE.emit([=]() {
17796 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
17797 });
17798 return Kind;
17799 };
17800
17801 auto SSID = RMW->getSyncScopeID();
17802 bool HasSystemScope =
17803 SSID == SyncScope::System ||
17804 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
17805
17806 auto Op = RMW->getOperation();
17807 switch (Op) {
17809 // PCIe supports add and xchg for system atomics.
17810 return isAtomicRMWLegalXChgTy(RMW)
17813 case AtomicRMWInst::Add:
17814 // PCIe supports add and xchg for system atomics.
17816 case AtomicRMWInst::Sub:
17817 case AtomicRMWInst::And:
17818 case AtomicRMWInst::Or:
17819 case AtomicRMWInst::Xor:
17820 case AtomicRMWInst::Max:
17821 case AtomicRMWInst::Min:
17828 if (Subtarget->hasEmulatedSystemScopeAtomics())
17830
17831 // On most subtargets, for atomicrmw operations other than add/xchg,
17832 // whether or not the instructions will behave correctly depends on where
17833 // the address physically resides and what interconnect is used in the
17834 // system configuration. On some targets the instruction will nop,
17835 // and in others synchronization will only occur at degraded device scope.
17836 //
17837 // If the allocation is known local to the device, the instructions should
17838 // work correctly.
17839 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
17841
17842 // If fine-grained remote memory works at device scope, we don't need to
17843 // do anything.
17844 if (!HasSystemScope &&
17847
17848 // If we are targeting a remote allocated address, it depends what kind of
17849 // allocation the address belongs to.
17850 //
17851 // If the allocation is fine-grained (in host memory, or in PCIe peer
17852 // device memory), the operation will fail depending on the target.
17853 //
17854 // Note fine-grained host memory access does work on APUs or if XGMI is
17855 // used, but we do not know if we are targeting an APU or the system
17856 // configuration from the ISA version/target-cpu.
17857 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17859
17862 // Atomic sub/or/xor do not work over PCI express, but atomic add
17863 // does. InstCombine transforms these with 0 to or, so undo that.
17864 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17865 ConstVal && ConstVal->isNullValue())
17867 }
17868
17869 // If the allocation could be in remote, fine-grained memory, the rmw
17870 // instructions may fail. cmpxchg should work, so emit that. On some
17871 // system configurations, PCIe atomics aren't supported so cmpxchg won't
17872 // even work, so you're out of luck anyway.
17873
17874 // In summary:
17875 //
17876 // Cases that may fail:
17877 // - fine-grained pinned host memory
17878 // - fine-grained migratable host memory
17879 // - fine-grained PCIe peer device
17880 //
17881 // Cases that should work, but may be treated overly conservatively.
17882 // - fine-grained host memory on an APU
17883 // - fine-grained XGMI peer device
17885 }
17886
17888 }
17889 case AtomicRMWInst::FAdd: {
17890 Type *Ty = RMW->getType();
17891
17892 // TODO: Handle REGION_ADDRESS
17893 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17894 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
17895 // is fixed to round-to-nearest-even.
17896 //
17897 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
17898 // round-to-nearest-even.
17899 //
17900 // We ignore the rounding mode problem, even in strictfp. The C++ standard
17901 // suggests it is OK if the floating-point mode may not match the calling
17902 // thread.
17903 if (Ty->isFloatTy()) {
17906 }
17907
17908 if (Ty->isDoubleTy()) {
17909 // Ignores denormal mode, but we don't consider flushing mandatory.
17912 }
17913
17914 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17916
17918 }
17919
17920 // LDS atomics respect the denormal mode from the mode register.
17921 //
17922 // Traditionally f32 global/buffer memory atomics would unconditionally
17923 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
17924 // flush.
17925 //
17926 // On targets with flat atomic fadd, denormals would flush depending on
17927 // whether the target address resides in LDS or global memory. We consider
17928 // this flat-maybe-flush as will-flush.
17929 if (Ty->isFloatTy() &&
17933
17934 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
17935 // safe. The message phrasing also should be better.
17936 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
17937 if (AS == AMDGPUAS::FLAT_ADDRESS) {
17938 // gfx942, gfx12
17939 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17940 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17941 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
17942 // gfx90a, gfx942, gfx12
17943 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17944 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17945
17946 // gfx942, gfx12
17947 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
17948 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17949 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17950 // gfx90a, gfx942, gfx12
17951 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17952 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17953
17954 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
17955 // buffer. gfx12 does have the buffer version.
17956 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
17957 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17958 }
17959
17960 // global and flat atomic fadd f64: gfx90a, gfx942.
17961 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
17962 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17963
17964 if (AS != AMDGPUAS::FLAT_ADDRESS) {
17965 if (Ty->isFloatTy()) {
17966 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
17967 // gfx11+.
17968 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17969 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17970 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
17971 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17972 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17973 } else {
17974 // gfx908
17975 if (RMW->use_empty() &&
17977 isV2F16(Ty))
17978 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17979 }
17980 }
17981
17982 // flat atomic fadd f32: gfx942, gfx11+.
17983 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
17984 if (Subtarget->hasFlatAtomicFaddF32Inst())
17985 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17986
17987 // If the address is in the flat address space and the type is float, we
17988 // try to expand it when the target supports both global and LDS atomic
17989 // fadd. In the expansion we emit a check of the address space: if the
17990 // address is in the global address space we emit the global atomic fadd;
17991 // if it is in the shared address space we emit the LDS atomic
17992 // fadd.
17993 if (Subtarget->hasLDSFPAtomicAddF32()) {
17994 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17996 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17998 }
17999 }
18000 }
18001
18003 }
18005 case AtomicRMWInst::FMax: {
18006 Type *Ty = RMW->getType();
18007
18008 // LDS float and double fmin/fmax were always supported.
18009 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18010 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18012 }
18013
18014 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18015 // For flat and global cases:
18016 // float, double in gfx7. Manual claims denormal support.
18017 // Removed in gfx8.
18018 // float, double restored in gfx10.
18019 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18020 //
18021 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18022 // no f32.
18023 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18024 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18025 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18026 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18027 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18028 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18030 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18031 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18032 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18033 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18034 }
18035 }
18036
18038 }
18041 default:
18043 }
18044
18045 llvm_unreachable("covered atomicrmw op switch");
18046}
18047
18053}
18054
18057 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
18060}
18061
18064 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18065 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18067
18068 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18070
18071 const DataLayout &DL = CmpX->getDataLayout();
18072
18073 Type *ValTy = CmpX->getNewValOperand()->getType();
18074
18075 // If a 64-bit flat atomic may alias private, we need to avoid using the
18076 // atomic in the private case.
18077 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18079}
18080
18081const TargetRegisterClass *
18082SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18084 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18085 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18086 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18087 : &AMDGPU::SReg_32RegClass;
18088 if (!TRI->isSGPRClass(RC) && !isDivergent)
18089 return TRI->getEquivalentSGPRClass(RC);
18090 if (TRI->isSGPRClass(RC) && isDivergent)
18091 return TRI->getEquivalentVGPRClass(RC);
18092
18093 return RC;
18094}
18095
18096// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18097// uniform values (as produced by the mask results of control flow intrinsics)
18098// used outside of divergent blocks. The phi users need to also be treated as
18099// always uniform.
18100//
18101 // FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
18102static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18103 unsigned WaveSize) {
18104 // FIXME: We assume we never cast the mask results of a control flow
18105 // intrinsic.
18106 // As a compile-time hack, exit early if the type cannot be consistent.
18107 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18108 if (!IT || IT->getBitWidth() != WaveSize)
18109 return false;
18110
18111 if (!isa<Instruction>(V))
18112 return false;
18113 if (!Visited.insert(V).second)
18114 return false;
18115 bool Result = false;
18116 for (const auto *U : V->users()) {
18117 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18118 if (V == U->getOperand(1)) {
18119 switch (Intrinsic->getIntrinsicID()) {
18120 default:
18121 Result = false;
18122 break;
18123 case Intrinsic::amdgcn_if_break:
18124 case Intrinsic::amdgcn_if:
18125 case Intrinsic::amdgcn_else:
18126 Result = true;
18127 break;
18128 }
18129 }
18130 if (V == U->getOperand(0)) {
18131 switch (Intrinsic->getIntrinsicID()) {
18132 default:
18133 Result = false;
18134 break;
18135 case Intrinsic::amdgcn_end_cf:
18136 case Intrinsic::amdgcn_loop:
18137 Result = true;
18138 break;
18139 }
18140 }
18141 } else {
18142 Result = hasCFUser(U, Visited, WaveSize);
18143 }
18144 if (Result)
18145 break;
18146 }
18147 return Result;
18148}
18149
18151 const Value *V) const {
18152 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18153 if (CI->isInlineAsm()) {
18154 // FIXME: This cannot give a correct answer. This should only trigger in
18155 // the case where inline asm returns mixed SGPR and VGPR results, used
18156 // outside the defining block. We don't have a specific result to
18157 // consider, so this assumes that if any value is an SGPR, the overall
18158 // register also needs to be an SGPR.
18159 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18161 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18162 for (auto &TC : TargetConstraints) {
18163 if (TC.Type == InlineAsm::isOutput) {
18165 const TargetRegisterClass *RC =
18166 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18167 TC.ConstraintVT)
18168 .second;
18169 if (RC && SIRI->isSGPRClass(RC))
18170 return true;
18171 }
18172 }
18173 }
18174 }
18176 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18177}
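// For example, a wave-size mask (i64 in wave64) produced by llvm.amdgcn.if
// and later consumed by llvm.amdgcn.end_cf is reported as requiring a
// uniform (SGPR) register, even when its uses span divergent blocks.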
18178
18180 for (SDUse &Use : N->uses()) {
18181 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18182 if (getBasePtrIndex(M) == Use.getOperandNo())
18183 return true;
18184 }
18185 }
18186 return false;
18187}
18188
18190 SDValue N1) const {
18191 if (!N0.hasOneUse())
18192 return false;
18193 // Take the opportunity to keep N0 uniform.
18194 if (N0->isDivergent() || !N1->isDivergent())
18195 return true;
18196 // Check if we have a good chance to form the memory access pattern with the
18197 // base and offset
18198 return (DAG.isBaseWithConstantOffset(N0) &&
18200}
18201
18203 Register N0, Register N1) const {
18204 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18205}
18206
18209 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18211 if (I.getMetadata("amdgpu.noclobber"))
18212 Flags |= MONoClobber;
18213 if (I.getMetadata("amdgpu.last.use"))
18214 Flags |= MOLastUse;
18215 return Flags;
18216}
18217
18219 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18220 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18221 if (User->getOpcode() != ISD::CopyToReg)
18222 return false;
18223 if (!Def->isMachineOpcode())
18224 return false;
18225 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18226 if (!MDef)
18227 return false;
18228
18229 unsigned ResNo = User->getOperand(Op).getResNo();
18230 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18231 return false;
18232 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18233 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18234 PhysReg = AMDGPU::SCC;
18235 const TargetRegisterClass *RC =
18236 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18237 Cost = RC->getCopyCost();
18238 return true;
18239 }
18240 return false;
18241}
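// checkForPhysRegDependency (above) exposes the implicit SCC dependency to the
// scheduler: when a machine-opcode compare that implicitly defines SCC feeds an
// i1 CopyToReg, it reports AMDGPU::SCC as the physical register together with
// the copy cost of its minimal register class.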
18242
18243void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18244 Instruction *AI) const {
18245 // Given: atomicrmw fadd ptr %addr, float %val ordering
18246 //
18247 // With this expansion we produce the following code:
18248 // [...]
18249 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18250 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18251 //
18252 // atomicrmw.shared:
18253 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18254 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18255 // float %val ordering
18256 // br label %atomicrmw.phi
18257 //
18258 // atomicrmw.check.private:
18259 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18260 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18261 //
18262 // atomicrmw.private:
18263 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18264 // %loaded.private = load float, ptr addrspace(5) %cast.private
18265 // %val.new = fadd float %loaded.private, %val
18266 // store float %val.new, ptr addrspace(5) %cast.private
18267 // br label %atomicrmw.phi
18268 //
18269 // atomicrmw.global:
18270 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18271 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18272 // float %val ordering
18273 // br label %atomicrmw.phi
18274 //
18275 // atomicrmw.phi:
18276 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18277 // [ %loaded.private, %atomicrmw.private ],
18278 // [ %loaded.global, %atomicrmw.global ]
18279 // br label %atomicrmw.end
18280 //
18281 // atomicrmw.end:
18282 // [...]
18283 //
18284 //
18285 // For 64-bit atomics which may reside in private memory, we perform a simpler
18286 // version that only inserts the private check, and uses the flat operation.
18287
18288 IRBuilder<> Builder(AI);
18289 LLVMContext &Ctx = Builder.getContext();
18290
18291 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18292 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18293 : AtomicCmpXchgInst::getPointerOperandIndex();
18294 Value *Addr = AI->getOperand(PtrOpIdx);
18295
18296 /// TODO: Only need to check private, then emit flat-known-not private (no
18297 /// need for shared block, or cast to global).
18298 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18299
18300 Align Alignment;
18301 if (RMW)
18302 Alignment = RMW->getAlign();
18303 else if (CX)
18304 Alignment = CX->getAlign();
18305 else
18306 llvm_unreachable("unhandled atomic operation");
18307
18308 // FullFlatEmulation is true if we need to issue the private, shared, and
18309 // global cases.
18310 //
18311 // If this is false, we are only dealing with the flat-targeting-private case,
18312 // where we only insert a check for private and still use the flat instruction
18313 // for global and shared.
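// For example, on a subtarget with hasAtomicFaddInsts(), a flat
// "atomicrmw fadd ptr %p, float %v" gets the full shared/private/global
// expansion sketched above, while other flat atomics reaching this point only
// receive the private check and keep using a flat instruction otherwise.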
18314
18315 bool FullFlatEmulation =
18316 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18317 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18318 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18319 RMW->getType()->isDoubleTy()));
18320
18321 // If the return value isn't used, do not introduce a false use in the phi.
18322 bool ReturnValueIsUsed = !AI->use_empty();
18323
18324 BasicBlock *BB = Builder.GetInsertBlock();
18325 Function *F = BB->getParent();
18326 BasicBlock *ExitBB =
18327 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18328 BasicBlock *SharedBB = nullptr;
18329
18330 BasicBlock *CheckPrivateBB = BB;
18331 if (FullFlatEmulation) {
18332 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18333 CheckPrivateBB =
18334 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18335 }
18336
18337 BasicBlock *PrivateBB =
18338 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18339 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18340 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18341
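// splitBasicBlock above left an unconditional branch to %atomicrmw.end at the
// end of the original block; erase it so the address-space dispatch emitted
// below becomes the block's terminator instead.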
18342 std::prev(BB->end())->eraseFromParent();
18343 Builder.SetInsertPoint(BB);
18344
18345 Value *LoadedShared = nullptr;
18346 if (FullFlatEmulation) {
18347 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18348 {Addr}, nullptr, "is.shared");
18349 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18350 Builder.SetInsertPoint(SharedBB);
18351 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18352 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
18353
18354 Instruction *Clone = AI->clone();
18355 Clone->insertInto(SharedBB, SharedBB->end());
18356 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18357 LoadedShared = Clone;
18358
18359 Builder.CreateBr(PhiBB);
18360 Builder.SetInsertPoint(CheckPrivateBB);
18361 }
18362
18363 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18364 {Addr}, nullptr, "is.private");
18365 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18366
18367 Builder.SetInsertPoint(PrivateBB);
18368
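// Private (scratch) memory has no atomic instructions, but it is only visible
// to the current lane, so a plain load-modify-store (or load-compare-store for
// cmpxchg) sequence is equivalent here.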
18369 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18370 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
18371
18372 Value *LoadedPrivate;
18373 if (RMW) {
18374 LoadedPrivate = Builder.CreateAlignedLoad(
18375 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18376
18377 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
18378 LoadedPrivate, RMW->getValOperand());
18379
18380 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18381 } else {
18382 auto [ResultLoad, Equal] =
18383 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
18384 CX->getNewValOperand(), CX->getAlign());
18385
18386 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
18387 ResultLoad, 0);
18388 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18389 }
18390
18391 Builder.CreateBr(PhiBB);
18392
18393 Builder.SetInsertPoint(GlobalBB);
18394
18395 // Continue using a flat instruction if we only emitted the check for private.
18396 Instruction *LoadedGlobal = AI;
18397 if (FullFlatEmulation) {
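// In the full emulation, both the is.shared and is.private checks have failed
// on this path, so the flat pointer is known to address global memory and the
// addrspacecast below is valid; the atomic moved into %atomicrmw.global can
// then use global atomic instructions.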
18398 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18399 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
18400 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
18401 }
18402
18403 AI->removeFromParent();
18404 AI->insertInto(GlobalBB, GlobalBB->end());
18405
18406 // The new atomicrmw may go through another round of legalization later.
18407 if (!FullFlatEmulation) {
18408 // We inserted the runtime check already, make sure we do not try to
18409 // re-expand this.
18410 // TODO: Should union with any existing metadata.
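// The !noalias.addrspace range [AMDGPUAS::PRIVATE_ADDRESS,
// AMDGPUAS::PRIVATE_ADDRESS + 1) built below records that this atomic cannot
// access private memory, which is what stops AtomicExpand from inserting the
// same runtime check again.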
18411 MDBuilder MDB(F->getContext());
18412 MDNode *RangeNotPrivate =
18413 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
18414 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
18415 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18416 RangeNotPrivate);
18417 }
18418
18419 Builder.CreateBr(PhiBB);
18420
18421 Builder.SetInsertPoint(PhiBB);
18422
18423 if (ReturnValueIsUsed) {
18424 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
18425 AI->replaceAllUsesWith(Loaded);
18426 if (FullFlatEmulation)
18427 Loaded->addIncoming(LoadedShared, SharedBB);
18428 Loaded->addIncoming(LoadedPrivate, PrivateBB);
18429 Loaded->addIncoming(LoadedGlobal, GlobalBB);
18430 Loaded->takeName(AI);
18431 }
18432
18433 Builder.CreateBr(ExitBB);
18434}
18435
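// emitExpandAtomicRMW (below) first rewrites identity sub/or/xor with a zero
// operand into "add 0" so later handling is uniform, then applies the
// flat-address-space predicate expansion above when the pointer may alias
// private memory (i.e. it lives in the flat address space).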
18436void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
18437 AtomicRMWInst::BinOp Op = AI->getOperation();
18438
18439 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18440 Op == AtomicRMWInst::Xor) {
18441 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
18442 ConstVal && ConstVal->isNullValue()) {
18443 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
18444 AI->setOperation(AtomicRMWInst::Add);
18445
18446 // We may still need the private-alias-flat handling below.
18447
18448 // TODO: Skip this for cases where we cannot access remote memory.
18449 }
18450 }
18451
18452 // The non-flat expansions should only perform the de-canonicalization of
18453 // identity values.
18454 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
18455 return;
18456
18457 emitExpandAtomicAddrSpacePredicate(AI);
18458}
18459
18460void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
18461 emitExpandAtomicAddrSpacePredicate(CI);
18462}
18463
18464LoadInst *
18465SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
18466 IRBuilder<> Builder(AI);
18467 auto Order = AI->getOrdering();
18468
18469 // The optimization removes the store aspect of the atomicrmw. Therefore,
18470 // the cache must be flushed if the atomic ordering had release semantics;
18471 // that flush is not necessarily a fence, but a release fence happens to
18472 // perform it. So avoid replacing an atomicrmw that has release semantics.
18473 if (isReleaseOrStronger(Order))
18474 return nullptr;
18475
18476 LoadInst *LI = Builder.CreateAlignedLoad(
18477 AI->getType(), AI->getPointerOperand(), AI->getAlign());
18478 LI->setAtomic(Order, AI->getSyncScopeID());
18479 LI->copyMetadata(*AI);
18480 LI->takeName(AI);
18481 AI->replaceAllUsesWith(LI);
18482 AI->eraseFromParent();
18483 return LI;
18484}
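// Example of the rewrite performed by lowerIdempotentRMWIntoFencedLoad above
// (illustrative IR; the alignment and ordering come from the original
// instruction, and release-or-stronger orderings are left untouched):
//   %old = atomicrmw or ptr %p, i32 0 acquire
// becomes
//   %old = load atomic i32, ptr %p acquire, align 4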
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Addr
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1234
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1231
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static uint32_t getIdentityValueForWaveReduction(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasBF16PackedInsts() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasBF16TransInsts() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1497
bool isNegative() const
Definition: APFloat.h:1449
bool isNormal() const
Definition: APFloat.h:1453
APInt bitcastToAPInt() const
Definition: APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1079
bool isInfinity() const
Definition: APFloat.h:1446
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition: APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:339
const Function * getParent() const
Definition: Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:645
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:549
static unsigned getPointerOperandIndex()
Definition: Instructions.h:636
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:843
static unsigned getPointerOperandIndex()
Definition: Instructions.h:888
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:721
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ FSub
*p = old - v
Definition: Instructions.h:749
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
Value * getPointerOperand()
Definition: Instructions.h:886
void setOperation(BinOp Operation)
Definition: Instructions.h:837
BinOp getOperation() const
Definition: Instructions.h:819
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:877
Value * getValOperand()
Definition: Instructions.h:890
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:863
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:894
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:386
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator end()
Definition: BasicBlock.h:472
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:555
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:32
static ByteProvider getConstantZero()
Definition: ByteProvider.h:67
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:60
std::optional< ISelOp > Src
Definition: ByteProvider.h:51
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1458
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
unsigned arg_size() const
Definition: InstrTypes.h:1290
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
bool isSigned() const
Definition: InstrTypes.h:932
bool isFPPredicate() const
Definition: InstrTypes.h:784
bool isIntPredicate() const
Definition: InstrTypes.h:785
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:842
bool isBigEndian() const
Definition: DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
iterator_range< arg_iterator > args()
Definition: Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:803
Argument * getArg(unsigned i) const
Definition: Function.h:884
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:948
bool hasD16Images() const
Definition: GCNSubtarget.h:751
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:910
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:522
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:513
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:906
bool hasDot7Insts() const
Definition: GCNSubtarget.h:850
bool hasApertureRegs() const
Definition: GCNSubtarget.h:648
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:678
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:902
bool hasIEEEMinimumMaximumInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasRelaxedBufferOOBMode() const
Definition: GCNSubtarget.h:646
bool hasDLInsts() const
Definition: GCNSubtarget.h:820
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:454
bool hasMAIInsts() const
Definition: GCNSubtarget.h:878
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:955
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:731
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:572
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:630
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:308
bool hasDot1Insts() const
Definition: GCNSubtarget.h:826
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:918
bool hasSafeSmemPrefetch() const
bool hasPkMovB32() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:500
bool enableFlatScratch() const
Definition: GCNSubtarget.h:703
bool hasMadF16() const
bool hasMin3Max3PKF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:674
bool hasFmaMixBF16Insts() const
Definition: GCNSubtarget.h:478
bool hasVMemToLDSLoad() const
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:506
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:938
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:320
bool hasMad64_32() const
Definition: GCNSubtarget.h:796
bool useDS128() const
Definition: GCNSubtarget.h:582
bool hasBVHDualAndBVH8Insts() const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
bool hasGloballyAddressableScratch() const
bool has64BitLiterals() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:502
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:312
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:894
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:470
bool hasIntClamp() const
Definition: GCNSubtarget.h:400
bool hasGFX10_AEncoding() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool hasEmulatedSystemScopeAtomics() const
Definition: GCNSubtarget.h:961
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:420
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:652
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:682
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:785
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:375
bool hasIntMinMax64() const
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:991
bool hasFFBL() const
Definition: GCNSubtarget.h:458
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:604
bool hasVmemPrefInsts() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:898
bool hasMed3_16() const
Definition: GCNSubtarget.h:466
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:638
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:912
bool hasBFI() const
Definition: GCNSubtarget.h:446
bool isWave32() const
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:622
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:383
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:862
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:567
bool hasFFBH() const
Definition: GCNSubtarget.h:462
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:914
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:922
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:934
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:920
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:942
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:592
bool hasDot8Insts() const
Definition: GCNSubtarget.h:854
bool hasVectorMulU64() const
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:587
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:576
Generation getGeneration() const
Definition: GCNSubtarget.h:356
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:926
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:783
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:787
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:930
bool hasAddr64() const
Definition: GCNSubtarget.h:424
bool isWave64() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:474
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:779
bool hasFractBug() const
Definition: GCNSubtarget.h:438
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:442
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:766
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:513
unsigned getAddressSpace() const
Definition: GlobalValue.h:207
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
Type * getValueType() const
Definition: GlobalValue.h:298
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1197
LLVMContext & getContext() const
Definition: IRBuilder.h:203
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1191
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2209
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:90
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:406
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:82
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1718
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:265
constexpr bool isScalar() const
Definition: LowLevelType.h:147
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:43
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:58
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:191
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:219
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:180
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:265
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:96
Metadata node.
Definition: Metadata.h:1077
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
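A minimal sketch of CreateFixedObject, e.g. for an incoming stack argument slot; the helper, size, and immutability choice are illustrative assumptions:
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Create a 4-byte immutable object at a fixed offset from the incoming SP.
static int createIncomingArgSlot(MachineFunction &MF, int64_t SPOffset) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.CreateFixedObject(/*Size=*/4, SPOffset, /*IsImmutable=*/true);
}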
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
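A minimal sketch of the MachineInstrBuilder chaining listed above, using generic opcodes so it stays target-neutral; the function and operands are placeholders:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

static void emitCopyAndBranch(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const DebugLoc &DL, const TargetInstrInfo *TII,
                              Register Dst, Register Src,
                              MachineBasicBlock *Target) {
  // Dst = COPY Src
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), Dst).addReg(Src);
  // Unconditional branch to Target (a real target would use its own opcode).
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::G_BR)).addMBB(Target);
}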
Representation of each machine instruction.
Definition: MachineInstr.h:72
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
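A minimal sketch of combining the flags above with the getMachineMemOperand overload listed earlier; the pointer info, access size, and alignment are placeholders:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Describe a 4-byte aligned, invariant 32-bit load.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo) {
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Align(4));
}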
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:218
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
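A minimal sketch of wiring up a PHI with addIncoming; the helper and its parameters are illustrative assumptions:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Merge two values coming from two predecessor blocks.
static PHINode *mergeValues(IRBuilder<> &Builder, Type *Ty, Value *TrueV,
                            BasicBlock *TrueBB, Value *FalseV,
                            BasicBlock *FalseBB) {
  PHINode *Phi = Builder.CreatePHI(Ty, /*NumReservedValues=*/2);
  Phi->addIncoming(TrueV, TrueBB);
  Phi->addIncoming(FalseV, FalseBB);
  return Phi;
}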
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1885
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:78
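A minimal sketch of the Register queries above; the index is arbitrary:
#include "llvm/CodeGen/Register.h"
using namespace llvm;

static void registerExample() {
  Register VReg = Register::index2VirtReg(0); // first virtual register
  bool IsPhys = VReg.isPhysical();            // false: it is a virtual register
  (void)IsPhys;
}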
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
bool isInlineConstant(const APInt &Imm) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target-independent instructions that remain after instruction selection.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to strict FP calls.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint, return the register number and the register class for the register.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap".
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:578
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
const Pass * getPass() const
Definition: SelectionDAG.h:494
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:719
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:885
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:587
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
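A minimal sketch of building nodes with the SelectionDAG helpers listed above, in the style of a custom lowering routine; the function is hypothetical and assumes Op is an integer value:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Return (add Op, 1) built with getConstant and getNode.
static SDValue lowerExampleAddOne(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue One = DAG.getConstant(1, DL, VT);
  return DAG.getNode(ISD::ADD, DL, VT, Op, One);
}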
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
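A minimal sketch of the SmallVector operations listed above:
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static void smallVectorExample() {
  SmallVector<int, 4> Vals; // inline storage for 4 elements before heap growth
  Vals.push_back(1);
  Vals.push_back(2);
  SmallVector<int, 4> More = {3, 4};
  Vals.append(More.begin(), More.end()); // Vals is now {1, 2, 3, 4}
  Vals.resize(8);                        // value-initializes the new elements
}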
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:862
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
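A minimal sketch of StringSwitch, e.g. for classifying an inline-asm constraint string; the mapping shown is a placeholder, not this file's actual logic:
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

static unsigned classifyConstraint(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("s", 1) // scalar register constraint
      .Case("v", 2) // vector register constraint
      .Default(0);  // unknown
}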
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction in which the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
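A minimal sketch of how a TargetLowering subclass constructor typically uses the configuration hooks listed above; ExampleTLI and its particular choices are hypothetical and do not reproduce this file's setup:
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

class ExampleTLI : public TargetLowering {
public:
  explicit ExampleTLI(const TargetMachine &TM) : TargetLowering(TM) {
    setBooleanContents(ZeroOrOneBooleanContent);
    // f64 division gets custom lowering; i64 -> i32 truncating stores expand.
    setOperationAction(ISD::FDIV, MVT::f64, Custom);
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    // Request PerformDAGCombine callbacks for these generic nodes.
    setTargetDAGCombine({ISD::ADD, ISD::FADD});
  }
};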
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint, return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; only the DemandedBits bits of its result are ever used, so try to simplify it.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:417
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:311
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:258
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM_ABI void set(Value *Val)
Definition: Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition: User.h:245
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1098
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:396
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr bool isZero() const
Definition: TypeSize.h:157
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:82
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
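A minimal sketch of branching on the address spaces enumerated above; the predicate is hypothetical, and the header is assumed to provide the AMDGPUAS numbering:
#include "AMDGPU.h" // assumed to pull in the AMDGPUAS address-space enum
using namespace llvm;

// LOCAL_ADDRESS is LDS, REGION_ADDRESS is GDS.
static bool isLDSOrGDS(unsigned AS) {
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}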
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point use.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:291
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:256
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1386
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1108
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1379
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1381
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1382
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1364
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1568
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1384
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1541
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1298
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1157
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Overflow-aware nodes for subtraction; the subtraction counterparts of [SU]ADDO.
Definition: ISDOpcodes.h:347
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1380
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1387
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest, ties to even, 2 = Round to ...
Definition: ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1126
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1103
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1383
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ SMULO
Overflow-aware nodes for multiplication; the multiplication counterparts of [SU]ADDO.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1207
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1391
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1204
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1390
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1718
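A minimal sketch of how these condition-code helpers behave; the function name and the header placement of ISD::getSetCCSwappedOperands are assumptions of this example, not taken from the file above:
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"   // assumed home of ISD::getSetCCSwappedOperands
  #include <cassert>
  void probeCondCodes() {
    using namespace llvm;
    // (Y < X) is the same predicate as (X > Y).
    assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
    // Signed vs. unsigned integer comparisons.
    assert(ISD::isSignedIntSetCC(ISD::SETLT) && !ISD::isSignedIntSetCC(ISD::SETULT));
  }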
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1685
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1665
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
Definition: Intrinsics.cpp:743
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
Definition: Intrinsics.cpp:762
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
Definition: Intrinsics.cpp:596
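For illustration only, a small sketch of these Intrinsic:: queries for a non-overloaded intrinsic (llvm.trap); overloaded intrinsics additionally take a type list, and the function name here is hypothetical:
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"
  void queryTrapIntrinsic(llvm::Module &M) {
    using namespace llvm;
    LLVMContext &Ctx = M.getContext();
    FunctionType *FT = Intrinsic::getType(Ctx, Intrinsic::trap);   // void ()
    (void)FT;
    // Non-null only if the module already references llvm.trap.
    if (Function *F = Intrinsic::getDeclarationIfExists(&M, Intrinsic::trap))
      (void)F;
  }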
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1695
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:58
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double inv_pi
Definition: MathExtras.h:54
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:241
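A short hedged example of the predicate-to-CondCode mapping (getFCmpCondCode, listed further below, plays the same role for floating-point predicates); the wrapper name is illustrative:
  #include "llvm/CodeGen/Analysis.h"   // getICmpCondCode / getFCmpCondCode
  #include "llvm/IR/Instructions.h"
  #include <cassert>
  void probePredicateMapping() {
    using namespace llvm;
    assert(getICmpCondCode(ICmpInst::ICMP_SLT) == ISD::SETLT);
    assert(getICmpCondCode(ICmpInst::ICMP_UGE) == ISD::SETUGE);
    assert(getFCmpCondCode(FCmpInst::FCMP_OLT) == ISD::SETOLT);
  }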
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:870
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:551
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:203
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:207
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
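To make the behaviour of several of the MathExtras/bit.h helpers listed in this index concrete, a minimal sketch (not taken from this file; the function name is made up):
  #include "llvm/ADT/bit.h"            // popcount, bit_width, countr_zero, countl_zero
  #include "llvm/Support/MathExtras.h" // minIntN, maxIntN, alignDown, PowerOf2Ceil, ...
  #include <cassert>
  #include <cstdint>
  void probeMathHelpers() {
    using namespace llvm;
    assert(maxIntN(8) == 127 && minIntN(8) == -128);
    assert(popcount(0xF0u) == 4);
    assert(bit_width(16u) == 5);                                  // 16 needs 5 bits
    assert(countr_zero(0x8u) == 3 && countl_zero(uint32_t(1)) == 31);
    assert(alignDown(29, 8) == 24 && PowerOf2Ceil(33) == 64);
    assert(isShiftedMask_64(0x0FF0) && !isShiftedMask_64(0));     // contiguous run of ones
    assert(Log2_32(32) == 5 && isPowerOf2_32(64) && !isPowerOf2_32(48));
    assert(Hi_32(0x1234567800000042ULL) == 0x12345678u && Lo_32(0x1234567800000042ULL) == 0x42u);
  }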
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
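As a hedged sketch of what this lowering helper produces (the wrapper name is illustrative), expanding an atomicrmw add with buildAtomicRMWValue amounts to emitting a plain add on the loaded value:
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LowerAtomic.h"
  llvm::Value *expandRMWAdd(llvm::IRBuilderBase &Builder, llvm::Value *Loaded,
                            llvm::Value *Val) {
    // For AtomicRMWInst::Add this reduces to Builder.CreateAdd(Loaded, Val).
    return llvm::buildAtomicRMWValue(llvm::AtomicRMWInst::Add, Builder, Loaded, Val);
  }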
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:241
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
@ DS_Warning
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
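A minimal illustration (not from this file; the function name is hypothetical) of the range helpers listed in this index:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include <cassert>
  void probeRangeHelpers() {
    using namespace llvm;
    SmallVector<int, 8> Vals = {1, 2, 3, 4};
    assert(any_of(Vals, [](int V) { return V > 3; }));
    assert(is_contained(Vals, 2));
    assert(*find_if(Vals, [](int V) { return V % 2 == 0; }) == 2);
    SmallVector<int, 8> Tail;
    append_range(Tail, drop_begin(Vals));            // Tail == {2, 3, 4}
    for (auto [A, B] : zip_equal(Vals, Vals))        // lockstep iteration over equal-length ranges
      assert(A == B);
  }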
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
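A small hedged sketch of the alignment helpers (Align, alignTo, commonAlignment) listed in this index; the function name is illustrative:
  #include "llvm/Support/Alignment.h"
  #include <cassert>
  void probeAlignment() {
    using namespace llvm;
    assert(alignTo(10, Align(8)) == 16);                    // round 10 up to a multiple of 8
    assert(commonAlignment(Align(16), /*Offset=*/8) == Align(8));
    assert(Align(32).value() == 32);
  }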
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
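To illustrate the EVT queries above, a hedged sketch assuming an LLVMContext is at hand (the function name is made up for this example):
  #include "llvm/CodeGen/ValueTypes.h"
  #include <cassert>
  void probeEVT(llvm::LLVMContext &Ctx) {
    using namespace llvm;
    EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4);             // v4f32
    assert(VT.isVector() && VT.isFloatingPoint());
    assert(VT.getVectorNumElements() == 4 && VT.getScalarSizeInBits() == 32);
    assert(VT.getStoreSize().getFixedValue() == 16);         // store size in bytes
    EVT IntVT = VT.changeTypeToInteger();                    // v4i32
    assert(IntVT.isInteger() && IntVT.bitsEq(VT));
  }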
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:241
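And a short hedged sketch of the KnownBits interface listed just above (illustrative function name; values chosen for the example):
  #include "llvm/Support/KnownBits.h"
  void probeKnownBits() {
    using namespace llvm;
    KnownBits LHS(8), RHS(8);
    LHS.Zero.setBitsFrom(4);                 // both operands known to be < 16
    RHS.Zero.setBitsFrom(4);
    KnownBits Sum = KnownBits::add(LHS, RHS);
    (void)Sum.countMinLeadingZeros();        // at least 3: the sum is known to be < 32
    KnownBits Wide = Sum.zext(16);           // zero extension keeps the high known-zero bits
    (void)Wide;
  }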
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals