1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
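// A minimal usage sketch (hypothetical, not taken from this file): the helper
// is meant to be queried against a CCState after the fixed arguments have been
// allocated, e.g.
//   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
//   // ... allocate the known user/system SGPR arguments ...
//   unsigned ScratchSGPR = findFirstFreeSGPR(CCInfo); // first unallocated SGPR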
93
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
 171 // Unless there are also VOP3P operations, no operations are really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
200
201 // We need to custom lower vector stores from local memory
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
226 ISD::SETCC}) {
227 // FIXME: The promoted to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
231
233
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
353
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
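  // Rough sketch of the effect of the promotion above (illustrative, not a
  // literal DAG dump): a v2i64 BUILD_VECTOR/EXTRACT/INSERT is performed in the
  // v4i32 type with bitcasts inserted around it, so only the v4i32 forms need
  // to be selectable.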
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
443 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
444 Custom);
445
446 // Avoid stack access for these.
447 // TODO: Generalize to more vector types.
449 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
450 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
451 Custom);
452
453 // Deal with vec3 vector operations when widened to vec4.
455 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
456
457 // Deal with vec5/6/7 vector operations when widened to vec8.
459 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
460 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
461 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
462 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
463 Custom);
464
465 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
466 // and output demarshalling
467 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
468
469 // We can't return success/failure, only the old value,
470 // let LLVM add the comparison
472 Expand);
473
474 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
475
476 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
477
478 // FIXME: This should be narrowed to i32, but that only happens if i64 is
479 // illegal.
480 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
481 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
482
 483 // This is s_memtime on SI and s_memrealtime on VI.
485
486 if (Subtarget->hasSMemRealTime() ||
490
491 if (Subtarget->has16BitInsts()) {
494 } else {
496 }
497
498 if (Subtarget->hasMadMacF32Insts())
500
501 if (!Subtarget->hasBFI())
502 // fcopysign can be done in a single instruction with BFI.
503 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
504
505 if (!Subtarget->hasBCNT(32))
507
508 if (!Subtarget->hasBCNT(64))
510
511 if (Subtarget->hasFFBH())
513
514 if (Subtarget->hasFFBL())
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
525 if (Subtarget->hasBFE())
527
528 // Clamp modifier on add/sub
529 if (Subtarget->hasIntClamp())
531
532 if (Subtarget->hasAddNoCarry())
533 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
534 Legal);
535
538 {MVT::f32, MVT::f64}, Custom);
539
540 // These are really only legal for ieee_mode functions. We should be avoiding
541 // them for functions that don't have ieee_mode enabled, so just say they are
542 // legal.
544 {MVT::f32, MVT::f64}, Legal);
545
546 if (Subtarget->haveRoundOpsF64())
548 Legal);
549 else
551 MVT::f64, Custom);
552
554 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
555 Legal);
556 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
557
560
561 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563
564 // Custom lower these because we can't specify a rule based on an illegal
565 // source bf16.
568
569 if (Subtarget->has16BitInsts()) {
572 MVT::i16, Legal);
573
574 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
575
577 MVT::i16, Expand);
578
582 ISD::CTPOP},
583 MVT::i16, Promote);
584
586
587 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
588
590 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
592 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
593
597
599
600 // F16 - Constant Actions.
603
604 // F16 - Load/Store Actions.
606 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
608 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
609
610 // BF16 - Load/Store Actions.
612 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
614 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
615
616 // F16 - VOP1 Actions.
619 MVT::f16, Custom);
620
621 // BF16 - VOP1 Actions.
622 if (Subtarget->hasBF16TransInsts())
624
627
628 // F16 - VOP2 Actions.
629 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
630 Expand);
634
635 // F16 - VOP3 Actions.
637 if (STI.hasMadF16())
639
640 for (MVT VT :
641 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
642 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
643 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
644 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
645 switch (Op) {
646 case ISD::LOAD:
647 case ISD::STORE:
649 case ISD::BITCAST:
650 case ISD::UNDEF:
655 case ISD::IS_FPCLASS:
656 break;
660 break;
661 default:
663 break;
664 }
665 }
666 }
667
668 // v_perm_b32 can handle either of these.
669 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
671
672 // XXX - Do these do anything? Vector constants turn into build_vector.
673 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
674
675 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
676 Legal);
677
679 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
681 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
682
684 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
686 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
687
688 setOperationAction(ISD::AND, MVT::v2i16, Promote);
689 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
690 setOperationAction(ISD::OR, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
694
696 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
698 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
699 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
701
703 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
705 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
707 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
708
710 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
712 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
713 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720
722 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
724 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
726 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
730 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
732 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
733 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
734
736 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
738 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
739 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
741
742 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
744 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
746 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
748
750 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
752 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
753 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
755
757 MVT::v2i32, Expand);
759
761 MVT::v4i32, Expand);
762
764 MVT::v8i32, Expand);
765
766 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
767 Subtarget->hasVOP3PInsts() ? Legal : Custom);
768
769 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
770 // This isn't really legal, but this avoids the legalizer unrolling it (and
771 // allows matching fneg (fabs x) patterns)
772 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
773
774 // Can do this in one BFI plus a constant materialize.
776 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
777 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
778 MVT::v32f16, MVT::v32bf16},
779 Custom);
780
783 MVT::f16, Custom);
785
788 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
789 Custom);
790
792 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 Expand);
794
795 for (MVT Vec16 :
796 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
797 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
800 Vec16, Custom);
802 }
803 }
804
805 if (Subtarget->hasVOP3PInsts()) {
809 MVT::v2i16, Legal);
810
813 MVT::v2f16, Legal);
814
816 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
817
819 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
820 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
821 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
822 Custom);
823
824 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
825 // Split vector operations.
830 VT, Custom);
831
832 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
833 // Split vector operations.
835 VT, Custom);
836
839 {MVT::v2f16, MVT::v4f16}, Custom);
840
841 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
842 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
843 Custom);
844
845 if (Subtarget->hasPackedFP32Ops()) {
847 MVT::v2f32, Legal);
849 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
850 Custom);
851 }
852 }
853
855
856 if (Subtarget->has16BitInsts()) {
858 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
860 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
861 } else {
862 // Legalization hack.
863 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
864
866 }
867
869 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
870 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
871 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
872 MVT::v32f16, MVT::v32bf16},
873 Custom);
874
876
877 if (Subtarget->hasVectorMulU64())
879 else if (Subtarget->hasScalarSMulU64())
881
882 if (Subtarget->hasMad64_32())
884
885 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
887
888 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
890 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
891 } else {
892 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
893 if (Subtarget->hasMinimum3Maximum3F32())
895
896 if (Subtarget->hasMinimum3Maximum3PKF16()) {
898
899 // If only the vector form is available, we need to widen to a vector.
900 if (!Subtarget->hasMinimum3Maximum3F16())
902 }
903 }
904
905 if (Subtarget->hasVOP3PInsts()) {
906 // We want to break these into v2f16 pieces, not scalarize.
908 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
909 Custom);
910 }
911
912 if (Subtarget->hasIntMinMax64())
914 Legal);
915
917 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
918 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
919 MVT::i8},
920 Custom);
921
923 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
924 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
925 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
926 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
927 Custom);
928
930 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
931 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
932 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
933 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
934 Custom);
935
941
942 // TODO: Could move this to custom lowering, could benefit from combines on
943 // extract of relevant bits.
945
947
948 if (Subtarget->hasBF16ConversionInsts()) {
949 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
951 }
952
953 if (Subtarget->hasBF16PackedInsts()) {
956 MVT::v2bf16, Legal);
957 }
958
959 if (Subtarget->hasBF16TransInsts()) {
961 }
962
963 if (Subtarget->hasCvtPkF16F32Inst()) {
965 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
966 Custom);
967 }
968
972 ISD::SUB,
974 ISD::MUL,
975 ISD::FADD,
976 ISD::FSUB,
977 ISD::FDIV,
978 ISD::FMUL,
987 ISD::FMA,
988 ISD::SMIN,
989 ISD::SMAX,
990 ISD::UMIN,
991 ISD::UMAX,
994 ISD::SMIN,
995 ISD::SMAX,
996 ISD::UMIN,
997 ISD::UMAX,
998 ISD::AND,
999 ISD::OR,
1000 ISD::XOR,
1001 ISD::SHL,
1002 ISD::SRL,
1003 ISD::SRA,
1004 ISD::FSHR,
1014
1015 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1017
1018 // All memory operations. Some folding on the pointer operand is done to help
1019 // matching the constant offsets in the addressing modes.
1021 ISD::STORE,
1044
1045 // FIXME: In other contexts we pretend this is a per-function property.
1047
1049}
1050
1051const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1052
1054 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1055 return RCRegs;
1056}
1057
1058//===----------------------------------------------------------------------===//
1059// TargetLowering queries
1060//===----------------------------------------------------------------------===//
1061
1062// v_mad_mix* support a conversion from f16 to f32.
1063//
1065// There is only one special case, which we don't currently handle, where
1066// this is OK to use when denormals are enabled.
1066bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1067 EVT DestVT, EVT SrcVT) const {
1068 return DestVT.getScalarType() == MVT::f32 &&
1069 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1070 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1071 SrcVT.getScalarType() == MVT::f16) ||
1072 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1073 SrcVT.getScalarType() == MVT::bf16)) &&
1074 // TODO: This probably only requires no input flushing?
1076}
1077
1079 LLT DestTy, LLT SrcTy) const {
1080 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1081 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1082 DestTy.getScalarSizeInBits() == 32 &&
1083 SrcTy.getScalarSizeInBits() == 16 &&
1084 // TODO: This probably only requires no input flushing?
1085 denormalModeIsFlushAllF32(*MI.getMF());
1086}
1087
1089 // SI has some legal vector types, but no legal vector operations. Say no
1090 // shuffles are legal in order to prefer scalarizing some vector operations.
1091 return false;
1092}
1093
1095 CallingConv::ID CC,
1096 EVT VT) const {
1099
1100 if (VT.isVector()) {
1101 EVT ScalarVT = VT.getScalarType();
1102 unsigned Size = ScalarVT.getSizeInBits();
1103 if (Size == 16) {
1104 if (Subtarget->has16BitInsts()) {
1105 if (VT.isInteger())
1106 return MVT::v2i16;
1107 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1108 }
1109 return VT.isInteger() ? MVT::i32 : MVT::f32;
1110 }
1111
1112 if (Size < 16)
1113 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1114 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1115 }
1116
1117 if (VT.getSizeInBits() > 32)
1118 return MVT::i32;
1119
1121}
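// Illustrative examples (assuming a non-kernel calling convention): with
// 16-bit instructions a <4 x half> argument is carried in v2f16 registers, a
// <4 x bfloat> falls back to i32, and 64-bit scalar elements (e.g. the lanes
// of a <2 x i64>) are carried in i32 registers.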
1122
1124 CallingConv::ID CC,
1125 EVT VT) const {
1128
1129 if (VT.isVector()) {
1130 unsigned NumElts = VT.getVectorNumElements();
1131 EVT ScalarVT = VT.getScalarType();
1132 unsigned Size = ScalarVT.getSizeInBits();
1133
1134 // FIXME: Should probably promote 8-bit vectors to i16.
1135 if (Size == 16 && Subtarget->has16BitInsts())
1136 return (NumElts + 1) / 2;
1137
1138 if (Size <= 32)
1139 return NumElts;
1140
1141 if (Size > 32)
1142 return NumElts * ((Size + 31) / 32);
1143 } else if (VT.getSizeInBits() > 32)
1144 return (VT.getSizeInBits() + 31) / 32;
1145
1147}
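// Illustrative examples (assuming a non-kernel calling convention): a
// <3 x half> argument with 16-bit instructions occupies (3 + 1) / 2 = 2
// registers, a <5 x i32> occupies 5, and a <3 x i64> occupies 3 * 2 = 6
// 32-bit registers.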
1148
1150 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1151 unsigned &NumIntermediates, MVT &RegisterVT) const {
1152 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1157 // support, but unless we can properly handle 3-vectors, it will still be
1158 // inconsistent.
1159 if (Size == 16 && Subtarget->has16BitInsts()) {
1160 if (ScalarVT == MVT::bf16) {
1161 RegisterVT = MVT::i32;
1162 IntermediateVT = MVT::v2bf16;
1163 } else {
1164 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1165 IntermediateVT = RegisterVT;
1166 }
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1169 }
1170
1171 if (Size == 32) {
1172 RegisterVT = ScalarVT.getSimpleVT();
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1176 }
1177
1178 if (Size < 16 && Subtarget->has16BitInsts()) {
1179 // FIXME: Should probably form v2i16 pieces
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size != 16 && Size <= 32) {
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1191 }
1192
1193 if (Size > 32) {
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((Size + 31) / 32);
1197 return NumIntermediates;
1198 }
1199 }
1200
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1203}
1204
1206 const DataLayout &DL, Type *Ty,
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1209
1210 LLVMContext &Ctx = Ty->getContext();
1211 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1213 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1214 NumElts);
1215 }
1216
1217 return TLI.getValueType(DL, Ty);
1218}
1219
1220// Peek through TFE struct returns to only use the data size.
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
1224 auto *ST = dyn_cast<StructType>(Ty);
1225 if (!ST)
1226 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1227
1228 // TFE intrinsics return an aggregate type.
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1231 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1232}
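// Example of the TFE peek-through (illustrative): an image load declared to
// return { <4 x float>, i32 } with TFE enabled reports a memVT based only on
// the <4 x float> data member (possibly trimmed further by the dmask), not on
// the status dword.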
1233
1234/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1235/// in-memory representation. This return value is a custom type because there
1236/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1237/// could cause issues during codegen, these address space 7 pointers will be
1238/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1239/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1240/// for cost modeling, to work. (This also sets us up decently for doing the
1241/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1243 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1244 return MVT::amdgpuBufferFatPointer;
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1249}
1250/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1251/// v8i32 when padding is added.
1252/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1253/// also v8i32 with padding.
1255 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1256 DL.getPointerSizeInBits(AS) == 160) ||
1258 DL.getPointerSizeInBits(AS) == 192))
1259 return MVT::v8i32;
1261}
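// For illustration (assuming these hooks back the generic TargetLowering
// pointer-type queries): a pre-codegen cost query on a buffer fat pointer
// would roughly see
//   TLI.getPointerTy(DL, AMDGPUAS::BUFFER_FAT_POINTER)    -> amdgpuBufferFatPointer
//   TLI.getPointerMemTy(DL, AMDGPUAS::BUFFER_FAT_POINTER) -> v8i32
// i.e. a custom value type for the 160-bit pointer in registers and the padded
// {p8, i32} layout in memory.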
1262
1263static unsigned getIntrMemWidth(unsigned IntrID) {
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1267 return 8;
1268 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1269 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1270 return 32;
1271 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1272 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1273 return 64;
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1276 return 128;
1277 default:
1278 llvm_unreachable("Unknown width");
1279 }
1280}
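// For example (directly reflecting the switch above), the *_b64 async LDS
// load/store intrinsics report a width of 64, which the callers below turn
// into an i64 memVT for the MachineMemOperand.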
1281
1283 const CallInst &CI,
1284 MachineFunction &MF,
1285 unsigned IntrID) const {
1287 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1289 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1291 Info.flags |= getTargetMMOFlags(CI);
1292
1293 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1295 AttributeSet Attr =
1297 MemoryEffects ME = Attr.getMemoryEffects();
1298 if (ME.doesNotAccessMemory())
1299 return false;
1300
1301 // TODO: Should images get their own address space?
1302 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1303
1304 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1305 if (RsrcIntr->IsImage) {
1308 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1309 Info.align.reset();
1310 }
1311
1312 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1313 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1314 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1315 // We conservatively set the memory operand of a buffer intrinsic to the
1316 // base resource pointer, so that we can access alias information about
1317 // those pointers. Cases like "this points at the same value
1318 // but with a different offset" are handled in
1319 // areMemAccessesTriviallyDisjoint.
1320 Info.ptrVal = RsrcArg;
1321 }
1322
1323 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1324 if (!IsSPrefetch) {
1325 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1326 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1328 }
1329
1331 if (ME.onlyReadsMemory()) {
1332 if (RsrcIntr->IsImage) {
1333 unsigned MaxNumLanes = 4;
1334
1335 if (!BaseOpcode->Gather4) {
1336 // If this isn't a gather, we may have excess loaded elements in the
1337 // IR type. Check the dmask for the real number of elements loaded.
1338 unsigned DMask =
1339 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1340 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1341 }
1342
1343 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1344 CI.getType(), MaxNumLanes);
1345 } else {
1346 Info.memVT =
1348 std::numeric_limits<unsigned>::max());
1349 }
1350
1351 // FIXME: What does alignment mean for an image?
1354 } else if (ME.onlyWritesMemory()) {
1356
1357 Type *DataTy = CI.getArgOperand(0)->getType();
1358 if (RsrcIntr->IsImage) {
1359 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1360 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1361 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1362 DMaskLanes);
1363 } else
1364 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1365
1367 } else {
1368 // Atomic, NoReturn Sampler or prefetch
1371 Info.flags |=
1373
1374 if (!IsSPrefetch)
1376
1377 switch (IntrID) {
1378 default:
1379 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1380 // Fake memory access type for no return sampler intrinsics
1381 Info.memVT = MVT::i32;
1382 } else {
1383 // XXX - Should this be volatile without known ordering?
1385 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1386 }
1387 break;
1388 case Intrinsic::amdgcn_raw_buffer_load_lds:
1389 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1390 case Intrinsic::amdgcn_struct_buffer_load_lds:
1391 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1392 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1393 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1394 Info.ptrVal = CI.getArgOperand(1);
1395 return true;
1396 }
1397 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1398 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1399 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1400 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1401 Info.memVT =
1403 std::numeric_limits<unsigned>::max());
1404 Info.flags &= ~MachineMemOperand::MOStore;
1405 return true;
1406 }
1407 }
1408 }
1409 return true;
1410 }
1411
1412 switch (IntrID) {
1413 case Intrinsic::amdgcn_ds_ordered_add:
1414 case Intrinsic::amdgcn_ds_ordered_swap: {
1416 Info.memVT = MVT::getVT(CI.getType());
1417 Info.ptrVal = CI.getOperand(0);
1418 Info.align.reset();
1420
1421 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1422 if (!Vol->isZero())
1424
1425 return true;
1426 }
1427 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1428 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1430 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1431 Info.ptrVal = nullptr;
1432 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1434 return true;
1435 }
1436 case Intrinsic::amdgcn_ds_append:
1437 case Intrinsic::amdgcn_ds_consume: {
1439 Info.memVT = MVT::getVT(CI.getType());
1440 Info.ptrVal = CI.getOperand(0);
1441 Info.align.reset();
1443
1444 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1445 if (!Vol->isZero())
1447
1448 return true;
1449 }
1450 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1451 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1452 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1455 Info.memVT = MVT::getVT(CI.getType());
1456 Info.ptrVal = CI.getOperand(0);
1457 Info.memVT = MVT::i64;
1458 Info.size = 8;
1459 Info.align.reset();
1461 return true;
1462 }
1463 case Intrinsic::amdgcn_global_atomic_csub: {
1465 Info.memVT = MVT::getVT(CI.getType());
1466 Info.ptrVal = CI.getOperand(0);
1467 Info.align.reset();
1470 return true;
1471 }
1472 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1473 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1474 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1476 Info.memVT =
1477 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1478 ? CI.getType()
1479 : cast<StructType>(CI.getType())
1480 ->getElementType(0)); // XXX: what is correct VT?
1481
1482 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1483 Info.align.reset();
1484 Info.flags |=
1486 return true;
1487 }
1488 case Intrinsic::amdgcn_global_atomic_fmin_num:
1489 case Intrinsic::amdgcn_global_atomic_fmax_num:
1490 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1491 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1492 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1493 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1495 Info.memVT = MVT::getVT(CI.getType());
1496 Info.ptrVal = CI.getOperand(0);
1497 Info.align.reset();
1501 return true;
1502 }
1503 case Intrinsic::amdgcn_flat_load_monitor_b32:
1504 case Intrinsic::amdgcn_flat_load_monitor_b64:
1505 case Intrinsic::amdgcn_flat_load_monitor_b128:
1506 case Intrinsic::amdgcn_global_load_monitor_b32:
1507 case Intrinsic::amdgcn_global_load_monitor_b64:
1508 case Intrinsic::amdgcn_global_load_monitor_b128:
1509 case Intrinsic::amdgcn_cluster_load_b32:
1510 case Intrinsic::amdgcn_cluster_load_b64:
1511 case Intrinsic::amdgcn_cluster_load_b128:
1512 case Intrinsic::amdgcn_ds_load_tr6_b96:
1513 case Intrinsic::amdgcn_ds_load_tr4_b64:
1514 case Intrinsic::amdgcn_ds_load_tr8_b64:
1515 case Intrinsic::amdgcn_ds_load_tr16_b128:
1516 case Intrinsic::amdgcn_global_load_tr6_b96:
1517 case Intrinsic::amdgcn_global_load_tr4_b64:
1518 case Intrinsic::amdgcn_global_load_tr_b64:
1519 case Intrinsic::amdgcn_global_load_tr_b128:
1520 case Intrinsic::amdgcn_ds_read_tr4_b64:
1521 case Intrinsic::amdgcn_ds_read_tr6_b96:
1522 case Intrinsic::amdgcn_ds_read_tr8_b64:
1523 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1525 Info.memVT = MVT::getVT(CI.getType());
1526 Info.ptrVal = CI.getOperand(0);
1527 Info.align.reset();
1529 return true;
1530 }
1531 case Intrinsic::amdgcn_ds_gws_init:
1532 case Intrinsic::amdgcn_ds_gws_barrier:
1533 case Intrinsic::amdgcn_ds_gws_sema_v:
1534 case Intrinsic::amdgcn_ds_gws_sema_br:
1535 case Intrinsic::amdgcn_ds_gws_sema_p:
1536 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1538
1539 const GCNTargetMachine &TM =
1540 static_cast<const GCNTargetMachine &>(getTargetMachine());
1541
1543 Info.ptrVal = MFI->getGWSPSV(TM);
1544
1545 // This is an abstract access, but we need to specify a type and size.
1546 Info.memVT = MVT::i32;
1547 Info.size = 4;
1548 Info.align = Align(4);
1549
1550 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1552 else
1554 return true;
1555 }
1556 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1557 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1558 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1559 case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
1561 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1562 Info.ptrVal = CI.getArgOperand(1);
1564 return true;
1565 }
1566 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1567 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1568 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1569 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1571 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1572 Info.ptrVal = CI.getArgOperand(0);
1574 return true;
1575 }
1576 case Intrinsic::amdgcn_load_to_lds:
1577 case Intrinsic::amdgcn_global_load_lds: {
1579 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1580 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1581 Info.ptrVal = CI.getArgOperand(1);
1583 return true;
1584 }
1585 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1586 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1587 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1588 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1590
1591 const GCNTargetMachine &TM =
1592 static_cast<const GCNTargetMachine &>(getTargetMachine());
1593
1595 Info.ptrVal = MFI->getGWSPSV(TM);
1596
1597 // This is an abstract access, but we need to specify a type and size.
1598 Info.memVT = MVT::i32;
1599 Info.size = 4;
1600 Info.align = Align(4);
1601
1603 return true;
1604 }
1605 case Intrinsic::amdgcn_s_prefetch_data:
1606 case Intrinsic::amdgcn_flat_prefetch:
1607 case Intrinsic::amdgcn_global_prefetch: {
1609 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1610 Info.ptrVal = CI.getArgOperand(0);
1612 return true;
1613 }
1614 default:
1615 return false;
1616 }
1617}
1618
1620 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1621 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1622 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1623 // The DAG's ValueType loses the addrspaces.
1624 // Add them as 2 extra Constant operands "from" and "to".
1625 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1626 unsigned DstAS = I.getType()->getPointerAddressSpace();
1627 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1628 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1629 break;
1630 }
1631 default:
1632 break;
1633 }
1634}
1635
1638 Type *&AccessTy) const {
1639 Value *Ptr = nullptr;
1640 switch (II->getIntrinsicID()) {
1641 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1642 case Intrinsic::amdgcn_cluster_load_b128:
1643 case Intrinsic::amdgcn_cluster_load_b64:
1644 case Intrinsic::amdgcn_cluster_load_b32:
1645 case Intrinsic::amdgcn_ds_append:
1646 case Intrinsic::amdgcn_ds_consume:
1647 case Intrinsic::amdgcn_ds_load_tr8_b64:
1648 case Intrinsic::amdgcn_ds_load_tr16_b128:
1649 case Intrinsic::amdgcn_ds_load_tr4_b64:
1650 case Intrinsic::amdgcn_ds_load_tr6_b96:
1651 case Intrinsic::amdgcn_ds_read_tr4_b64:
1652 case Intrinsic::amdgcn_ds_read_tr6_b96:
1653 case Intrinsic::amdgcn_ds_read_tr8_b64:
1654 case Intrinsic::amdgcn_ds_read_tr16_b64:
1655 case Intrinsic::amdgcn_ds_ordered_add:
1656 case Intrinsic::amdgcn_ds_ordered_swap:
1657 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1658 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1659 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1660 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1661 case Intrinsic::amdgcn_flat_load_monitor_b128:
1662 case Intrinsic::amdgcn_flat_load_monitor_b32:
1663 case Intrinsic::amdgcn_flat_load_monitor_b64:
1664 case Intrinsic::amdgcn_global_atomic_csub:
1665 case Intrinsic::amdgcn_global_atomic_fmax_num:
1666 case Intrinsic::amdgcn_global_atomic_fmin_num:
1667 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1668 case Intrinsic::amdgcn_global_load_monitor_b128:
1669 case Intrinsic::amdgcn_global_load_monitor_b32:
1670 case Intrinsic::amdgcn_global_load_monitor_b64:
1671 case Intrinsic::amdgcn_global_load_tr_b64:
1672 case Intrinsic::amdgcn_global_load_tr_b128:
1673 case Intrinsic::amdgcn_global_load_tr4_b64:
1674 case Intrinsic::amdgcn_global_load_tr6_b96:
1675 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1676 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1677 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1678 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1679 Ptr = II->getArgOperand(0);
1680 break;
1681 case Intrinsic::amdgcn_load_to_lds:
1682 case Intrinsic::amdgcn_global_load_lds:
1683 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1684 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1685 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1686 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1687 Ptr = II->getArgOperand(1);
1688 break;
1689 default:
1690 return false;
1691 }
1692 AccessTy = II->getType();
1693 Ops.push_back(Ptr);
1694 return true;
1695}
1696
1698 unsigned AddrSpace) const {
1699 if (!Subtarget->hasFlatInstOffsets()) {
1700 // Flat instructions do not have offsets, and only have the register
1701 // address.
1702 return AM.BaseOffs == 0 && AM.Scale == 0;
1703 }
1704
1705 decltype(SIInstrFlags::FLAT) FlatVariant =
1709
1710 return AM.Scale == 0 &&
1711 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1712 AM.BaseOffs, AddrSpace, FlatVariant));
1713}
1714
1716 if (Subtarget->hasFlatGlobalInsts())
1718
1719 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1720 // Assume we will use FLAT for all global memory accesses
1721 // on VI.
1722 // FIXME: This assumption is currently wrong. On VI we still use
1723 // MUBUF instructions for the r + i addressing mode. As currently
1724 // implemented, the MUBUF instructions only work on buffers < 4GB.
1725 // It may be possible to support > 4GB buffers with MUBUF instructions,
1726 // by setting the stride value in the resource descriptor which would
1727 // increase the size limit to (stride * 4GB). However, this is risky,
1728 // because it has never been validated.
1730 }
1731
1732 return isLegalMUBUFAddressingMode(AM);
1733}
1734
1735bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1736 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1737 // additionally can do r + r + i with addr64. 32-bit has more addressing
1738 // mode options. Depending on the resource constant, it can also do
1739 // (i64 r0) + (i32 r1) * (i14 i).
1740 //
1741 // Private arrays end up using a scratch buffer most of the time, so also
1742 // assume those use MUBUF instructions. Scratch loads / stores are currently
1743 // implemented as mubuf instructions with the offen bit set, so they are
1744 // slightly different from the normal addr64 mode.
1745 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1746 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1747 return false;
1748
1749 // FIXME: Since we can split immediate into soffset and immediate offset,
1750 // would it make sense to allow any immediate?
1751
1752 switch (AM.Scale) {
1753 case 0: // r + i or just i, depending on HasBaseReg.
1754 return true;
1755 case 1:
1756 return true; // We have r + r or r + i.
1757 case 2:
1758 if (AM.HasBaseReg) {
1759 // Reject 2 * r + r.
1760 return false;
1761 }
1762
1763 // Allow 2 * r as r + r,
1764 // or 2 * r + i as r + r + i.
1765 return true;
1766 default: // Don't allow n * r
1767 return false;
1768 }
1769}
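// Worked examples for the Scale cases above (illustrative): base + index
// (Scale == 1) or a plain immediate offset (Scale == 0) is accepted,
// 2 * index without a base register is accepted as index + index, and
// anything like 3 * index or 2 * index + base is rejected.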
1770
1772 const AddrMode &AM, Type *Ty,
1773 unsigned AS,
1774 Instruction *I) const {
1775 // No global is ever allowed as a base.
1776 if (AM.BaseGV)
1777 return false;
1778
1779 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1780 return isLegalGlobalAddressingMode(AM);
1781
1782 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1786 // If the offset isn't a multiple of 4, it probably isn't going to be
1787 // correctly aligned.
1788 // FIXME: Can we get the real alignment here?
1789 if (AM.BaseOffs % 4 != 0)
1790 return isLegalMUBUFAddressingMode(AM);
1791
1792 if (!Subtarget->hasScalarSubwordLoads()) {
1793 // There are no SMRD extloads, so if we have to do a small type access we
1794 // will use a MUBUF load.
1795 // FIXME?: We also need to do this if unaligned, but we don't know the
1796 // alignment here.
1797 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1798 return isLegalGlobalAddressingMode(AM);
1799 }
1800
1802 // SMRD instructions have an 8-bit, dword offset on SI.
1803 if (!isUInt<8>(AM.BaseOffs / 4))
1804 return false;
1805 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1806 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1807 // in 8-bits, it can use a smaller encoding.
1808 if (!isUInt<32>(AM.BaseOffs / 4))
1809 return false;
1810 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1811 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1812 if (!isUInt<20>(AM.BaseOffs))
1813 return false;
1814 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1815 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1816 // for S_BUFFER_* instructions).
1817 if (!isInt<21>(AM.BaseOffs))
1818 return false;
1819 } else {
1820 // On GFX12, all offsets are signed 24-bit in bytes.
1821 if (!isInt<24>(AM.BaseOffs))
1822 return false;
1823 }
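    // Worked example of the generation-dependent limits above (illustrative):
    // a byte offset of 1020 is a dword offset of 255 and fits the 8-bit SI
    // encoding, while 1024 (dword offset 256) does not; VI still accepts 1024
    // as a 20-bit byte offset, and GFX9 and later accept it as a signed 21-bit
    // (24-bit on GFX12) byte offset.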
1824
1825 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1827 AM.BaseOffs < 0) {
1828 // Scalar (non-buffer) loads can only use a negative offset if
1829 // soffset+offset is non-negative. Since the compiler can only prove that
1830 // in a few special cases, it is safer to claim that negative offsets are
1831 // not supported.
1832 return false;
1833 }
1834
1835 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1836 return true;
1837
1838 if (AM.Scale == 1 && AM.HasBaseReg)
1839 return true;
1840
1841 return false;
1842 }
1843
1844 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1845 return Subtarget->enableFlatScratch()
1847 : isLegalMUBUFAddressingMode(AM);
1848
1849 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1850 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1851 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1852 // field.
1853 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1854 // an 8-bit dword offset but we don't know the alignment here.
1855 if (!isUInt<16>(AM.BaseOffs))
1856 return false;
1857
1858 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1859 return true;
1860
1861 if (AM.Scale == 1 && AM.HasBaseReg)
1862 return true;
1863
1864 return false;
1865 }
1866
1868 // For an unknown address space, this usually means that this is for some
1869 // reason being used for pure arithmetic, and not based on some addressing
1870 // computation. We don't have instructions that compute pointers with any
1871 // addressing modes, so treat them as having no offset like flat
1872 // instructions.
1874 }
1875
1876 // Assume a user alias of global for unknown address spaces.
1877 return isLegalGlobalAddressingMode(AM);
1878}
1879
1881 const MachineFunction &MF) const {
1883 return (MemVT.getSizeInBits() <= 4 * 32);
1884 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1885 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1886 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1887 }
1889 return (MemVT.getSizeInBits() <= 2 * 32);
1890 return true;
1891}
1892
1894 unsigned Size, unsigned AddrSpace, Align Alignment,
1895 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1896 if (IsFast)
1897 *IsFast = 0;
1898
1899 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1900 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1901 // Check if alignment requirements for ds_read/write instructions are
1902 // disabled.
1903 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1904 return false;
1905
1906 Align RequiredAlignment(
1907 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1908 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1909 Alignment < RequiredAlignment)
1910 return false;
1911
1912 // Either the alignment requirements are "enabled", or there is an
1913 // unaligned-LDS-access-related hardware bug even though the alignment
1914 // requirements are "disabled". In either case, we need to check for proper
1915 // alignment requirements.
1916 //
1917 switch (Size) {
1918 case 64:
1919 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1920 // address is negative, then the instruction is incorrectly treated as
1921 // out-of-bounds even if base + offset is in bounds. Split vectorized
1922 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1923 // load later in the SILoadStoreOptimizer.
1924 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1925 return false;
1926
1927 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1928 // can do a 4-byte aligned, 8-byte access in a single operation using
1929 // ds_read2/write2_b32 with adjacent offsets.
1930 RequiredAlignment = Align(4);
1931
1932 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1933 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1934 // ds_write2_b32 depending on the alignment. In either case with either
1935 // alignment there is no faster way of doing this.
1936
1937 // The numbers returned here and below are not additive, it is a 'speed
1938 // rank'. They are just meant to be compared to decide if a certain way
1939 // of lowering an operation is faster than another. For that purpose
1940 // a naturally aligned operation gets its bitsize to indicate that "it
1941 // operates with a speed comparable to an N-bit wide load". With the full
1942 // alignment ds128 is slower than ds96 for example. If underaligned it
1943 // is comparable to a speed of a single dword access, which would then
1944 // mean 32 < 128 and it is faster to issue a wide load regardless.
1945 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1946 // wider load that will no longer be aligned, the latter is slower.
1947 if (IsFast)
1948 *IsFast = (Alignment >= RequiredAlignment) ? 64
1949 : (Alignment < Align(4)) ? 32
1950 : 1;
1951 return true;
1952 }
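      // Worked example for the 64-bit case (illustrative): with unaligned DS
      // access enabled, an alignment of 4 or more reports a rank of 64 and
      // anything below 4 reports 32; the rank 1 only comes into play for the
      // wider cases below.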
1953
1954 break;
1955 case 96:
1956 if (!Subtarget->hasDS96AndDS128())
1957 return false;
1958
1959 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1960 // gfx8 and older.
1961
1962 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1963 // Naturally aligned access is fastest. However, also report it as Fast
1964 // if memory is aligned to less than a DWORD. A narrow load or store will
1965 // be just as slow as a single ds_read_b96/ds_write_b96, but there will
1966 // be more of them, so overall we pay less of a penalty issuing a single
1967 // instruction.
1968
1969 // See comment on the values above.
1970 if (IsFast)
1971 *IsFast = (Alignment >= RequiredAlignment) ? 96
1972 : (Alignment < Align(4)) ? 32
1973 : 1;
1974 return true;
1975 }
1976
1977 break;
1978 case 128:
1979 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1980 return false;
1981
1982 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1983 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
1984 // single operation using ds_read2/write2_b64.
1985 RequiredAlignment = Align(8);
1986
1987 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1988 // Naturally aligned access is fastest. However, also report it as Fast
1989 // if memory is aligned to less than a DWORD. A narrow load or store will
1990 // be just as slow as a single ds_read_b128/ds_write_b128, but there
1991 // will be more of them, so overall we pay less of a penalty issuing a
1992 // single instruction.
1993
1994 // See comment on the values above.
1995 if (IsFast)
1996 *IsFast = (Alignment >= RequiredAlignment) ? 128
1997 : (Alignment < Align(4)) ? 32
1998 : 1;
1999 return true;
2000 }
2001
2002 break;
2003 default:
2004 if (Size > 32)
2005 return false;
2006
2007 break;
2008 }
2009
2010 // See comment on the values above.
2011 // Note that we have a single-dword or sub-dword access here, so if it is
2012 // underaligned it is the slowest possible access, hence the returned value is 0.
2013 if (IsFast)
2014 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2015
2016 return Alignment >= RequiredAlignment ||
2017 Subtarget->hasUnalignedDSAccessEnabled();
2018 }
2019
2020 // FIXME: We have to be conservative here and assume that flat operations
2021 // will access scratch. If we had access to the IR function, then we
2022 // could determine if any private memory was used in the function.
2023 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2024 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2025 bool AlignedBy4 = Alignment >= Align(4);
2026 if (IsFast)
2027 *IsFast = AlignedBy4;
2028
2029 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2030 }
2031
2032 // So long as they are correct, wide global memory operations perform better
2033 // than multiple smaller memory ops -- even when misaligned
2034 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2035 if (IsFast)
2036 *IsFast = Size;
2037
2038 return Alignment >= Align(4) ||
2040 }
2041
2042 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2043 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2044 // out-of-bounds behavior, but in the edge case where an access starts
2045 // out-of-bounds and then enters in-bounds, the entire access would be treated
2046 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2047 // natural alignment of buffer accesses.
2048 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2049 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2050 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2051 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2052 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2053 return false;
2054 }
2055
2056 // Values smaller than a dword must be aligned.
2057 if (Size < 32)
2058 return false;
2059
2060 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2061 // byte-address are ignored, thus forcing Dword alignment.
2062 // This applies to private, global, and constant memory.
2063 if (IsFast)
2064 *IsFast = 1;
2065
2066 return Size >= 32 && Alignment >= Align(4);
2067}
2068
2070 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2071 unsigned *IsFast) const {
2073 Alignment, Flags, IsFast);
2074}
2075
2077 LLVMContext &Context, const MemOp &Op,
2078 const AttributeList &FuncAttributes) const {
2079 // FIXME: Should account for address space here.
2080
2081 // The default fallback uses the private pointer size as a guess for a type to
2082 // use. Make sure we switch these to 64-bit accesses.
2083
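// For illustration, per the checks below: a memcpy (or similar memory op) of
// 16 or more bytes whose destination is at least 4-byte aligned is expanded
// with v4i32 (dwordx4) accesses, an 8..15-byte copy with the same alignment
// uses v2i32, and anything else falls back to the target-independent default.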
2084 if (Op.size() >= 16 &&
2085 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2086 return MVT::v4i32;
2087
2088 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2089 return MVT::v2i32;
2090
2091 // Use the default.
2092 return MVT::Other;
2093}
2094
2096 const MemSDNode *MemNode = cast<MemSDNode>(N);
2097 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2098}
2099
2101 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2103}
2104
2106 unsigned DestAS) const {
2107 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2108 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2109 Subtarget->hasGloballyAddressableScratch()) {
2110 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2111 return false;
2112 }
2113
2114 // Flat -> private/local is a simple truncate.
2115 // Flat -> global is no-op
2116 return true;
2117 }
2118
2119 const GCNTargetMachine &TM =
2120 static_cast<const GCNTargetMachine &>(getTargetMachine());
2121 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2122}
2123
2126 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2127 VT.getScalarType().bitsLE(MVT::i16))
2130}
2131
2133 Type *Ty) const {
2134 // FIXME: Could be smarter if called for vector constants.
2135 return true;
2136}
2137
2139 unsigned Index) const {
2141 return false;
2142
2143 // TODO: Add more cases that are cheap.
2144 return Index == 0;
2145}
2146
2147bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2148 // TODO: This should be more aggressive, particular for 16-bit element
2149 // vectors. However there are some mixed improvements and regressions.
2150 EVT EltTy = VT.getVectorElementType();
2151 return EltTy.getSizeInBits() % 32 == 0;
2152}
2153
2155 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2156 switch (Op) {
2157 case ISD::LOAD:
2158 case ISD::STORE:
2159 return true;
2160 default:
2161 return false;
2162 }
2163 }
2164
2165 // SimplifySetCC uses this function to determine whether or not it should
2166 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2167 if (VT == MVT::i1 && Op == ISD::SETCC)
2168 return false;
2169
2171}
2172
2173SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2174 const SDLoc &SL,
2175 SDValue Chain,
2176 uint64_t Offset) const {
2177 const DataLayout &DL = DAG.getDataLayout();
2181
2182 auto [InputPtrReg, RC, ArgTy] =
2184
2185 // We may not have the kernarg segment argument if we have no kernel
2186 // arguments.
2187 if (!InputPtrReg)
2188 return DAG.getConstant(Offset, SL, PtrVT);
2189
2191 SDValue BasePtr = DAG.getCopyFromReg(
2192 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2193
2194 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2195}
2196
2197SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2198 const SDLoc &SL) const {
2201 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2202}
2203
2204SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2205 const SDLoc &SL) const {
2206
2208 std::optional<uint32_t> KnownSize =
2210 if (KnownSize.has_value())
2211 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2212 return SDValue();
2213}
2214
2215SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2216 const SDLoc &SL, SDValue Val,
2217 bool Signed,
2218 const ISD::InputArg *Arg) const {
2219 // First, if it is a widened vector, narrow it.
2220 if (VT.isVector() &&
2222 EVT NarrowedVT =
2225 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2226 DAG.getConstant(0, SL, MVT::i32));
2227 }
2228
2229 // Then convert the vector elements or scalar value.
2230 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2231 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2232 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2233 }
2234
2235 if (MemVT.isFloatingPoint())
2236 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2237 else if (Signed)
2238 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2239 else
2240 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2241
2242 return Val;
2243}
2244
2245SDValue SITargetLowering::lowerKernargMemParameter(
2246 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2247 uint64_t Offset, Align Alignment, bool Signed,
2248 const ISD::InputArg *Arg) const {
2250
2251 // Try to avoid using an extload by loading earlier than the argument address,
2252 // and extracting the relevant bits. The load should hopefully be merged with
2253 // the previous argument.
2254 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2255 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2256 int64_t AlignDownOffset = alignDown(Offset, 4);
2257 int64_t OffsetDiff = Offset - AlignDownOffset;
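// For illustration: an i16 argument at Offset = 2 with 2-byte alignment has
// AlignDownOffset = 0 and OffsetDiff = 2, so the code below loads the
// 4-byte-aligned i32 at offset 0, shifts it right by 16, and truncates the
// result to i16.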
2258
2259 EVT IntVT = MemVT.changeTypeToInteger();
2260
2261 // TODO: If we passed in the base kernel offset we could have a better
2262 // alignment than 4, but we don't really need it.
2263 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2264 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2267
2268 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2269 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2270
2271 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2272 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2273 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2274
2275 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2276 }
2277
2278 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2279 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2282
2283 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2284 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2285}
2286
2287SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2288 CCValAssign &VA, const SDLoc &SL,
2289 SDValue Chain,
2290 const ISD::InputArg &Arg) const {
2292 MachineFrameInfo &MFI = MF.getFrameInfo();
2293
2294 if (Arg.Flags.isByVal()) {
2295 unsigned Size = Arg.Flags.getByValSize();
2296 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2297 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2298 }
2299
2300 unsigned ArgOffset = VA.getLocMemOffset();
2301 unsigned ArgSize = VA.getValVT().getStoreSize();
2302
2303 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2304
2305 // Create load nodes to retrieve arguments from the stack.
2306 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2307 SDValue ArgValue;
2308
2309 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2311 MVT MemVT = VA.getValVT();
2312
2313 switch (VA.getLocInfo()) {
2314 default:
2315 break;
2316 case CCValAssign::BCvt:
2317 MemVT = VA.getLocVT();
2318 break;
2319 case CCValAssign::SExt:
2320 ExtType = ISD::SEXTLOAD;
2321 break;
2322 case CCValAssign::ZExt:
2323 ExtType = ISD::ZEXTLOAD;
2324 break;
2325 case CCValAssign::AExt:
2326 ExtType = ISD::EXTLOAD;
2327 break;
2328 }
2329
2330 ArgValue = DAG.getExtLoad(
2331 ExtType, SL, VA.getLocVT(), Chain, FIN,
2333 return ArgValue;
2334}
2335
2336SDValue SITargetLowering::getPreloadedValue(
2337 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2339 const ArgDescriptor *Reg = nullptr;
2340 const TargetRegisterClass *RC;
2341 LLT Ty;
2342
2344 const ArgDescriptor WorkGroupIDX =
2345 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2346 // If GridZ is not programmed in an entry function then the hardware will set
2347 // it to all zeros, so there is no need to mask the GridY value in the low
2348 // order bits.
2349 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2350 AMDGPU::TTMP7,
2351 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2352 const ArgDescriptor WorkGroupIDZ =
2353 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
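// In other words, for the architected-SGPR case handled below: the hardware
// supplies the workgroup IDs in trap temporaries, X in TTMP9, Y in
// TTMP7[15:0], and Z in TTMP7[31:16]; the masks above select the matching
// halves, and Y may use the whole register when Z is known to be zero.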
2354 if (Subtarget->hasArchitectedSGPRs() &&
2357 switch (PVID) {
2359 Reg = &WorkGroupIDX;
2360 RC = &AMDGPU::SReg_32RegClass;
2361 Ty = LLT::scalar(32);
2362 break;
2364 Reg = &WorkGroupIDY;
2365 RC = &AMDGPU::SReg_32RegClass;
2366 Ty = LLT::scalar(32);
2367 break;
2369 Reg = &WorkGroupIDZ;
2370 RC = &AMDGPU::SReg_32RegClass;
2371 Ty = LLT::scalar(32);
2372 break;
2373 default:
2374 break;
2375 }
2376 }
2377
2378 if (!Reg)
2379 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2380 if (!Reg) {
2382 // It's possible for a kernarg intrinsic call to appear in a kernel with
2383 // no allocated segment, in which case we do not add the user sgpr
2384 // argument, so just return null.
2385 return DAG.getConstant(0, SDLoc(), VT);
2386 }
2387
2388 // It's undefined behavior if a function marked with the amdgpu-no-*
2389 // attributes uses the corresponding intrinsic.
2390 return DAG.getPOISON(VT);
2391 }
2392
2393 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2394}
2395
2397 CallingConv::ID CallConv,
2398 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2399 FunctionType *FType,
2400 SIMachineFunctionInfo *Info) {
2401 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2402 const ISD::InputArg *Arg = &Ins[I];
2403
2404 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2405 "vector type argument should have been split");
2406
2407 // First check if it's a PS input addr.
2408 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2409 PSInputNum <= 15) {
2410 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2411
2412 // Inconveniently only the first part of the split is marked as isSplit,
2413 // so skip to the end. We only want to increment PSInputNum once for the
2414 // entire split argument.
2415 if (Arg->Flags.isSplit()) {
2416 while (!Arg->Flags.isSplitEnd()) {
2417 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2418 "unexpected vector split in ps argument type");
2419 if (!SkipArg)
2420 Splits.push_back(*Arg);
2421 Arg = &Ins[++I];
2422 }
2423 }
2424
2425 if (SkipArg) {
2426 // We can safely skip PS inputs.
2427 Skipped.set(Arg->getOrigArgIndex());
2428 ++PSInputNum;
2429 continue;
2430 }
2431
2432 Info->markPSInputAllocated(PSInputNum);
2433 if (Arg->Used)
2434 Info->markPSInputEnabled(PSInputNum);
2435
2436 ++PSInputNum;
2437 }
2438
2439 Splits.push_back(*Arg);
2440 }
2441}
2442
2443// Allocate special inputs passed in VGPRs.
2445 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2446 SIMachineFunctionInfo &Info) const {
2447 const LLT S32 = LLT::scalar(32);
2449
2450 if (Info.hasWorkItemIDX()) {
2451 Register Reg = AMDGPU::VGPR0;
2452 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2453
2454 CCInfo.AllocateReg(Reg);
2455 unsigned Mask =
2456 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2457 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2458 }
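// When the subtarget packs the thread IDs (hasPackedTID), the workitem IDs
// arrive together in VGPR0: X in bits [9:0], Y in bits [19:10], and Z in
// bits [29:20]; the shifted 0x3ff masks used below select those 10-bit
// fields.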
2459
2460 if (Info.hasWorkItemIDY()) {
2461 assert(Info.hasWorkItemIDX());
2462 if (Subtarget->hasPackedTID()) {
2463 Info.setWorkItemIDY(
2464 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2465 } else {
2466 unsigned Reg = AMDGPU::VGPR1;
2467 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2468
2469 CCInfo.AllocateReg(Reg);
2470 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2471 }
2472 }
2473
2474 if (Info.hasWorkItemIDZ()) {
2475 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2476 if (Subtarget->hasPackedTID()) {
2477 Info.setWorkItemIDZ(
2478 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2479 } else {
2480 unsigned Reg = AMDGPU::VGPR2;
2481 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2482
2483 CCInfo.AllocateReg(Reg);
2484 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2485 }
2486 }
2487}
2488
2489 // Try to allocate a VGPR at the end of the argument list, or, if no
2490 // argument VGPRs are left, allocate a stack slot.
2491 // If \p Mask is given, it indicates the bitfield position in the register.
2492 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2493static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2494 ArgDescriptor Arg = ArgDescriptor()) {
2495 if (Arg.isSet())
2496 return ArgDescriptor::createArg(Arg, Mask);
2497
2498 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2499 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2500 if (RegIdx == ArgVGPRs.size()) {
2501 // Spill to stack required.
2502 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2503
2504 return ArgDescriptor::createStack(Offset, Mask);
2505 }
2506
2507 unsigned Reg = ArgVGPRs[RegIdx];
2508 Reg = CCInfo.AllocateReg(Reg);
2509 assert(Reg != AMDGPU::NoRegister);
2510
2511 MachineFunction &MF = CCInfo.getMachineFunction();
2512 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2513 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2514 return ArgDescriptor::createRegister(Reg, Mask);
2515}
2516
2518 const TargetRegisterClass *RC,
2519 unsigned NumArgRegs) {
2520 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2521 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2522 if (RegIdx == ArgSGPRs.size())
2523 report_fatal_error("ran out of SGPRs for arguments");
2524
2525 unsigned Reg = ArgSGPRs[RegIdx];
2526 Reg = CCInfo.AllocateReg(Reg);
2527 assert(Reg != AMDGPU::NoRegister);
2528
2529 MachineFunction &MF = CCInfo.getMachineFunction();
2530 MF.addLiveIn(Reg, RC);
2532}
2533
2534 // If this has a fixed position, we should still allocate the register in the
2535// CCInfo state. Technically we could get away with this for values passed
2536// outside of the normal argument range.
2538 const TargetRegisterClass *RC,
2539 MCRegister Reg) {
2540 Reg = CCInfo.AllocateReg(Reg);
2541 assert(Reg != AMDGPU::NoRegister);
2542 MachineFunction &MF = CCInfo.getMachineFunction();
2543 MF.addLiveIn(Reg, RC);
2544}
2545
2546static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2547 if (Arg) {
2548 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2549 Arg.getRegister());
2550 } else
2551 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2552}
2553
2554static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2555 if (Arg) {
2556 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2557 Arg.getRegister());
2558 } else
2559 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2560}
2561
2562/// Allocate implicit function VGPR arguments at the end of allocated user
2563/// arguments.
2565 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2566 SIMachineFunctionInfo &Info) const {
2567 const unsigned Mask = 0x3ff;
2568 ArgDescriptor Arg;
2569
2570 if (Info.hasWorkItemIDX()) {
2571 Arg = allocateVGPR32Input(CCInfo, Mask);
2572 Info.setWorkItemIDX(Arg);
2573 }
2574
2575 if (Info.hasWorkItemIDY()) {
2576 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2577 Info.setWorkItemIDY(Arg);
2578 }
2579
2580 if (Info.hasWorkItemIDZ())
2581 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2582}
2583
2584/// Allocate implicit function VGPR arguments in fixed registers.
2586 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2587 SIMachineFunctionInfo &Info) const {
2588 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2589 if (!Reg)
2590 report_fatal_error("failed to allocate VGPR for implicit arguments");
2591
2592 const unsigned Mask = 0x3ff;
2593 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2594 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2595 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2596}
2597
2599 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2600 SIMachineFunctionInfo &Info) const {
2601 auto &ArgInfo = Info.getArgInfo();
2602 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2603
2604 // TODO: Unify handling with private memory pointers.
2605 if (UserSGPRInfo.hasDispatchPtr())
2606 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2607
2608 if (UserSGPRInfo.hasQueuePtr())
2609 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2610
2611 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2612 // constant offset from the kernarg segment.
2613 if (Info.hasImplicitArgPtr())
2614 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2615
2616 if (UserSGPRInfo.hasDispatchID())
2617 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2618
2619 // flat_scratch_init is not applicable for non-kernel functions.
2620
2621 if (Info.hasWorkGroupIDX())
2622 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2623
2624 if (Info.hasWorkGroupIDY())
2625 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2626
2627 if (Info.hasWorkGroupIDZ())
2628 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2629
2630 if (Info.hasLDSKernelId())
2631 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2632}
2633
2634// Allocate special inputs passed in user SGPRs.
2636 MachineFunction &MF,
2637 const SIRegisterInfo &TRI,
2638 SIMachineFunctionInfo &Info) const {
2639 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2640 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2641 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2642 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2643 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2644 }
2645
2646 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2647 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2648 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2649 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2650 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2651 }
2652
2653 if (UserSGPRInfo.hasDispatchPtr()) {
2654 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2655 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2656 CCInfo.AllocateReg(DispatchPtrReg);
2657 }
2658
2659 if (UserSGPRInfo.hasQueuePtr()) {
2660 Register QueuePtrReg = Info.addQueuePtr(TRI);
2661 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2662 CCInfo.AllocateReg(QueuePtrReg);
2663 }
2664
2665 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2667 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2668 CCInfo.AllocateReg(InputPtrReg);
2669
2670 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2671 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2672 }
2673
2674 if (UserSGPRInfo.hasDispatchID()) {
2675 Register DispatchIDReg = Info.addDispatchID(TRI);
2676 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2677 CCInfo.AllocateReg(DispatchIDReg);
2678 }
2679
2680 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2681 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2682 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2683 CCInfo.AllocateReg(FlatScratchInitReg);
2684 }
2685
2686 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2687 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2688 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2689 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2690 }
2691
2692 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2693 // these from the dispatch pointer.
2694}
2695
2696 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2697 // sequential, starting from the first argument.
2699 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2701 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2702 Function &F = MF.getFunction();
2703 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2704 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2705 bool InPreloadSequence = true;
2706 unsigned InIdx = 0;
2707 bool AlignedForImplictArgs = false;
2708 unsigned ImplicitArgOffset = 0;
2709 for (auto &Arg : F.args()) {
2710 if (!InPreloadSequence || !Arg.hasInRegAttr())
2711 break;
2712
2713 unsigned ArgIdx = Arg.getArgNo();
2714 // Don't preload non-original args or parts not in the current preload
2715 // sequence.
2716 if (InIdx < Ins.size() &&
2717 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2718 break;
2719
2720 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2721 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2722 InIdx++) {
2723 assert(ArgLocs[ArgIdx].isMemLoc());
2724 auto &ArgLoc = ArgLocs[InIdx];
2725 const Align KernelArgBaseAlign = Align(16);
2726 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2727 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2728 unsigned NumAllocSGPRs =
2729 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2730
2731 // Fix alignment for hidden arguments.
2732 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2733 if (!AlignedForImplictArgs) {
2734 ImplicitArgOffset =
2735 alignTo(LastExplicitArgOffset,
2736 Subtarget->getAlignmentForImplicitArgPtr()) -
2737 LastExplicitArgOffset;
2738 AlignedForImplictArgs = true;
2739 }
2740 ArgOffset += ImplicitArgOffset;
2741 }
2742
2743 // Arg is preloaded into the previous SGPR.
2744 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2745 assert(InIdx >= 1 && "No previous SGPR");
2746 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2747 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2748 continue;
2749 }
2750
2751 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2752 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
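// For illustration: a 64-bit argument occupies alignTo(64, 32) / 32 = 2
// SGPRs, and if its natural alignment left an 8-byte gap after the previous
// argument, Padding = 8 gives PaddingSGPRs = 2, so 2 + 2 free user SGPRs
// are needed for it to stay in the preload sequence.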
2753 // Check for free user SGPRs for preloading.
2754 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2755 InPreloadSequence = false;
2756 break;
2757 }
2758
2759 // Preload this argument.
2760 const TargetRegisterClass *RC =
2761 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2762 SmallVectorImpl<MCRegister> *PreloadRegs =
2763 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2764
2765 if (PreloadRegs->size() > 1)
2766 RC = &AMDGPU::SGPR_32RegClass;
2767 for (auto &Reg : *PreloadRegs) {
2768 assert(Reg);
2769 MF.addLiveIn(Reg, RC);
2770 CCInfo.AllocateReg(Reg);
2771 }
2772
2773 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2774 }
2775 }
2776}
2777
2779 const SIRegisterInfo &TRI,
2780 SIMachineFunctionInfo &Info) const {
2781 // Always allocate this last since it is a synthetic preload.
2782 if (Info.hasLDSKernelId()) {
2783 Register Reg = Info.addLDSKernelId();
2784 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2785 CCInfo.AllocateReg(Reg);
2786 }
2787}
2788
2789// Allocate special input registers that are initialized per-wave.
2792 CallingConv::ID CallConv,
2793 bool IsShader) const {
2794 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2795 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2796 // Note: user SGPRs are handled by the front-end for graphics shaders
2797 // Pad up the used user SGPRs with dead inputs.
2798
2799 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2800 // before enabling architected SGPRs for workgroup IDs.
2801 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2802
2803 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2804 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2805 // rely on it to reach 16 since if we end up having no stack usage, it will
2806 // not really be added.
2807 unsigned NumRequiredSystemSGPRs =
2808 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2809 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2810 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2811 Register Reg = Info.addReservedUserSGPR();
2812 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2813 CCInfo.AllocateReg(Reg);
2814 }
2815 }
2816
2817 if (!HasArchitectedSGPRs) {
2818 if (Info.hasWorkGroupIDX()) {
2819 Register Reg = Info.addWorkGroupIDX();
2820 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2821 CCInfo.AllocateReg(Reg);
2822 }
2823
2824 if (Info.hasWorkGroupIDY()) {
2825 Register Reg = Info.addWorkGroupIDY();
2826 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2827 CCInfo.AllocateReg(Reg);
2828 }
2829
2830 if (Info.hasWorkGroupIDZ()) {
2831 Register Reg = Info.addWorkGroupIDZ();
2832 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2833 CCInfo.AllocateReg(Reg);
2834 }
2835 }
2836
2837 if (Info.hasWorkGroupInfo()) {
2838 Register Reg = Info.addWorkGroupInfo();
2839 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2840 CCInfo.AllocateReg(Reg);
2841 }
2842
2843 if (Info.hasPrivateSegmentWaveByteOffset()) {
2844 // Scratch wave offset passed in system SGPR.
2845 unsigned PrivateSegmentWaveByteOffsetReg;
2846
2847 if (IsShader) {
2848 PrivateSegmentWaveByteOffsetReg =
2849 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2850
2851 // This is true if the scratch wave byte offset doesn't have a fixed
2852 // location.
2853 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2854 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2855 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2856 }
2857 } else
2858 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2859
2860 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2861 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2862 }
2863
2864 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2865 Info.getNumPreloadedSGPRs() >= 16);
2866}
2867
2869 MachineFunction &MF,
2870 const SIRegisterInfo &TRI,
2871 SIMachineFunctionInfo &Info) {
2872 // Now that we've figured out where the scratch register inputs are, see if
2873 // we should reserve the arguments and use them directly.
2874 MachineFrameInfo &MFI = MF.getFrameInfo();
2875 bool HasStackObjects = MFI.hasStackObjects();
2876 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2877
2878 // Record that we know we have non-spill stack objects so we don't need to
2879 // check all stack objects later.
2880 if (HasStackObjects)
2881 Info.setHasNonSpillStackObjects(true);
2882
2883 // Everything live out of a block is spilled with fast regalloc, so it's
2884 // almost certain that spilling will be required.
2885 if (TM.getOptLevel() == CodeGenOptLevel::None)
2886 HasStackObjects = true;
2887
2888 // For now assume stack access is needed in any callee functions, so we need
2889 // the scratch registers to pass in.
2890 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2891
2892 if (!ST.enableFlatScratch()) {
2893 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2894 // If we have stack objects, we unquestionably need the private buffer
2895 // resource. For the Code Object V2 ABI, this will be the first 4 user
2896 // SGPR inputs. We can reserve those and use them directly.
2897
2898 Register PrivateSegmentBufferReg =
2900 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2901 } else {
2902 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2903 // We tentatively reserve the last registers (skipping the last registers,
2904 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2905 // we'll replace these with the ones immediately after those that were
2906 // really allocated. In the prologue, copies will be inserted from the
2907 // argument to these reserved registers.
2908
2909 // Without HSA, relocations are used for the scratch pointer and the
2910 // buffer resource setup is always inserted in the prologue. Scratch wave
2911 // offset is still in an input SGPR.
2912 Info.setScratchRSrcReg(ReservedBufferReg);
2913 }
2914 }
2915
2917
2918 // For entry functions we have to set up the stack pointer if we use it,
2919 // whereas non-entry functions get this "for free". This means there is no
2920 // intrinsic advantage to using S32 over S34 in cases where we do not have
2921 // calls but do need a frame pointer (i.e. if we are requested to have one
2922 // because frame pointer elimination is disabled). To keep things simple we
2923 // only ever use S32 as the call ABI stack pointer, and so using it does not
2924 // imply we need a separate frame pointer.
2925 //
2926 // Try to use s32 as the SP, but move it if it would interfere with input
2927 // arguments. This won't work with calls though.
2928 //
2929 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2930 // registers.
2931 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2932 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2933 } else {
2935
2936 if (MFI.hasCalls())
2937 report_fatal_error("call in graphics shader with too many input SGPRs");
2938
2939 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2940 if (!MRI.isLiveIn(Reg)) {
2941 Info.setStackPtrOffsetReg(Reg);
2942 break;
2943 }
2944 }
2945
2946 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2947 report_fatal_error("failed to find register for SP");
2948 }
2949
2950 // hasFP should be accurate for entry functions even before the frame is
2951 // finalized, because it does not rely on the known stack size, only
2952 // properties like whether variable sized objects are present.
2953 if (ST.getFrameLowering()->hasFP(MF)) {
2954 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2955 }
2956}
2957
2960 return !Info->isEntryFunction();
2961}
2962
2964
2966 MachineBasicBlock *Entry,
2967 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2969
2970 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2971 if (!IStart)
2972 return;
2973
2974 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2975 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2976 MachineBasicBlock::iterator MBBI = Entry->begin();
2977 for (const MCPhysReg *I = IStart; *I; ++I) {
2978 const TargetRegisterClass *RC = nullptr;
2979 if (AMDGPU::SReg_64RegClass.contains(*I))
2980 RC = &AMDGPU::SGPR_64RegClass;
2981 else if (AMDGPU::SReg_32RegClass.contains(*I))
2982 RC = &AMDGPU::SGPR_32RegClass;
2983 else
2984 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2985
2986 Register NewVR = MRI->createVirtualRegister(RC);
2987 // Create copy from CSR to a virtual register.
2988 Entry->addLiveIn(*I);
2989 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2990 .addReg(*I);
2991
2992 // Insert the copy-back instructions right before the terminator.
2993 for (auto *Exit : Exits)
2994 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2995 TII->get(TargetOpcode::COPY), *I)
2996 .addReg(NewVR);
2997 }
2998}
2999
3001 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3002 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3003 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3005
3007 const Function &Fn = MF.getFunction();
3010 bool IsError = false;
3011
3012 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3014 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3015 IsError = true;
3016 }
3017
3020 BitVector Skipped(Ins.size());
3021 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3022 *DAG.getContext());
3023
3024 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3025 bool IsKernel = AMDGPU::isKernel(CallConv);
3026 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3027
3028 if (IsGraphics) {
3029 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3030 assert(!UserSGPRInfo.hasDispatchPtr() &&
3031 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3032 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3033 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3034 (void)UserSGPRInfo;
3035 if (!Subtarget->enableFlatScratch())
3036 assert(!UserSGPRInfo.hasFlatScratchInit());
3037 if ((CallConv != CallingConv::AMDGPU_CS &&
3038 CallConv != CallingConv::AMDGPU_Gfx &&
3039 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3040 !Subtarget->hasArchitectedSGPRs())
3041 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3042 !Info->hasWorkGroupIDZ());
3043 }
3044
3045 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3046
3047 if (CallConv == CallingConv::AMDGPU_PS) {
3048 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3049
3050 // At least one interpolation mode must be enabled or else the GPU will
3051 // hang.
3052 //
3053 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3054 // set PSInputAddr, the user wants to enable some bits after compilation
3055 // based on run-time states. Since we can't know what the final PSInputEna
3056 // will look like, we shouldn't do anything here; the user should take
3057 // responsibility for the correct programming.
3058 //
3059 // Otherwise, the following restrictions apply:
3060 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3061 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3062 // enabled too.
3063 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3064 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3065 CCInfo.AllocateReg(AMDGPU::VGPR0);
3066 CCInfo.AllocateReg(AMDGPU::VGPR1);
3067 Info->markPSInputAllocated(0);
3068 Info->markPSInputEnabled(0);
3069 }
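// Concrete example: a pixel shader that reads no interpolants at all has
// (PSInputAddr & 0x7F) == 0, so the block above force-enables input 0
// (PERSP_SAMPLE) and reserves VGPR0/VGPR1, which presumably carry that
// mode's I/J coordinates, so the hardware sees at least one enabled input.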
3070 if (Subtarget->isAmdPalOS()) {
3071 // For isAmdPalOS, the user does not enable some bits after compilation
3072 // based on run-time states; the register values being generated here are
3073 // the final ones set in hardware. Therefore we need to apply the
3074 // workaround to PSInputAddr and PSInputEnable together. (The case where
3075 // a bit is set in PSInputAddr but not PSInputEnable is where the
3076 // frontend set up an input arg for a particular interpolation mode, but
3077 // nothing uses that input arg. Really we should have an earlier pass
3078 // that removes such an arg.)
3079 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3080 if ((PsInputBits & 0x7F) == 0 ||
3081 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3082 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3083 }
3084 } else if (IsKernel) {
3085 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3086 } else {
3087 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3088 Ins.end());
3089 }
3090
3091 if (IsKernel)
3092 analyzeFormalArgumentsCompute(CCInfo, Ins);
3093
3094 if (IsEntryFunc) {
3095 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3096 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3097 if (IsKernel && Subtarget->hasKernargPreload())
3098 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3099
3100 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3101 } else if (!IsGraphics) {
3102 // For the fixed ABI, pass workitem IDs in the last argument register.
3103 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3104
3105 // FIXME: Sink this into allocateSpecialInputSGPRs
3106 if (!Subtarget->enableFlatScratch())
3107 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3108
3109 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3110 }
3111
3112 if (!IsKernel) {
3113 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3114 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3115
3116 // This assumes the registers are allocated by CCInfo in ascending order
3117 // with no gaps.
3118 Info->setNumWaveDispatchSGPRs(
3119 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3120 Info->setNumWaveDispatchVGPRs(
3121 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3122 } else if (Info->getNumKernargPreloadedSGPRs()) {
3123 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3124 }
3125
3127
3128 if (IsWholeWaveFunc) {
3130 {MVT::i1, MVT::Other}, Chain);
3131 InVals.push_back(Setup.getValue(0));
3132 Chains.push_back(Setup.getValue(1));
3133 }
3134
3135 // FIXME: This is the minimum kernel argument alignment. We should improve
3136 // this to the maximum alignment of the arguments.
3137 //
3138 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3139 // kern arg offset.
3140 const Align KernelArgBaseAlign = Align(16);
3141
3142 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3143 ++i) {
3144 const ISD::InputArg &Arg = Ins[i];
3145 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3146 InVals.push_back(DAG.getPOISON(Arg.VT));
3147 continue;
3148 }
3149
3150 CCValAssign &VA = ArgLocs[ArgIdx++];
3151 MVT VT = VA.getLocVT();
3152
3153 if (IsEntryFunc && VA.isMemLoc()) {
3154 VT = Ins[i].VT;
3155 EVT MemVT = VA.getLocVT();
3156
3157 const uint64_t Offset = VA.getLocMemOffset();
3158 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3159
3160 if (Arg.Flags.isByRef()) {
3161 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3162
3163 const GCNTargetMachine &TM =
3164 static_cast<const GCNTargetMachine &>(getTargetMachine());
3165 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3166 Arg.Flags.getPointerAddrSpace())) {
3169 }
3170
3171 InVals.push_back(Ptr);
3172 continue;
3173 }
3174
3175 SDValue NewArg;
3176 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3177 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3178 // In this case the argument is packed into the previous preload SGPR.
3179 int64_t AlignDownOffset = alignDown(Offset, 4);
3180 int64_t OffsetDiff = Offset - AlignDownOffset;
3181 EVT IntVT = MemVT.changeTypeToInteger();
3182
3186 Register Reg =
3187 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3188
3189 assert(Reg);
3190 Register VReg = MRI.getLiveInVirtReg(Reg);
3191 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3192
3193 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3194 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3195
3196 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3197 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3198 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3199 Ins[i].Flags.isSExt(), &Ins[i]);
3200
3201 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3202 } else {
3206 const SmallVectorImpl<MCRegister> &PreloadRegs =
3207 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3208
3209 SDValue Copy;
3210 if (PreloadRegs.size() == 1) {
3211 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3212 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3213 NewArg = DAG.getCopyFromReg(
3214 Chain, DL, VReg,
3216 TRI->getRegSizeInBits(*RC)));
3217
3218 } else {
3219 // If the kernarg alignment does not match the alignment of the SGPR
3220 // tuple RC that can accommodate this argument, it will be built up
3221 // via copies from the individual SGPRs that the argument was
3222 // preloaded to.
3224 for (auto Reg : PreloadRegs) {
3225 Register VReg = MRI.getLiveInVirtReg(Reg);
3226 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3227 Elts.push_back(Copy);
3228 }
3229 NewArg =
3230 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3231 PreloadRegs.size()),
3232 DL, Elts);
3233 }
3234
3235 // If the argument was preloaded to multiple consecutive 32-bit
3236 // registers because of misalignment between addressable SGPR tuples
3237 // and the argument size, we can still assume, because of kernarg
3238 // segment alignment restrictions, that NewArg's size is the same as
3239 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3240 // truncate since we cannot preload to less than a single SGPR and the
3241 // MemVT may be smaller.
3242 EVT MemVTInt =
3244 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3245 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3246
3247 NewArg = DAG.getBitcast(MemVT, NewArg);
3248 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3249 Ins[i].Flags.isSExt(), &Ins[i]);
3250 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3251 }
3252 } else {
3253 // Hidden arguments that are in the kernel signature must be preloaded
3254 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3255 // the argument list and is not preloaded.
3256 if (Arg.isOrigArg()) {
3257 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3258 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3260 *OrigArg->getParent(),
3261 "hidden argument in kernel signature was not preloaded",
3262 DL.getDebugLoc()));
3263 }
3264 }
3265
3266 NewArg =
3267 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3268 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3269 }
3270 Chains.push_back(NewArg.getValue(1));
3271
3272 auto *ParamTy =
3273 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3275 ParamTy &&
3276 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3277 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3278 // On SI local pointers are just offsets into LDS, so they are always
3279 // less than 16-bits. On CI and newer they could potentially be
3280 // real pointers, so we can't guarantee their size.
3281 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3282 DAG.getValueType(MVT::i16));
3283 }
3284
3285 InVals.push_back(NewArg);
3286 continue;
3287 }
3288 if (!IsEntryFunc && VA.isMemLoc()) {
3289 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3290 InVals.push_back(Val);
3291 if (!Arg.Flags.isByVal())
3292 Chains.push_back(Val.getValue(1));
3293 continue;
3294 }
3295
3296 assert(VA.isRegLoc() && "Parameter must be in a register!");
3297
3298 Register Reg = VA.getLocReg();
3299 const TargetRegisterClass *RC = nullptr;
3300 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3301 RC = &AMDGPU::VGPR_32RegClass;
3302 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3303 RC = &AMDGPU::SGPR_32RegClass;
3304 else
3305 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3306 EVT ValVT = VA.getValVT();
3307
3308 Reg = MF.addLiveIn(Reg, RC);
3309 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3310
3311 if (Arg.Flags.isSRet()) {
3312 // The return object should be reasonably addressable.
3313
3314 // FIXME: This helps when the return is a real sret. If it is an
3315 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3316 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3317 unsigned NumBits =
3319 Val = DAG.getNode(
3320 ISD::AssertZext, DL, VT, Val,
3321 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3322 }
3323
3324 // If this is an 8 or 16-bit value, it is really passed promoted
3325 // to 32 bits. Insert an assert[sz]ext to capture this, then
3326 // truncate to the right size.
3327 switch (VA.getLocInfo()) {
3328 case CCValAssign::Full:
3329 break;
3330 case CCValAssign::BCvt:
3331 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3332 break;
3333 case CCValAssign::SExt:
3334 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3335 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3336 break;
3337 case CCValAssign::ZExt:
3338 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3339 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3340 break;
3341 case CCValAssign::AExt:
3342 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3343 break;
3344 default:
3345 llvm_unreachable("Unknown loc info!");
3346 }
3347
3348 InVals.push_back(Val);
3349 }
3350
3351 // Start adding system SGPRs.
3352 if (IsEntryFunc)
3353 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3354
3355 // DAG.getPass() returns nullptr when using new pass manager.
3356 // TODO: Use DAG.getMFAM() to access analysis result.
3357 if (DAG.getPass()) {
3358 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3359 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3360 }
3361
3362 unsigned StackArgSize = CCInfo.getStackSize();
3363 Info->setBytesInStackArgArea(StackArgSize);
3364
3365 return Chains.empty() ? Chain
3366 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3367}
3368
3369// TODO: If return values can't fit in registers, we should return as many as
3370// possible in registers before passing on stack.
3372 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3373 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3374 const Type *RetTy) const {
3375 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3376 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3377 // for shaders. Vector types should be explicitly handled by CC.
3378 if (AMDGPU::isEntryFunctionCC(CallConv))
3379 return true;
3380
3382 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3383 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3384 return false;
3385
3386 // We must use the stack if return would require unavailable registers.
3387 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3388 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3389 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3390 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3391 return false;
3392
3393 return true;
3394}
3395
3396SDValue
3398 bool isVarArg,
3400 const SmallVectorImpl<SDValue> &OutVals,
3401 const SDLoc &DL, SelectionDAG &DAG) const {
3405
3406 if (AMDGPU::isKernel(CallConv)) {
3407 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3408 OutVals, DL, DAG);
3409 }
3410
3411 bool IsShader = AMDGPU::isShader(CallConv);
3412
3413 Info->setIfReturnsVoid(Outs.empty());
3414 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3415
3416 // CCValAssign - represent the assignment of the return value to a location.
3418
3419 // CCState - Info about the registers and stack slots.
3420 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3421 *DAG.getContext());
3422
3423 // Analyze outgoing return values.
3424 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3425
3426 SDValue Glue;
3428 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3429
3430 SDValue ReadFirstLane =
3431 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
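// Values returned in SGPRs must be wave-uniform, so the loop below routes
// them through llvm.amdgcn.readfirstlane (the target constant built above)
// before the CopyToReg.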
3432 // Copy the result values into the output registers.
3433 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3434 ++I, ++RealRVLocIdx) {
3435 CCValAssign &VA = RVLocs[I];
3436 assert(VA.isRegLoc() && "Can only return in registers!");
3437 // TODO: Partially return in registers if return values don't fit.
3438 SDValue Arg = OutVals[RealRVLocIdx];
3439
3440 // Copied from other backends.
3441 switch (VA.getLocInfo()) {
3442 case CCValAssign::Full:
3443 break;
3444 case CCValAssign::BCvt:
3445 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3446 break;
3447 case CCValAssign::SExt:
3448 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3449 break;
3450 case CCValAssign::ZExt:
3451 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3452 break;
3453 case CCValAssign::AExt:
3454 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3455 break;
3456 default:
3457 llvm_unreachable("Unknown loc info!");
3458 }
3459 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3461 ReadFirstLane, Arg);
3462 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3463 Glue = Chain.getValue(1);
3464 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3465 }
3466
3467 // FIXME: Does sret work properly?
3468 if (!Info->isEntryFunction()) {
3469 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3470 const MCPhysReg *I =
3471 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3472 if (I) {
3473 for (; *I; ++I) {
3474 if (AMDGPU::SReg_64RegClass.contains(*I))
3475 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3476 else if (AMDGPU::SReg_32RegClass.contains(*I))
3477 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3478 else
3479 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3480 }
3481 }
3482 }
3483
3484 // Update chain and glue.
3485 RetOps[0] = Chain;
3486 if (Glue.getNode())
3487 RetOps.push_back(Glue);
3488
3489 unsigned Opc = AMDGPUISD::ENDPGM;
3490 if (!IsWaveEnd)
3491 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3492 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3494 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3495}
3496
3498 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3499 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3500 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3501 SDValue ThisVal) const {
3502 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3503
3504 // Assign locations to each value returned by this call.
3506 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3507 *DAG.getContext());
3508 CCInfo.AnalyzeCallResult(Ins, RetCC);
3509
3510 // Copy all of the result registers out of their specified physreg.
3511 for (CCValAssign VA : RVLocs) {
3512 SDValue Val;
3513
3514 if (VA.isRegLoc()) {
3515 Val =
3516 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3517 Chain = Val.getValue(1);
3518 InGlue = Val.getValue(2);
3519 } else if (VA.isMemLoc()) {
3520 report_fatal_error("TODO: return values in memory");
3521 } else
3522 llvm_unreachable("unknown argument location type");
3523
3524 switch (VA.getLocInfo()) {
3525 case CCValAssign::Full:
3526 break;
3527 case CCValAssign::BCvt:
3528 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3529 break;
3530 case CCValAssign::ZExt:
3531 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3532 DAG.getValueType(VA.getValVT()));
3533 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3534 break;
3535 case CCValAssign::SExt:
3536 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3537 DAG.getValueType(VA.getValVT()));
3538 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3539 break;
3540 case CCValAssign::AExt:
3541 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3542 break;
3543 default:
3544 llvm_unreachable("Unknown loc info!");
3545 }
3546
3547 InVals.push_back(Val);
3548 }
3549
3550 return Chain;
3551}
3552
3553// Add code to pass special inputs required depending on used features separate
3554// from the explicit user arguments present in the IR.
3556 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3557 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3558 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3559 // If we don't have a call site, this was a call inserted by
3560 // legalization. These can never use special inputs.
3561 if (!CLI.CB)
3562 return;
3563
3564 SelectionDAG &DAG = CLI.DAG;
3565 const SDLoc &DL = CLI.DL;
3566 const Function &F = DAG.getMachineFunction().getFunction();
3567
3568 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3569 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3570
3571 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3573 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3574 // DAG.getPass() returns nullptr when using new pass manager.
3575 // TODO: Use DAG.getMFAM() to access analysis result.
3576 if (DAG.getPass()) {
3577 auto &ArgUsageInfo =
3579 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3580 }
3581 }
3582
3583 // TODO: Unify with private memory register handling. This is complicated by
3584 // the fact that at least in kernels, the input argument is not necessarily
3585 // in the same location as the input.
3586 // clang-format off
3587 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3589 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3590 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3591 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3592 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3593 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3594 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3595 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3596 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3597 };
3598 // clang-format on
3599
3600 for (auto [InputID, Attr] : ImplicitAttrs) {
3601 // If the callee does not use the attribute value, skip copying the value.
3602 if (CLI.CB->hasFnAttr(Attr))
3603 continue;
3604
3605 const auto [OutgoingArg, ArgRC, ArgTy] =
3606 CalleeArgInfo->getPreloadedValue(InputID);
3607 if (!OutgoingArg)
3608 continue;
3609
3610 const auto [IncomingArg, IncomingArgRC, Ty] =
3611 CallerArgInfo.getPreloadedValue(InputID);
3612 assert(IncomingArgRC == ArgRC);
3613
3614 // All special arguments are ints for now.
3615 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3616 SDValue InputReg;
3617
3618 if (IncomingArg) {
3619 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3620 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3621 // The implicit arg ptr is special because it doesn't have a corresponding
3622 // input for kernels, and is computed from the kernarg segment pointer.
3623 InputReg = getImplicitArgPtr(DAG, DL);
3624 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3625 std::optional<uint32_t> Id =
3626 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3627 if (Id.has_value()) {
3628 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3629 } else {
3630 InputReg = DAG.getPOISON(ArgVT);
3631 }
3632 } else {
3633 // We may have proven the input wasn't needed, although the ABI
3634 // requires it. We just need to allocate the register appropriately.
3635 InputReg = DAG.getPOISON(ArgVT);
3636 }
3637
3638 if (OutgoingArg->isRegister()) {
3639 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3640 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3641 report_fatal_error("failed to allocate implicit input argument");
3642 } else {
3643 unsigned SpecialArgOffset =
3644 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3645 SDValue ArgStore =
3646 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3647 MemOpChains.push_back(ArgStore);
3648 }
3649 }
3650
3651 // Pack workitem IDs into a single register, or pass them as-is if already
3652 // packed.
3653
3654 auto [OutgoingArg, ArgRC, Ty] =
3655 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3656 if (!OutgoingArg)
3657 std::tie(OutgoingArg, ArgRC, Ty) =
3658 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3659 if (!OutgoingArg)
3660 std::tie(OutgoingArg, ArgRC, Ty) =
3661 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3662 if (!OutgoingArg)
3663 return;
3664
3665 const ArgDescriptor *IncomingArgX = std::get<0>(
3666 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3667 const ArgDescriptor *IncomingArgY = std::get<0>(
3668 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3669 const ArgDescriptor *IncomingArgZ = std::get<0>(
3670 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3672 SDValue InputReg;
3673 SDLoc SL;
3674
3675 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3676 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3677 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3678
3679 // If incoming ids are not packed we need to pack them.
3680 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3681 NeedWorkItemIDX) {
3682 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3683 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3684 } else {
3685 InputReg = DAG.getConstant(0, DL, MVT::i32);
3686 }
3687 }
3688
3689 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3690 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3691 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3692 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3693 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3694 InputReg = InputReg.getNode()
3695 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3696 : Y;
3697 }
3698
3699 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3700 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3701 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3702 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3703 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3704 InputReg = InputReg.getNode()
3705 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3706 : Z;
3707 }
3708
3709 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3710 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3711 // We're in a situation where the outgoing function requires the workitem
3712 // ID, but the calling function does not have it (e.g. a graphics function
3713 // calling a C calling convention function). This is illegal, but we need
3714 // to produce something.
3715 InputReg = DAG.getPOISON(MVT::i32);
3716 } else {
3717 // Workitem IDs are already packed; any of the present incoming arguments
3718 // will carry all required fields.
3719 ArgDescriptor IncomingArg =
3720 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3721 : IncomingArgY ? *IncomingArgY
3722 : *IncomingArgZ,
3723 ~0u);
3724 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3725 }
3726 }
3727
3728 if (OutgoingArg->isRegister()) {
3729 if (InputReg)
3730 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3731
3732 CCInfo.AllocateReg(OutgoingArg->getRegister());
3733 } else {
3734 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3735 if (InputReg) {
3736 SDValue ArgStore =
3737 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3738 MemOpChains.push_back(ArgStore);
3739 }
3740 }
3741}
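Editorial note (not part of SIISelLowering.cpp): the workitem-ID packing built above places X in bits [9:0], Y in bits [19:10] and Z in bits [29:20] of a single 32-bit register, which is exactly what the SHL-by-10/20 and OR nodes construct. A standalone sketch of that layout; the helper name and the explicit masking are illustrative only (the DAG code does not mask because the IDs are already range-limited):

#include <cstdint>

// Illustrative only: mirror the packed workitem-ID layout produced by the
// lowering above.
static uint32_t packWorkitemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ff) | ((Y & 0x3ff) << 10) | ((Z & 0x3ff) << 20);
}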
3742
3743 bool SITargetLowering::isEligibleForTailCallOptimization(
3744 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3745 const SmallVectorImpl<ISD::OutputArg> &Outs,
3746 const SmallVectorImpl<SDValue> &OutVals,
3747 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3748 if (AMDGPU::isChainCC(CalleeCC))
3749 return true;
3750
3751 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3752 return false;
3753
3754 // For a divergent call target, we need to do a waterfall loop over the
3755 // possible callees which precludes us from using a simple jump.
3756 if (Callee->isDivergent())
3757 return false;
3758
3759 MachineFunction &MF = DAG.getMachineFunction();
3760 const Function &CallerF = MF.getFunction();
3761 CallingConv::ID CallerCC = CallerF.getCallingConv();
3762 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3763 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3764
3765 // Kernels aren't callable, and don't have a live in return address so it
3766 // doesn't make sense to do a tail call with entry functions.
3767 if (!CallerPreserved)
3768 return false;
3769
3770 bool CCMatch = CallerCC == CalleeCC;
3771
3772 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
3773 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3774 return true;
3775 return false;
3776 }
3777
3778 // TODO: Can we handle var args?
3779 if (IsVarArg)
3780 return false;
3781
3782 for (const Argument &Arg : CallerF.args()) {
3783 if (Arg.hasByValAttr())
3784 return false;
3785 }
3786
3787 LLVMContext &Ctx = *DAG.getContext();
3788
3789 // Check that the call results are passed in the same way.
3790 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3791 CCAssignFnForCall(CalleeCC, IsVarArg),
3792 CCAssignFnForCall(CallerCC, IsVarArg)))
3793 return false;
3794
3795 // The callee has to preserve all registers the caller needs to preserve.
3796 if (!CCMatch) {
3797 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3798 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3799 return false;
3800 }
3801
3802 // Nothing more to check if the callee is taking no arguments.
3803 if (Outs.empty())
3804 return true;
3805
3806 SmallVector<CCValAssign, 16> ArgLocs;
3807 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3808
3809 // FIXME: We are not allocating special input registers, so we will be
3810 // deciding based on incorrect register assignments.
3811 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3812
3813 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3814 // If the stack arguments for this call do not fit into our own save area then
3815 // the call cannot be made tail.
3816 // TODO: Is this really necessary?
3817 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3818 return false;
3819
3820 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3821 // FIXME: What about inreg arguments that end up passed in memory?
3822 if (!CCVA.isRegLoc())
3823 continue;
3824
3825 // If we are passing an argument in an SGPR, and the value is divergent,
3826 // this call requires a waterfall loop.
3827 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3828 LLVM_DEBUG(
3829 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3830 << printReg(CCVA.getLocReg(), TRI) << '\n');
3831 return false;
3832 }
3833 }
3834
3835 const MachineRegisterInfo &MRI = MF.getRegInfo();
3836 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3837}
3838
3839 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3840 if (!CI->isTailCall())
3841 return false;
3842
3843 const Function *ParentFn = CI->getParent()->getParent();
3844 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3845 return false;
3846 return true;
3847}
3848
3849namespace {
3850// Chain calls have special arguments that we need to handle. These tag
3851// along at the end of the argument list(s), after the SGPR and VGPR
3852// arguments (index 0 and 1 respectively).
3853enum ChainCallArgIdx {
3854 Exec = 2,
3855 Flags,
3856 NumVGPRs,
3857 FallbackExec,
3858 FallbackCallee
3859};
3860} // anonymous namespace
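Editorial note (not part of SIISelLowering.cpp): with this numbering, CLI.Args for an llvm.amdgcn.cs.chain call is expected to look roughly like [0] SGPR arguments, [1] VGPR arguments, [2] EXEC mask, [3] flags, and, when bit 0 of the flags selects dynamic-VGPR mode, [4] NumVGPRs, [5] FallbackExec and [6] FallbackCallee; the callee itself travels separately in CLI.Callee. The code in LowerCall below indexes the list with exactly these names, e.g.:

// Illustrative usage of the indices above (see LowerCall below):
//   TargetLowering::ArgListEntry Exec  = CLI.Args[ChainCallArgIdx::Exec];
//   TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];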
3861
3862// The wave scratch offset register is used as the global base pointer.
3863 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3864 SmallVectorImpl<SDValue> &InVals) const {
3865 CallingConv::ID CallConv = CLI.CallConv;
3866 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3867
3868 SelectionDAG &DAG = CLI.DAG;
3869
3870 const SDLoc &DL = CLI.DL;
3871 SDValue Chain = CLI.Chain;
3872 SDValue Callee = CLI.Callee;
3873
3874 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3875 bool UsesDynamicVGPRs = false;
3876 if (IsChainCallConv) {
3877 // The last arguments should be the value that we need to put in EXEC,
3878 // followed by the flags and any other arguments with special meanings.
3879 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3880 // we don't treat them like the "real" arguments.
3881 auto RequestedExecIt =
3882 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3883 return Arg.OrigArgIndex == 2;
3884 });
3885 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3886
3887 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3888 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3889 CLI.OutVals.end());
3890 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3891
3892 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3893 "Haven't popped all the special args");
3894
3895 TargetLowering::ArgListEntry RequestedExecArg =
3896 CLI.Args[ChainCallArgIdx::Exec];
3897 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3898 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3899
3900 // Convert constants into TargetConstants, so they become immediate operands
3901 // instead of being selected into S_MOV.
3902 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3903 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3904 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3905 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3906 } else
3907 ChainCallSpecialArgs.push_back(Arg.Node);
3908 };
3909
3910 PushNodeOrTargetConstant(RequestedExecArg);
3911
3912 // Process any other special arguments depending on the value of the flags.
3913 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3914
3915 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3916 if (FlagsValue.isZero()) {
3917 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3918 return lowerUnhandledCall(CLI, InVals,
3919 "no additional args allowed if flags == 0");
3920 } else if (FlagsValue.isOneBitSet(0)) {
3921 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3922 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3923 }
3924
3925 if (!Subtarget->isWave32()) {
3926 return lowerUnhandledCall(
3927 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3928 }
3929
3930 UsesDynamicVGPRs = true;
3931 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3932 CLI.Args.end(), PushNodeOrTargetConstant);
3933 }
3934 }
3935
3936 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3937 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3938 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3939 bool &IsTailCall = CLI.IsTailCall;
3940 bool IsVarArg = CLI.IsVarArg;
3941 bool IsSibCall = false;
3942 MachineFunction &MF = DAG.getMachineFunction();
3943
3944 if (Callee.isUndef() || isNullConstant(Callee)) {
3945 if (!CLI.IsTailCall) {
3946 for (ISD::InputArg &Arg : CLI.Ins)
3947 InVals.push_back(DAG.getPOISON(Arg.VT));
3948 }
3949
3950 return Chain;
3951 }
3952
3953 if (IsVarArg) {
3954 return lowerUnhandledCall(CLI, InVals,
3955 "unsupported call to variadic function ");
3956 }
3957
3958 if (!CLI.CB)
3959 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
3960
3961 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3962 return lowerUnhandledCall(CLI, InVals,
3963 "unsupported required tail call to function ");
3964 }
3965
3966 if (IsTailCall) {
3967 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3968 Outs, OutVals, Ins, DAG);
3969 if (!IsTailCall &&
3970 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3971 report_fatal_error("failed to perform tail call elimination on a call "
3972 "site marked musttail or on llvm.amdgcn.cs.chain");
3973 }
3974
3975 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3976
3977 // A sibling call is one where we're under the usual C ABI and not planning
3978 // to change that but can still do a tail call:
3979 if (!TailCallOpt && IsTailCall)
3980 IsSibCall = true;
3981
3982 if (IsTailCall)
3983 ++NumTailCalls;
3984 }
3985
3986 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3987 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3988 SmallVector<SDValue, 8> MemOpChains;
3989
3990 // Analyze operands of the call, assigning locations to each operand.
3991 SmallVector<CCValAssign, 16> ArgLocs;
3992 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3993 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3994
3995 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
3997 // With a fixed ABI, allocate fixed registers before user arguments.
3998 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3999 }
4000
4001 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4002
4003 // Get a count of how many bytes are to be pushed on the stack.
4004 unsigned NumBytes = CCInfo.getStackSize();
4005
4006 if (IsSibCall) {
4007 // Since we're not changing the ABI to make this a tail call, the memory
4008 // operands are already available in the caller's incoming argument space.
4009 NumBytes = 0;
4010 }
4011
4012 // FPDiff is the byte offset of the call's argument area from the callee's.
4013 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4014 // by this amount for a tail call. In a sibling call it must be 0 because the
4015 // caller will deallocate the entire stack and the callee still expects its
4016 // arguments to begin at SP+0. Completely unused for non-tail calls.
4017 int32_t FPDiff = 0;
4018 MachineFrameInfo &MFI = MF.getFrameInfo();
4019 auto *TRI = Subtarget->getRegisterInfo();
4020
4021 // Adjust the stack pointer for the new arguments...
4022 // These operations are automatically eliminated by the prolog/epilog pass
4023 if (!IsSibCall)
4024 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4025
4026 if (!IsSibCall || IsChainCallConv) {
4027 if (!Subtarget->enableFlatScratch()) {
4028 SmallVector<SDValue, 4> CopyFromChains;
4029
4030 // In the HSA case, this should be an identity copy.
4031 SDValue ScratchRSrcReg =
4032 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4033 RegsToPass.emplace_back(IsChainCallConv
4034 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4035 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4036 ScratchRSrcReg);
4037 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4038 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4039 }
4040 }
4041
4042 const unsigned NumSpecialInputs = RegsToPass.size();
4043
4044 MVT PtrVT = MVT::i32;
4045
4046 // Walk the register/memloc assignments, inserting copies/loads.
4047 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4048 CCValAssign &VA = ArgLocs[i];
4049 SDValue Arg = OutVals[i];
4050
4051 // Promote the value if needed.
4052 switch (VA.getLocInfo()) {
4053 case CCValAssign::Full:
4054 break;
4055 case CCValAssign::BCvt:
4056 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4057 break;
4058 case CCValAssign::ZExt:
4059 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4060 break;
4061 case CCValAssign::SExt:
4062 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4063 break;
4064 case CCValAssign::AExt:
4065 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4066 break;
4067 case CCValAssign::FPExt:
4068 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4069 break;
4070 default:
4071 llvm_unreachable("Unknown loc info!");
4072 }
4073
4074 if (VA.isRegLoc()) {
4075 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4076 } else {
4077 assert(VA.isMemLoc());
4078
4079 SDValue DstAddr;
4080 MachinePointerInfo DstInfo;
4081
4082 unsigned LocMemOffset = VA.getLocMemOffset();
4083 int32_t Offset = LocMemOffset;
4084
4085 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4086 MaybeAlign Alignment;
4087
4088 if (IsTailCall) {
4089 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4090 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4091 : VA.getValVT().getStoreSize();
4092
4093 // FIXME: We can have better than the minimum byval required alignment.
4094 Alignment =
4095 Flags.isByVal()
4096 ? Flags.getNonZeroByValAlign()
4097 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4098
4099 Offset = Offset + FPDiff;
4100 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4101
4102 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4103 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4104
4105 // Make sure any stack arguments overlapping with where we're storing
4106 // are loaded before this eventual operation. Otherwise they'll be
4107 // clobbered.
4108
4109 // FIXME: Why is this really necessary? This seems to just result in a
4110 // lot of code to copy the stack and write them back to the same
4111 // locations, which are supposed to be immutable?
4112 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4113 } else {
4114 // Stores to the argument stack area are relative to the stack pointer.
4115 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4116 MVT::i32);
4117 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4118 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4119 Alignment =
4120 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4121 }
4122
4123 if (Outs[i].Flags.isByVal()) {
4124 SDValue SizeNode =
4125 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4126 SDValue Cpy =
4127 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4128 Outs[i].Flags.getNonZeroByValAlign(),
4129 /*isVol = */ false, /*AlwaysInline = */ true,
4130 /*CI=*/nullptr, std::nullopt, DstInfo,
4131 MachinePointerInfo::getStack(MF, LocMemOffset));
4132
4133 MemOpChains.push_back(Cpy);
4134 } else {
4135 SDValue Store =
4136 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4137 MemOpChains.push_back(Store);
4138 }
4139 }
4140 }
4141
4142 if (!MemOpChains.empty())
4143 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4144
4145 SDValue ReadFirstLaneID =
4146 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4147
4148 SDValue TokenGlue;
4149 if (CLI.ConvergenceControlToken) {
4150 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4151 CLI.ConvergenceControlToken);
4152 }
4153
4154 // Build a sequence of copy-to-reg nodes chained together with token chain
4155 // and flag operands which copy the outgoing args into the appropriate regs.
4156 SDValue InGlue;
4157
4158 unsigned ArgIdx = 0;
4159 for (auto [Reg, Val] : RegsToPass) {
4160 if (ArgIdx++ >= NumSpecialInputs &&
4161 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4162 // For chain calls, the inreg arguments are required to be
4163 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4164 // they are uniform.
4165 //
4166 // For other calls, if an inreg argument is known to be uniform,
4167 // speculatively insert a readfirstlane in case it is in a VGPR.
4168 //
4169 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4170 // value, so let that continue to produce invalid code.
4171
4172 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4173 if (TokenGlue)
4174 ReadfirstlaneArgs.push_back(TokenGlue);
4175 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4176 ReadfirstlaneArgs);
4177 }
4178
4179 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4180 InGlue = Chain.getValue(1);
4181 }
4182
4183 // We don't usually want to end the call-sequence here because we would tidy
4184 // the frame up *after* the call, however in the ABI-changing tail-call case
4185 // we've carefully laid out the parameters so that when sp is reset they'll be
4186 // in the correct location.
4187 if (IsTailCall && !IsSibCall) {
4188 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4189 InGlue = Chain.getValue(1);
4190 }
4191
4192 std::vector<SDValue> Ops({Chain});
4193
4194 // Add a redundant copy of the callee global which will not be legalized, as
4195 // we need direct access to the callee later.
4196 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4197 const GlobalValue *GV = GSD->getGlobal();
4198 Ops.push_back(Callee);
4199 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4200 } else {
4201 if (IsTailCall) {
4202 // isEligibleForTailCallOptimization considered whether the call target is
4203 // divergent, but we may still end up with a uniform value in a VGPR.
4204 // Insert a readfirstlane just in case.
4205 SDValue ReadFirstLaneID =
4206 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4207
4208 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4209 if (TokenGlue)
4210 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4211 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4212 ReadfirstlaneArgs);
4213 }
4214
4215 Ops.push_back(Callee);
4216 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4217 }
4218
4219 if (IsTailCall) {
4220 // Each tail call may have to adjust the stack by a different amount, so
4221 // this information must travel along with the operation for eventual
4222 // consumption by emitEpilogue.
4223 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4224 }
4225
4226 if (IsChainCallConv)
4227 llvm::append_range(Ops, ChainCallSpecialArgs);
4228
4229 // Add argument registers to the end of the list so that they are known live
4230 // into the call.
4231 for (auto &[Reg, Val] : RegsToPass)
4232 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4233
4234 // Add a register mask operand representing the call-preserved registers.
4235 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4236 assert(Mask && "Missing call preserved mask for calling convention");
4237 Ops.push_back(DAG.getRegisterMask(Mask));
4238
4239 if (SDValue Token = CLI.ConvergenceControlToken) {
4240 SmallVector<SDValue, 2> GlueOps;
4241 GlueOps.push_back(Token);
4242 if (InGlue)
4243 GlueOps.push_back(InGlue);
4244
4245 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4246 MVT::Glue, GlueOps),
4247 0);
4248 }
4249
4250 if (InGlue)
4251 Ops.push_back(InGlue);
4252
4253 // If we're doing a tail call, use a TC_RETURN here rather than an
4254 // actual call instruction.
4255 if (IsTailCall) {
4256 MFI.setHasTailCall();
4257 unsigned OPC = AMDGPUISD::TC_RETURN;
4258 switch (CallConv) {
4259 case CallingConv::AMDGPU_Gfx:
4260 OPC = AMDGPUISD::TC_RETURN_GFX;
4261 break;
4262 case CallingConv::AMDGPU_CS_Chain:
4263 case CallingConv::AMDGPU_CS_ChainPreserve:
4264 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4265 : AMDGPUISD::TC_RETURN_CHAIN;
4266 break;
4267 }
4268
4269 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4270 }
4271
4272 // Returns a chain and a flag for retval copy to use.
4273 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4274 Chain = Call.getValue(0);
4275 InGlue = Call.getValue(1);
4276
4277 uint64_t CalleePopBytes = NumBytes;
4278 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4279 if (!Ins.empty())
4280 InGlue = Chain.getValue(1);
4281
4282 // Handle result values, copying them out of physregs into vregs that we
4283 // return.
4284 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4285 InVals, /*IsThisReturn=*/false, SDValue());
4286}
4287
4288// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4289// except for:
4290// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4291// 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
4292 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
4293 SelectionDAG &DAG) const {
4294 const MachineFunction &MF = DAG.getMachineFunction();
4295 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4296
4297 SDLoc dl(Op);
4298 EVT VT = Op.getValueType();
4299 SDValue Chain = Op.getOperand(0);
4300 Register SPReg = Info->getStackPtrOffsetReg();
4301
4302 // Chain the dynamic stack allocation so that it doesn't modify the stack
4303 // pointer when other instructions are using the stack.
4304 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4305
4306 SDValue Size = Op.getOperand(1);
4307 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4308 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4309
4310 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4311 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4312 "Stack grows upwards for AMDGPU");
4313
4314 Chain = BaseAddr.getValue(1);
4315 Align StackAlign = TFL->getStackAlign();
4316 if (Alignment > StackAlign) {
4317 uint64_t ScaledAlignment = Alignment.value()
4318 << Subtarget->getWavefrontSizeLog2();
4319 uint64_t StackAlignMask = ScaledAlignment - 1;
4320 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4321 DAG.getConstant(StackAlignMask, dl, VT));
4322 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4323 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4324 }
4325
4326 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4327 SDValue NewSP;
4328 if (isa<ConstantSDNode>(Size)) {
4329 // For constant sized alloca, scale alloca size by wave-size
4330 SDValue ScaledSize = DAG.getNode(
4331 ISD::SHL, dl, VT, Size,
4332 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4333 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4334 } else {
4335 // For dynamic sized alloca, perform wave-wide reduction to get max of
4336 // alloca size(divergent) and then scale it by wave-size
4337 SDValue WaveReduction =
4338 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4339 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4340 Size, DAG.getConstant(0, dl, MVT::i32));
4341 SDValue ScaledSize = DAG.getNode(
4342 ISD::SHL, dl, VT, Size,
4343 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4344 NewSP =
4345 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4346 SDValue ReadFirstLaneID =
4347 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4348 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4349 NewSP);
4350 }
4351
4352 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4353 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4354
4355 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4356}
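Editorial note (not part of SIISelLowering.cpp): for a uniform allocation the lowering above boils down to the following arithmetic, a sketch that assumes a power-of-two alignment and that SP is the per-wave scratch offset. The stack grows upwards and every lane owns a wave-size-strided slice, so both the over-alignment and the size are scaled by the wave size before SP is bumped:

#include <cstdint>

// Illustrative only: realigned base address and new stack pointer, computed
// the same way as the DYNAMIC_STACKALLOC lowering above for a uniform size.
static void bumpScratchSP(uint32_t SP, uint32_t Size, uint32_t Alignment,
                          unsigned WavefrontSizeLog2, uint32_t &Base,
                          uint32_t &NewSP) {
  uint32_t ScaledAlign = Alignment << WavefrontSizeLog2;
  Base = (SP + ScaledAlign - 1) & ~(ScaledAlign - 1); // realign if over-aligned
  NewSP = Base + (Size << WavefrontSizeLog2);         // wave-scaled bump
}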
4357
4358 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4359 if (Op.getValueType() != MVT::i32)
4360 return Op; // Defer to cannot select error.
4361
4362 Register SP = getStackPointerRegisterToSaveRestore();
4363 SDLoc SL(Op);
4364
4365 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4366
4367 // Convert from wave uniform to swizzled vector address. This should protect
4368 // from any edge cases where the stacksave result isn't directly used with
4369 // stackrestore.
4370 SDValue VectorAddress =
4371 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4372 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4373}
4374
4375 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4376 SelectionDAG &DAG) const {
4377 SDLoc SL(Op);
4378 assert(Op.getValueType() == MVT::i32);
4379
4380 uint32_t BothRoundHwReg =
4381 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4382 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4383
4384 SDValue IntrinID =
4385 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4386 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4387 Op.getOperand(0), IntrinID, GetRoundBothImm);
4388
4389 // There are two rounding modes, one for f32 and one for f64/f16. We only
4390 // report in the standard value range if both are the same.
4391 //
4392 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4393 // ties away from zero is not supported, and the other values are rotated by
4394 // 1.
4395 //
4396 // If the two rounding modes are not the same, report a target defined value.
4397
4398 // Mode register rounding mode fields:
4399 //
4400 // [1:0] Single-precision round mode.
4401 // [3:2] Double/Half-precision round mode.
4402 //
4403 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4404 //
4405 // Hardware Spec
4406 // Toward-0 3 0
4407 // Nearest Even 0 1
4408 // +Inf 1 2
4409 // -Inf 2 3
4410 // NearestAway0 N/A 4
4411 //
4412 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4413 // table we can index by the raw hardware mode.
4414 //
4415 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4416
4417 SDValue BitTable =
4418 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4419
4420 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4421 SDValue RoundModeTimesNumBits =
4422 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4423
4424 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4425 // knew only one mode was demanded.
4426 SDValue TableValue =
4427 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4428 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4429
4430 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4431 SDValue TableEntry =
4432 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4433
4434 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4435 // if it's an extended value.
4436 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4437 SDValue IsStandardValue =
4438 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4439 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4440 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4441 TableEntry, EnumOffset);
4442
4443 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4444}
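Editorial note (not part of SIISelLowering.cpp): the lookup described in the comments above behaves like the sketch below; the 64-bit table constant itself is AMDGPU::FltRoundConversionTable and is not reproduced here:

#include <cstdint>

// Illustrative only: select the 4-bit entry for the raw MODE.fp_round value,
// then bias non-standard entries by 4 so they land in the extended range.
static uint32_t decodeFltRounds(uint64_t ConversionTable, uint32_t HwFpRound) {
  uint32_t Entry = (ConversionTable >> (HwFpRound * 4)) & 0xf;
  return Entry < 4 ? Entry : Entry + 4; // extended values start at 8
}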
4445
4446 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4447 SelectionDAG &DAG) const {
4448 SDLoc SL(Op);
4449
4450 SDValue NewMode = Op.getOperand(1);
4451 assert(NewMode.getValueType() == MVT::i32);
4452
4453 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4454 // hardware MODE.fp_round values.
4455 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4456 uint32_t ClampedVal = std::min(
4457 static_cast<uint32_t>(ConstMode->getZExtValue()),
4458 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4459 NewMode = DAG.getConstant(
4460 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4461 } else {
4462 // If we know the input can only be one of the supported standard modes in
4463 // the range 0-3, we can use a simplified mapping to hardware values.
4464 KnownBits KB = DAG.computeKnownBits(NewMode);
4465 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4466 // The supported standard values are 0-3. The extended values start at 8. We
4467 // need to offset by 4 if the value is in the extended range.
4468
4469 if (UseReducedTable) {
4470 // Truncate to the low 32-bits.
4471 SDValue BitTable = DAG.getConstant(
4472 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4473
4474 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4475 SDValue RoundModeTimesNumBits =
4476 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4477
4478 NewMode =
4479 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4480
4481 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4482 // the table extracted bits into inline immediates.
4483 } else {
4484 // table_index = umin(value, value - 4)
4485 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4486 SDValue BitTable =
4487 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4488
4489 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4490 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4491 SDValue IndexVal =
4492 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4493
4494 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4495 SDValue RoundModeTimesNumBits =
4496 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4497
4498 SDValue TableValue =
4499 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4500 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4501
4502 // No need to mask out the high bits since the setreg will ignore them
4503 // anyway.
4504 NewMode = TruncTable;
4505 }
4506
4507 // Insert a readfirstlane in case the value is a VGPR. We could do this
4508 // earlier and keep more operations scalar, but that interferes with
4509 // combining the source.
4510 SDValue ReadFirstLaneID =
4511 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4512 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4513 ReadFirstLaneID, NewMode);
4514 }
4515
4516 // N.B. The setreg will be later folded into s_round_mode on supported
4517 // targets.
4518 SDValue IntrinID =
4519 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4520 uint32_t BothRoundHwReg =
4521 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4522 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4523
4524 SDValue SetReg =
4525 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4526 IntrinID, RoundBothImm, NewMode);
4527
4528 return SetReg;
4529}
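Editorial note (not part of SIISelLowering.cpp): the umin trick used for the non-constant path above collapses the standard FLT_ROUNDS inputs 0-3 and the extended inputs 8-11 onto a dense 0-7 table index:

#include <algorithm>
#include <cstdint>

// Illustrative only: table_index = umin(value, value - 4). For value < 4 the
// subtraction wraps around, so the minimum picks value itself; for the
// extended range (8-11) it picks value - 4, i.e. indices 4-7.
static uint32_t fltRoundsTableIndex(uint32_t Value) {
  return std::min(Value, Value - 4u);
}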
4530
4531 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4532 if (Op->isDivergent() &&
4533 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4534 // Cannot do I$ prefetch with divergent pointer.
4535 return SDValue();
4536
4537 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4538 case AMDGPUAS::FLAT_ADDRESS:
4539 case AMDGPUAS::GLOBAL_ADDRESS:
4540 case AMDGPUAS::CONSTANT_ADDRESS:
4541 break;
4542 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4543 if (Subtarget->hasSafeSmemPrefetch())
4544 break;
4545 [[fallthrough]];
4546 default:
4547 return SDValue();
4548 }
4549
4550 // I$ prefetch
4551 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4552 return SDValue();
4553
4554 return Op;
4555}
4556
4557// Work around DAG legality rules based only on the result type.
4558 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4559 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4560 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4561 EVT SrcVT = Src.getValueType();
4562
4563 if (SrcVT.getScalarType() != MVT::bf16)
4564 return Op;
4565
4566 SDLoc SL(Op);
4567 SDValue BitCast =
4568 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4569
4570 EVT DstVT = Op.getValueType();
4571 if (IsStrict)
4572 llvm_unreachable("Need STRICT_BF16_TO_FP");
4573
4574 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4575}
4576
4577 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4578 SDLoc SL(Op);
4579 if (Op.getValueType() != MVT::i64)
4580 return Op;
4581
4582 uint32_t ModeHwReg =
4583 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 32);
4584 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4585 uint32_t TrapHwReg =
4586 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 32);
4587 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4588
4589 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4590 SDValue IntrinID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4592 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4593 Op.getOperand(0), IntrinID, ModeHwRegImm);
4594 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4595 Op.getOperand(0), IntrinID, TrapHwRegImm);
4596 SDValue TokenReg =
4597 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4598 GetTrapReg.getValue(1));
4599
4600 SDValue CvtPtr =
4601 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4602 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4603
4604 return DAG.getMergeValues({Result, TokenReg}, SL);
4605}
4606
4607 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4608 SDLoc SL(Op);
4609 if (Op.getOperand(1).getValueType() != MVT::i64)
4610 return Op;
4611
4612 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4613 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4614 DAG.getConstant(0, SL, MVT::i32));
4615 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4616 DAG.getConstant(1, SL, MVT::i32));
4617
4618 SDValue ReadFirstLaneID =
4619 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4620 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4621 ReadFirstLaneID, NewModeReg);
4622 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4623 ReadFirstLaneID, NewTrapReg);
4624
4625 unsigned ModeHwReg =
4626 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 32);
4627 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4628 unsigned TrapHwReg =
4629 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 32);
4630 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4631
4632 SDValue IntrinID =
4633 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4634 SDValue SetModeReg =
4635 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4636 IntrinID, ModeHwRegImm, NewModeReg);
4637 SDValue SetTrapReg =
4638 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4639 IntrinID, TrapHwRegImm, NewTrapReg);
4640 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4641}
4642
4643 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4644 const MachineFunction &MF) const {
4645 const Function &Fn = MF.getFunction();
4646
4647 Register Reg = StringSwitch<Register>(RegName)
4648 .Case("m0", AMDGPU::M0)
4649 .Case("exec", AMDGPU::EXEC)
4650 .Case("exec_lo", AMDGPU::EXEC_LO)
4651 .Case("exec_hi", AMDGPU::EXEC_HI)
4652 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4653 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4654 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4655 .Default(Register());
4656 if (!Reg)
4657 return Reg;
4658
4659 if (!Subtarget->hasFlatScrRegister() &&
4660 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4661 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4662 "\" for subtarget."));
4663 }
4664
4665 switch (Reg) {
4666 case AMDGPU::M0:
4667 case AMDGPU::EXEC_LO:
4668 case AMDGPU::EXEC_HI:
4669 case AMDGPU::FLAT_SCR_LO:
4670 case AMDGPU::FLAT_SCR_HI:
4671 if (VT.getSizeInBits() == 32)
4672 return Reg;
4673 break;
4674 case AMDGPU::EXEC:
4675 case AMDGPU::FLAT_SCR:
4676 if (VT.getSizeInBits() == 64)
4677 return Reg;
4678 break;
4679 default:
4680 llvm_unreachable("missing register type checking");
4681 }
4682
4683 report_fatal_error(
4684 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4685}
4686
4687// If kill is not the last instruction, split the block so kill is always a
4688// proper terminator.
4689 MachineBasicBlock *
4690 SITargetLowering::splitKillBlock(MachineInstr &MI,
4691 MachineBasicBlock *BB) const {
4692 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4693 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4694 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4695 return SplitBB;
4696}
4697
4698// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4699// \p MI will be the only instruction in the loop body block. Otherwise, it will
4700// be the first instruction in the remainder block.
4701//
4702/// \returns { LoopBody, Remainder }
4703static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4704 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4705 MachineFunction *MF = MBB.getParent();
4706 MachineBasicBlock::iterator I(&MI);
4707
4708 // To insert the loop we need to split the block. Move everything after this
4709 // point to a new block, and insert a new empty block between the two.
4710 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4711 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4712 MachineFunction::iterator MBBI(MBB);
4713 ++MBBI;
4714
4715 MF->insert(MBBI, LoopBB);
4716 MF->insert(MBBI, RemainderBB);
4717
4718 LoopBB->addSuccessor(LoopBB);
4719 LoopBB->addSuccessor(RemainderBB);
4720
4721 // Move the rest of the block into a new block.
4722 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4723
4724 if (InstInLoop) {
4725 auto Next = std::next(I);
4726
4727 // Move instruction to loop body.
4728 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4729
4730 // Move the rest of the block.
4731 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4732 } else {
4733 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4734 }
4735
4736 MBB.addSuccessor(LoopBB);
4737
4738 return std::pair(LoopBB, RemainderBB);
4739}
4740
4741/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4742 static void bundleInstWithWaitcnt(MachineInstr &MI) {
4743 MachineBasicBlock *MBB = MI.getParent();
4745 auto I = MI.getIterator();
4746 auto E = std::next(I);
4747
4748 // clang-format off
4749 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4750 .addImm(0);
4751 // clang-format on
4752
4753 MIBundleBuilder Bundler(*MBB, I, E);
4754 finalizeBundle(*MBB, Bundler.begin());
4755}
4756
4757 MachineBasicBlock *
4758 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4759 MachineBasicBlock *BB) const {
4760 const DebugLoc &DL = MI.getDebugLoc();
4761
4763
4765
4766 // Apparently kill flags are only valid if the def is in the same block?
4767 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4768 Src->setIsKill(false);
4769
4770 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4771
4772 MachineBasicBlock::iterator I = LoopBB->end();
4773
4774 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4775 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4776
4777 // Clear TRAP_STS.MEM_VIOL
4778 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4779 .addImm(0)
4780 .addImm(EncodedReg);
4781
4783
4784 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4785
4786 // Load and check TRAP_STS.MEM_VIOL
4787 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4788 .addImm(EncodedReg);
4789
4790 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4791 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4792 .addReg(Reg, RegState::Kill)
4793 .addImm(0);
4794 // clang-format off
4795 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4796 .addMBB(LoopBB);
4797 // clang-format on
4798
4799 return RemainderBB;
4800}
4801
4802// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4803// wavefront. If the value is uniform and just happens to be in a VGPR, this
4804// will only do one iteration. In the worst case, this will loop 64 times.
4805//
4806// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4807 static MachineBasicBlock::iterator
4808 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4809 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4810 const DebugLoc &DL, const MachineOperand &Idx,
4811 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4812 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4813 Register &SGPRIdxReg) {
4814
4815 MachineFunction *MF = OrigBB.getParent();
4816 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4817 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4819
4820 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4821 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4822 Register NewExec = MRI.createVirtualRegister(BoolRC);
4823 Register CurrentIdxReg =
4824 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4825 Register CondReg = MRI.createVirtualRegister(BoolRC);
4826
4827 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4828 .addReg(InitReg)
4829 .addMBB(&OrigBB)
4830 .addReg(ResultReg)
4831 .addMBB(&LoopBB);
4832
4833 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4834 .addReg(InitSaveExecReg)
4835 .addMBB(&OrigBB)
4836 .addReg(NewExec)
4837 .addMBB(&LoopBB);
4838
4839 // Read the next variant <- also loop target.
4840 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4841 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4842
4843 // Compare the just read M0 value to all possible Idx values.
4844 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4845 .addReg(CurrentIdxReg)
4846 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4847
4848 // Update EXEC, save the original EXEC value to VCC.
4849 BuildMI(LoopBB, I, DL,
4850 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4851 : AMDGPU::S_AND_SAVEEXEC_B64),
4852 NewExec)
4853 .addReg(CondReg, RegState::Kill);
4854
4855 MRI.setSimpleHint(NewExec, CondReg);
4856
4857 if (UseGPRIdxMode) {
4858 if (Offset == 0) {
4859 SGPRIdxReg = CurrentIdxReg;
4860 } else {
4861 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4862 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4863 .addReg(CurrentIdxReg, RegState::Kill)
4864 .addImm(Offset);
4865 }
4866 } else {
4867 // Move index from VCC into M0
4868 if (Offset == 0) {
4869 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4870 .addReg(CurrentIdxReg, RegState::Kill);
4871 } else {
4872 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4873 .addReg(CurrentIdxReg, RegState::Kill)
4874 .addImm(Offset);
4875 }
4876 }
4877
4878 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4879 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4880 MachineInstr *InsertPt =
4881 BuildMI(LoopBB, I, DL,
4882 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4883 : AMDGPU::S_XOR_B64_term),
4884 Exec)
4885 .addReg(Exec)
4886 .addReg(NewExec);
4887
4888 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4889 // s_cbranch_scc0?
4890
4891 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4892 // clang-format off
4893 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4894 .addMBB(&LoopBB);
4895 // clang-format on
4896
4897 return InsertPt->getIterator();
4898}
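Editorial note (not part of SIISelLowering.cpp): the waterfall loop assembled above has roughly the following shape, shown here as pseudo-assembly for the wave64, offset == 0, M0 path, with placeholder register names:

// loop:
//   v_readfirstlane_b32 s_idx, v_idx          ; pick the index of one lane
//   v_cmp_eq_u32        vcc, s_idx, v_idx     ; all lanes sharing that index
//   s_and_saveexec_b64  s_save, vcc           ; s_save = EXEC, EXEC &= vcc
//   s_mov_b32           m0, s_idx             ; (or s_add_i32 m0, s_idx, off)
//   ; ...indexed access inserted here by the caller...
//   s_xor_b64           exec, exec, s_save    ; drop the lanes just handled
//   s_cbranch_execnz    loop                  ; more lanes left?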
4899
4900// This has slightly sub-optimal regalloc when the source vector is killed by
4901// the read. The register allocator does not understand that the kill is
4902// per-workitem, so the value is kept alive for the whole loop; we end up not
4903// reusing a subregister from it and use 1 more VGPR than necessary. That extra
4904// register was avoided when this was expanded after register allocation.
4905 static MachineBasicBlock::iterator
4906 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4907 unsigned InitResultReg, unsigned PhiReg, int Offset,
4908 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4909 MachineFunction *MF = MBB.getParent();
4910 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4911 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4912 MachineRegisterInfo &MRI = MF->getRegInfo();
4913 const DebugLoc &DL = MI.getDebugLoc();
4914 MachineBasicBlock::iterator I(&MI);
4915
4916 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4917 Register DstReg = MI.getOperand(0).getReg();
4918 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4919 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4920 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4921 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4922
4923 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4924
4925 // Save the EXEC mask
4926 // clang-format off
4927 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4928 .addReg(Exec);
4929 // clang-format on
4930
4931 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4932
4933 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4934
4935 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4936 InitResultReg, DstReg, PhiReg, TmpExec,
4937 Offset, UseGPRIdxMode, SGPRIdxReg);
4938
4939 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4940 MachineFunction::iterator MBBI(LoopBB);
4941 ++MBBI;
4942 MF->insert(MBBI, LandingPad);
4943 LoopBB->removeSuccessor(RemainderBB);
4944 LandingPad->addSuccessor(RemainderBB);
4945 LoopBB->addSuccessor(LandingPad);
4946 MachineBasicBlock::iterator First = LandingPad->begin();
4947 // clang-format off
4948 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4949 .addReg(SaveExec);
4950 // clang-format on
4951
4952 return InsPt;
4953}
4954
4955// Returns subreg index, offset
4956static std::pair<unsigned, int>
4957 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4958 const TargetRegisterClass *SuperRC, unsigned VecReg,
4959 int Offset) {
4960 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4961
4962 // Skip out of bounds offsets, or else we would end up using an undefined
4963 // register.
4964 if (Offset >= NumElts || Offset < 0)
4965 return std::pair(AMDGPU::sub0, Offset);
4966
4967 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4968}
4969
4972 int Offset) {
4973 MachineBasicBlock *MBB = MI.getParent();
4974 const DebugLoc &DL = MI.getDebugLoc();
4976
4977 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4978
4979 assert(Idx->getReg() != AMDGPU::NoRegister);
4980
4981 if (Offset == 0) {
4982 // clang-format off
4983 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4984 .add(*Idx);
4985 // clang-format on
4986 } else {
4987 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4988 .add(*Idx)
4989 .addImm(Offset);
4990 }
4991}
4992
4995 int Offset) {
4996 MachineBasicBlock *MBB = MI.getParent();
4997 const DebugLoc &DL = MI.getDebugLoc();
4999
5000 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5001
5002 if (Offset == 0)
5003 return Idx->getReg();
5004
5005 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5006 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5007 .add(*Idx)
5008 .addImm(Offset);
5009 return Tmp;
5010}
5011
5012 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5013 MachineBasicBlock &MBB,
5014 const GCNSubtarget &ST) {
5015 const SIInstrInfo *TII = ST.getInstrInfo();
5016 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5019
5020 Register Dst = MI.getOperand(0).getReg();
5021 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5022 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5023 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5024
5025 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5026 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5027
5028 unsigned SubReg;
5029 std::tie(SubReg, Offset) =
5030 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5031
5032 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5033
5034 // Check for a SGPR index.
5035 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5036 MachineBasicBlock::iterator I(&MI);
5037 const DebugLoc &DL = MI.getDebugLoc();
5038
5039 if (UseGPRIdxMode) {
5040 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5041 // to avoid interfering with other uses, so probably requires a new
5042 // optimization pass.
5044
5045 const MCInstrDesc &GPRIDXDesc =
5046 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5047 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5048 .addReg(SrcReg)
5049 .addReg(Idx)
5050 .addImm(SubReg);
5051 } else {
5053
5054 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5055 .addReg(SrcReg, 0, SubReg)
5056 .addReg(SrcReg, RegState::Implicit);
5057 }
5058
5059 MI.eraseFromParent();
5060
5061 return &MBB;
5062 }
5063
5064 // Control flow needs to be inserted if indexing with a VGPR.
5065 const DebugLoc &DL = MI.getDebugLoc();
5066 MachineBasicBlock::iterator I(&MI);
5067
5068 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5069 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5070
5071 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5072
5073 Register SGPRIdxReg;
5074 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5075 UseGPRIdxMode, SGPRIdxReg);
5076
5077 MachineBasicBlock *LoopBB = InsPt->getParent();
5078
5079 if (UseGPRIdxMode) {
5080 const MCInstrDesc &GPRIDXDesc =
5081 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5082
5083 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5084 .addReg(SrcReg)
5085 .addReg(SGPRIdxReg)
5086 .addImm(SubReg);
5087 } else {
5088 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5089 .addReg(SrcReg, 0, SubReg)
5090 .addReg(SrcReg, RegState::Implicit);
5091 }
5092
5093 MI.eraseFromParent();
5094
5095 return LoopBB;
5096}
5097
5098 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5099 MachineBasicBlock &MBB,
5100 const GCNSubtarget &ST) {
5101 const SIInstrInfo *TII = ST.getInstrInfo();
5102 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5105
5106 Register Dst = MI.getOperand(0).getReg();
5107 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5108 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5109 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5110 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5111 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5112 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5113
5114 // This can be an immediate, but will be folded later.
5115 assert(Val->getReg());
5116
5117 unsigned SubReg;
5118 std::tie(SubReg, Offset) =
5119 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5120 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5121
5122 if (Idx->getReg() == AMDGPU::NoRegister) {
5123 MachineBasicBlock::iterator I(&MI);
5124 const DebugLoc &DL = MI.getDebugLoc();
5125
5126 assert(Offset == 0);
5127
5128 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5129 .add(*SrcVec)
5130 .add(*Val)
5131 .addImm(SubReg);
5132
5133 MI.eraseFromParent();
5134 return &MBB;
5135 }
5136
5137 // Check for a SGPR index.
5138 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5139 MachineBasicBlock::iterator I(&MI);
5140 const DebugLoc &DL = MI.getDebugLoc();
5141
5142 if (UseGPRIdxMode) {
5144
5145 const MCInstrDesc &GPRIDXDesc =
5146 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5147 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5148 .addReg(SrcVec->getReg())
5149 .add(*Val)
5150 .addReg(Idx)
5151 .addImm(SubReg);
5152 } else {
5154
5155 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5156 TRI.getRegSizeInBits(*VecRC), 32, false);
5157 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5158 .addReg(SrcVec->getReg())
5159 .add(*Val)
5160 .addImm(SubReg);
5161 }
5162 MI.eraseFromParent();
5163 return &MBB;
5164 }
5165
5166 // Control flow needs to be inserted if indexing with a VGPR.
5167 if (Val->isReg())
5168 MRI.clearKillFlags(Val->getReg());
5169
5170 const DebugLoc &DL = MI.getDebugLoc();
5171
5172 Register PhiReg = MRI.createVirtualRegister(VecRC);
5173
5174 Register SGPRIdxReg;
5175 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5176 UseGPRIdxMode, SGPRIdxReg);
5177 MachineBasicBlock *LoopBB = InsPt->getParent();
5178
5179 if (UseGPRIdxMode) {
5180 const MCInstrDesc &GPRIDXDesc =
5181 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5182
5183 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5184 .addReg(PhiReg)
5185 .add(*Val)
5186 .addReg(SGPRIdxReg)
5187 .addImm(SubReg);
5188 } else {
5189 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5190 TRI.getRegSizeInBits(*VecRC), 32, false);
5191 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5192 .addReg(PhiReg)
5193 .add(*Val)
5194 .addImm(SubReg);
5195 }
5196
5197 MI.eraseFromParent();
5198 return LoopBB;
5199}
5200
5201 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5202 switch (Opc) {
5203 case AMDGPU::S_MIN_U32:
5204 return std::numeric_limits<uint32_t>::max();
5205 case AMDGPU::S_MIN_I32:
5206 return std::numeric_limits<int32_t>::max();
5207 case AMDGPU::S_MAX_U32:
5208 return std::numeric_limits<uint32_t>::min();
5209 case AMDGPU::S_MAX_I32:
5210 return std::numeric_limits<int32_t>::min();
5211 case AMDGPU::S_ADD_I32:
5212 case AMDGPU::S_SUB_I32:
5213 case AMDGPU::S_OR_B32:
5214 case AMDGPU::S_XOR_B32:
5215 return std::numeric_limits<uint32_t>::min();
5216 case AMDGPU::S_AND_B32:
5217 return std::numeric_limits<uint32_t>::max();
5218 default:
5219 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5220 }
5221}
5222
5223 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5224 MachineBasicBlock &BB,
5225 const GCNSubtarget &ST,
5226 unsigned Opc) {
5227 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5228 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5229 const DebugLoc &DL = MI.getDebugLoc();
5230 const SIInstrInfo *TII = ST.getInstrInfo();
5231
5232 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5233 Register SrcReg = MI.getOperand(1).getReg();
5234 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5235 Register DstReg = MI.getOperand(0).getReg();
5236 MachineBasicBlock *RetBB = nullptr;
5237 if (isSGPR) {
5238 switch (Opc) {
5239 case AMDGPU::S_MIN_U32:
5240 case AMDGPU::S_MIN_I32:
5241 case AMDGPU::S_MAX_U32:
5242 case AMDGPU::S_MAX_I32:
5243 case AMDGPU::S_AND_B32:
5244 case AMDGPU::S_OR_B32: {
5245 // Idempotent operations.
5246 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5247 RetBB = &BB;
5248 break;
5249 }
5250 case AMDGPU::S_XOR_B32:
5251 case AMDGPU::S_ADD_I32:
5252 case AMDGPU::S_SUB_I32: {
5253 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5254 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5255 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5256 Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5257
5258 bool IsWave32 = ST.isWave32();
5259 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5260 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5261 unsigned CountReg =
5262 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5263
5264 auto Exec =
5265 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5266
5267 auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5268 .addReg(Exec->getOperand(0).getReg());
5269
5270 switch (Opc) {
5271 case AMDGPU::S_XOR_B32: {
5272 // The result of an XOR reduction of a uniform value
5273 // depends on the parity of the number of active lanes.
5274 // For even parity, the result will be 0; for odd
5275 // parity, the result will be the same as the input value.
5276 Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5277
5278 auto ParityReg =
5279 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5280 .addReg(NewAccumulator->getOperand(0).getReg())
5281 .addImm(1);
5282 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5283 .addReg(SrcReg)
5284 .addReg(ParityReg->getOperand(0).getReg());
5285 break;
5286 }
5287 case AMDGPU::S_SUB_I32: {
5288 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5289
5290 // Take the negation of the source operand.
5291 auto InvertedValReg =
5292 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5293 .addImm(-1)
5294 .addReg(SrcReg);
5295 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5296 .addReg(InvertedValReg->getOperand(0).getReg())
5297 .addReg(NewAccumulator->getOperand(0).getReg());
5298 break;
5299 }
5300 case AMDGPU::S_ADD_I32: {
5301 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5302 .addReg(SrcReg)
5303 .addReg(NewAccumulator->getOperand(0).getReg());
5304 break;
5305 }
5306 }
5307 RetBB = &BB;
5308 }
5309 }
5310 } else {
5311 // TODO: Implement the DPP strategy and switch based on the immediate
5312 // strategy operand. For now all cases (default, Iterative and DPP) use
5313 // the iterative approach.
5314
5315 // To reduce a VGPR with the iterative approach we need to visit every
5316 // active lane. Lowering creates a ComputeLoop block that iterates over
5317 // only the active lanes: a copy of EXEC serves as the induction variable,
5318 // and each iteration clears the current lane's bit with bitset0 so that
5319 // the next iteration picks up the next active lane.
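 // Roughly, the loop built below computes:
 //   Accumulator = identity; ActiveBits = EXEC;
 //   do {
 //     Lane = ff1(ActiveBits);
 //     Accumulator = Opc(Accumulator, readlane(Src, Lane));
 //     ActiveBits = bitset0(ActiveBits, Lane);
 //   } while (ActiveBits != 0);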
5320 MachineBasicBlock::iterator I = BB.end();
5321 Register SrcReg = MI.getOperand(1).getReg();
5322
5323 // Create Control flow for loop
5324 // Split MI's Machine Basic block into For loop
5325 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5326
5327 // Create virtual registers required for lowering.
5328 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5329 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5330 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5331 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5332
5333 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5334 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5335 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5336
5337 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5338 Register LaneValueReg =
5339 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5340
5341 bool IsWave32 = ST.isWave32();
5342 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5343 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5344
5345 // Create the initial values of the induction variable (from EXEC) and the
5346 // accumulator, then branch to the newly created ComputeLoop block.
5347 uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5348 auto TmpSReg =
5349 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5350 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5351 .addImm(InitalValue);
5352 // clang-format off
5353 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5354 .addMBB(ComputeLoop);
5355 // clang-format on
5356
5357 // Start constructing ComputeLoop
5358 I = ComputeLoop->end();
5359 auto Accumulator =
5360 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5361 .addReg(InitalValReg)
5362 .addMBB(&BB);
5363 auto ActiveBits =
5364 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5365 .addReg(TmpSReg->getOperand(0).getReg())
5366 .addMBB(&BB);
5367
5368 // Perform the computations
5369 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5370 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5371 .addReg(ActiveBits->getOperand(0).getReg());
5372 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5373 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5374 .addReg(SrcReg)
5375 .addReg(FF1->getOperand(0).getReg());
5376 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5377 .addReg(Accumulator->getOperand(0).getReg())
5378 .addReg(LaneValue->getOperand(0).getReg());
5379
5380 // Manipulate the iterator to get the next active lane
5381 unsigned BITSETOpc =
5382 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5383 auto NewActiveBits =
5384 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5385 .addReg(FF1->getOperand(0).getReg())
5386 .addReg(ActiveBits->getOperand(0).getReg());
5387
5388 // Add phi nodes
5389 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5390 .addMBB(ComputeLoop);
5391 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5392 .addMBB(ComputeLoop);
5393
5394 // Creating branching
5395 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5396 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5397 .addReg(NewActiveBits->getOperand(0).getReg())
5398 .addImm(0);
5399 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5400 .addMBB(ComputeLoop);
5401
5402 RetBB = ComputeEnd;
5403 }
5404 MI.eraseFromParent();
5405 return RetBB;
5406}
5407
5408MachineBasicBlock *
5409SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5410 MachineBasicBlock *BB) const {
5411
5412 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5413 MachineFunction *MF = BB->getParent();
5414 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5415
5416 switch (MI.getOpcode()) {
5417 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5418 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5419 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5420 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5421 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5422 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5423 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5424 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5425 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5426 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5427 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5428 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5429 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5430 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5431 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5432 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5433 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5434 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5435 case AMDGPU::S_UADDO_PSEUDO:
5436 case AMDGPU::S_USUBO_PSEUDO: {
5437 const DebugLoc &DL = MI.getDebugLoc();
5438 MachineOperand &Dest0 = MI.getOperand(0);
5439 MachineOperand &Dest1 = MI.getOperand(1);
5440 MachineOperand &Src0 = MI.getOperand(2);
5441 MachineOperand &Src1 = MI.getOperand(3);
5442
5443 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5444 ? AMDGPU::S_ADD_I32
5445 : AMDGPU::S_SUB_I32;
5446 // clang-format off
5447 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5448 .add(Src0)
5449 .add(Src1);
5450 // clang-format on
5451
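 // The scalar add/sub above sets SCC; S_CSELECT_B64 below materializes it as
 // a 0/1 overflow value in Dest1.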
5452 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5453 .addImm(1)
5454 .addImm(0);
5455
5456 MI.eraseFromParent();
5457 return BB;
5458 }
5459 case AMDGPU::S_ADD_U64_PSEUDO:
5460 case AMDGPU::S_SUB_U64_PSEUDO: {
5461 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5462 // For GFX12, we emit s_add_u64 and s_sub_u64.
5463 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5464 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5465 const DebugLoc &DL = MI.getDebugLoc();
5466 MachineOperand &Dest = MI.getOperand(0);
5467 MachineOperand &Src0 = MI.getOperand(1);
5468 MachineOperand &Src1 = MI.getOperand(2);
5469 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5470 if (Subtarget->hasScalarAddSub64()) {
5471 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5472 // clang-format off
5473 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5474 .add(Src0)
5475 .add(Src1);
5476 // clang-format on
5477 } else {
5478 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5479 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5480
5481 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5482 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5483
5484 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5485 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5486 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5487 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5488
5489 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5490 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5491 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5492 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5493
5494 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5495 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
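 // The low half sets SCC (the carry/borrow out), which the high half then
 // consumes through S_ADDC_U32/S_SUBB_U32.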
5496 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5497 .add(Src0Sub0)
5498 .add(Src1Sub0);
5499 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5500 .add(Src0Sub1)
5501 .add(Src1Sub1);
5502 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5503 .addReg(DestSub0)
5504 .addImm(AMDGPU::sub0)
5505 .addReg(DestSub1)
5506 .addImm(AMDGPU::sub1);
5507 }
5508 MI.eraseFromParent();
5509 return BB;
5510 }
5511 case AMDGPU::V_ADD_U64_PSEUDO:
5512 case AMDGPU::V_SUB_U64_PSEUDO: {
5513 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5514 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5515 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5516 const DebugLoc &DL = MI.getDebugLoc();
5517
5518 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5519
5520 MachineOperand &Dest = MI.getOperand(0);
5521 MachineOperand &Src0 = MI.getOperand(1);
5522 MachineOperand &Src1 = MI.getOperand(2);
5523
5524 if (ST.hasAddSubU64Insts()) {
5525 auto I = BuildMI(*BB, MI, DL,
5526 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5527 : AMDGPU::V_SUB_U64_e64),
5528 Dest.getReg())
5529 .add(Src0)
5530 .add(Src1)
5531 .addImm(0); // clamp
5532 TII->legalizeOperands(*I);
5533 MI.eraseFromParent();
5534 return BB;
5535 }
5536
5537 if (IsAdd && ST.hasLshlAddU64Inst()) {
5538 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5539 Dest.getReg())
5540 .add(Src0)
5541 .addImm(0)
5542 .add(Src1);
5543 TII->legalizeOperands(*Add);
5544 MI.eraseFromParent();
5545 return BB;
5546 }
5547
5548 const auto *CarryRC = TRI->getWaveMaskRegClass();
5549
5550 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5551 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5552
5553 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5554 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5555
5556 const TargetRegisterClass *Src0RC = Src0.isReg()
5557 ? MRI.getRegClass(Src0.getReg())
5558 : &AMDGPU::VReg_64RegClass;
5559 const TargetRegisterClass *Src1RC = Src1.isReg()
5560 ? MRI.getRegClass(Src1.getReg())
5561 : &AMDGPU::VReg_64RegClass;
5562
5563 const TargetRegisterClass *Src0SubRC =
5564 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5565 const TargetRegisterClass *Src1SubRC =
5566 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5567
5568 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5569 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5570 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5571 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5572
5573 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5574 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5575 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5576 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5577
5578 unsigned LoOpc =
5579 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5580 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5581 .addReg(CarryReg, RegState::Define)
5582 .add(SrcReg0Sub0)
5583 .add(SrcReg1Sub0)
5584 .addImm(0); // clamp bit
5585
5586 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5587 MachineInstr *HiHalf =
5588 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5589 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5590 .add(SrcReg0Sub1)
5591 .add(SrcReg1Sub1)
5592 .addReg(CarryReg, RegState::Kill)
5593 .addImm(0); // clamp bit
5594
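 // Recombine the two halves; the carry produced by the low half has already
 // been consumed (and killed) by the high half above.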
5595 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5596 .addReg(DestSub0)
5597 .addImm(AMDGPU::sub0)
5598 .addReg(DestSub1)
5599 .addImm(AMDGPU::sub1);
5600 TII->legalizeOperands(*LoHalf);
5601 TII->legalizeOperands(*HiHalf);
5602 MI.eraseFromParent();
5603 return BB;
5604 }
5605 case AMDGPU::S_ADD_CO_PSEUDO:
5606 case AMDGPU::S_SUB_CO_PSEUDO: {
5607 // This pseudo can only be selected from a uniform add/subcarry node, so
5608 // all of its VGPR operands are assumed to be splat vectors and can be
5609 // read with V_READFIRSTLANE_B32.
5610 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5611 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5612 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5613 MachineBasicBlock::iterator MII = MI;
5614 const DebugLoc &DL = MI.getDebugLoc();
5615 MachineOperand &Dest = MI.getOperand(0);
5616 MachineOperand &CarryDest = MI.getOperand(1);
5617 MachineOperand &Src0 = MI.getOperand(2);
5618 MachineOperand &Src1 = MI.getOperand(3);
5619 MachineOperand &Src2 = MI.getOperand(4);
5620 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5621 ? AMDGPU::S_ADDC_U32
5622 : AMDGPU::S_SUBB_U32;
5623 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5624 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5625 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5626 .addReg(Src0.getReg());
5627 Src0.setReg(RegOp0);
5628 }
5629 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5630 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5631 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5632 .addReg(Src1.getReg());
5633 Src1.setReg(RegOp1);
5634 }
5635 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5636 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5637 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5638 .addReg(Src2.getReg());
5639 Src2.setReg(RegOp2);
5640 }
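 // Any VGPR operands are uniform by construction (see the comment above), so
 // V_READFIRSTLANE_B32 moves them into SGPRs without changing their value.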
5641
5642 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5643 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5644 assert(WaveSize == 64 || WaveSize == 32);
5645
5646 if (WaveSize == 64) {
5647 if (ST.hasScalarCompareEq64()) {
5648 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5649 .addReg(Src2.getReg())
5650 .addImm(0);
5651 } else {
5652 const TargetRegisterClass *SubRC =
5653 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5654 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5655 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5656 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5657 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5658 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5659
5660 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5661 .add(Src2Sub0)
5662 .add(Src2Sub1);
5663
5664 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5665 .addReg(Src2_32, RegState::Kill)
5666 .addImm(0);
5667 }
5668 } else {
5669 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5670 .addReg(Src2.getReg())
5671 .addImm(0);
5672 }
5673
5674 // clang-format off
5675 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5676 .add(Src0)
5677 .add(Src1);
5678 // clang-format on
5679
5680 unsigned SelOpc =
5681 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5682
5683 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5684 .addImm(-1)
5685 .addImm(0);
5686
5687 MI.eraseFromParent();
5688 return BB;
5689 }
5690 case AMDGPU::SI_INIT_M0: {
5691 MachineOperand &M0Init = MI.getOperand(0);
5692 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5693 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5694 AMDGPU::M0)
5695 .add(M0Init);
5696 MI.eraseFromParent();
5697 return BB;
5698 }
5699 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5700 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
5701 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5702 TII->get(AMDGPU::S_CMP_EQ_U32))
5703 .addImm(0)
5704 .addImm(0);
5705 return BB;
5706 }
5707 case AMDGPU::GET_GROUPSTATICSIZE: {
5708 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5709 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5710 DebugLoc DL = MI.getDebugLoc();
5711 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5712 .add(MI.getOperand(0))
5713 .addImm(MFI->getLDSSize());
5714 MI.eraseFromParent();
5715 return BB;
5716 }
5717 case AMDGPU::GET_SHADERCYCLESHILO: {
5719 MachineRegisterInfo &MRI = MF->getRegInfo();
5720 const DebugLoc &DL = MI.getDebugLoc();
5721 // The algorithm is:
5722 //
5723 // hi1 = getreg(SHADER_CYCLES_HI)
5724 // lo1 = getreg(SHADER_CYCLES_LO)
5725 // hi2 = getreg(SHADER_CYCLES_HI)
5726 //
5727 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5728 // Otherwise there was overflow and the result is hi2:0. In both cases the
5729 // result should represent the actual time at some point during the sequence
5730 // of three getregs.
5731 using namespace AMDGPU::Hwreg;
5732 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5733 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5734 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5735 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5736 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5737 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5738 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5739 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5740 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5741 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5742 .addReg(RegHi1)
5743 .addReg(RegHi2);
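 // SCC is now (hi1 == hi2): select lo1 if the high half did not change
 // between the two reads, and 0 otherwise.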
5744 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5745 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5746 .addReg(RegLo1)
5747 .addImm(0);
5748 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5749 .add(MI.getOperand(0))
5750 .addReg(RegLo)
5751 .addImm(AMDGPU::sub0)
5752 .addReg(RegHi2)
5753 .addImm(AMDGPU::sub1);
5754 MI.eraseFromParent();
5755 return BB;
5756 }
5757 case AMDGPU::SI_INDIRECT_SRC_V1:
5758 case AMDGPU::SI_INDIRECT_SRC_V2:
5759 case AMDGPU::SI_INDIRECT_SRC_V4:
5760 case AMDGPU::SI_INDIRECT_SRC_V8:
5761 case AMDGPU::SI_INDIRECT_SRC_V9:
5762 case AMDGPU::SI_INDIRECT_SRC_V10:
5763 case AMDGPU::SI_INDIRECT_SRC_V11:
5764 case AMDGPU::SI_INDIRECT_SRC_V12:
5765 case AMDGPU::SI_INDIRECT_SRC_V16:
5766 case AMDGPU::SI_INDIRECT_SRC_V32:
5767 return emitIndirectSrc(MI, *BB, *getSubtarget());
5768 case AMDGPU::SI_INDIRECT_DST_V1:
5769 case AMDGPU::SI_INDIRECT_DST_V2:
5770 case AMDGPU::SI_INDIRECT_DST_V4:
5771 case AMDGPU::SI_INDIRECT_DST_V8:
5772 case AMDGPU::SI_INDIRECT_DST_V9:
5773 case AMDGPU::SI_INDIRECT_DST_V10:
5774 case AMDGPU::SI_INDIRECT_DST_V11:
5775 case AMDGPU::SI_INDIRECT_DST_V12:
5776 case AMDGPU::SI_INDIRECT_DST_V16:
5777 case AMDGPU::SI_INDIRECT_DST_V32:
5778 return emitIndirectDst(MI, *BB, *getSubtarget());
5779 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5780 case AMDGPU::SI_KILL_I1_PSEUDO:
5781 return splitKillBlock(MI, BB);
5782 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5783 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5784 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5785 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5786
5787 Register Dst = MI.getOperand(0).getReg();
5788 const MachineOperand &Src0 = MI.getOperand(1);
5789 const MachineOperand &Src1 = MI.getOperand(2);
5790 const DebugLoc &DL = MI.getDebugLoc();
5791 Register SrcCond = MI.getOperand(3).getReg();
5792
5793 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5794 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5795 const auto *CondRC = TRI->getWaveMaskRegClass();
5796 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5797
5798 const TargetRegisterClass *Src0RC = Src0.isReg()
5799 ? MRI.getRegClass(Src0.getReg())
5800 : &AMDGPU::VReg_64RegClass;
5801 const TargetRegisterClass *Src1RC = Src1.isReg()
5802 ? MRI.getRegClass(Src1.getReg())
5803 : &AMDGPU::VReg_64RegClass;
5804
5805 const TargetRegisterClass *Src0SubRC =
5806 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5807 const TargetRegisterClass *Src1SubRC =
5808 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5809
5810 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5811 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5812 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5813 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5814
5815 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5816 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5817 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5818 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5819
5820 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5821 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5822 .addImm(0)
5823 .add(Src0Sub0)
5824 .addImm(0)
5825 .add(Src1Sub0)
5826 .addReg(SrcCondCopy);
5827 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5828 .addImm(0)
5829 .add(Src0Sub1)
5830 .addImm(0)
5831 .add(Src1Sub1)
5832 .addReg(SrcCondCopy);
5833
5834 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5835 .addReg(DstLo)
5836 .addImm(AMDGPU::sub0)
5837 .addReg(DstHi)
5838 .addImm(AMDGPU::sub1);
5839 MI.eraseFromParent();
5840 return BB;
5841 }
5842 case AMDGPU::SI_BR_UNDEF: {
5844 const DebugLoc &DL = MI.getDebugLoc();
5845 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5846 .add(MI.getOperand(0));
5847 Br->getOperand(1).setIsUndef(); // read undef SCC
5848 MI.eraseFromParent();
5849 return BB;
5850 }
5851 case AMDGPU::ADJCALLSTACKUP:
5852 case AMDGPU::ADJCALLSTACKDOWN: {
5853 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5854 MachineInstrBuilder MIB(*MF, &MI);
5855 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5856 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5857 return BB;
5858 }
5859 case AMDGPU::SI_CALL_ISEL: {
5861 const DebugLoc &DL = MI.getDebugLoc();
5862
5863 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5864
5865 MachineInstrBuilder MIB;
5866 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5867
5868 for (const MachineOperand &MO : MI.operands())
5869 MIB.add(MO);
5870
5871 MIB.cloneMemRefs(MI);
5872 MI.eraseFromParent();
5873 return BB;
5874 }
5875 case AMDGPU::V_ADD_CO_U32_e32:
5876 case AMDGPU::V_SUB_CO_U32_e32:
5877 case AMDGPU::V_SUBREV_CO_U32_e32: {
5878 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5879 const DebugLoc &DL = MI.getDebugLoc();
5880 unsigned Opc = MI.getOpcode();
5881
5882 bool NeedClampOperand = false;
5883 if (TII->pseudoToMCOpcode(Opc) == -1) {
5884 Opc = AMDGPU::getVOPe64(Opc);
5885 NeedClampOperand = true;
5886 }
5887
5888 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5889 if (TII->isVOP3(*I)) {
5890 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5891 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5892 I.addReg(TRI->getVCC(), RegState::Define);
5893 }
5894 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5895 if (NeedClampOperand)
5896 I.addImm(0); // clamp bit for e64 encoding
5897
5898 TII->legalizeOperands(*I);
5899
5900 MI.eraseFromParent();
5901 return BB;
5902 }
5903 case AMDGPU::V_ADDC_U32_e32:
5904 case AMDGPU::V_SUBB_U32_e32:
5905 case AMDGPU::V_SUBBREV_U32_e32:
5906 // These instructions have an implicit use of vcc which counts towards the
5907 // constant bus limit.
5908 TII->legalizeOperands(MI);
5909 return BB;
5910 case AMDGPU::DS_GWS_INIT:
5911 case AMDGPU::DS_GWS_SEMA_BR:
5912 case AMDGPU::DS_GWS_BARRIER:
5913 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5914 [[fallthrough]];
5915 case AMDGPU::DS_GWS_SEMA_V:
5916 case AMDGPU::DS_GWS_SEMA_P:
5917 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5918 // A s_waitcnt 0 is required to be the instruction immediately following.
5919 if (getSubtarget()->hasGWSAutoReplay()) {
5920 bundleInstWithWaitcnt(MI);
5921 return BB;
5922 }
5923
5924 return emitGWSMemViolTestLoop(MI, BB);
5925 case AMDGPU::S_SETREG_B32: {
5926 // Try to optimize cases that only set the denormal mode or rounding mode.
5927 //
5928 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5929 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5930 // instead.
5931 //
5932 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5933 // allow a no-side-effect instruction in the output of a side-effecting
5934 // pattern.
5935 auto [ID, Offset, Width] =
5936 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5937 if (ID != AMDGPU::Hwreg::ID_MODE)
5938 return BB;
5939
5940 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5941 const unsigned SetMask = WidthMask << Offset;
5942
5943 if (getSubtarget()->hasDenormModeInst()) {
5944 unsigned SetDenormOp = 0;
5945 unsigned SetRoundOp = 0;
5946
5947 // The dedicated instructions can only set the whole denorm or round mode
5948 // at once, not a subset of bits in either.
5949 if (SetMask ==
5950 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5951 // If this fully sets both the round and denorm mode, emit the two
5952 // dedicated instructions for these.
5953 SetRoundOp = AMDGPU::S_ROUND_MODE;
5954 SetDenormOp = AMDGPU::S_DENORM_MODE;
5955 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5956 SetRoundOp = AMDGPU::S_ROUND_MODE;
5957 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5958 SetDenormOp = AMDGPU::S_DENORM_MODE;
5959 }
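 // At this point SetRoundOp/SetDenormOp record which dedicated instructions,
 // if any, can replace the generic s_setreg_b32.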
5960
5961 if (SetRoundOp || SetDenormOp) {
5962 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5963 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5964 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5965 unsigned ImmVal = Def->getOperand(1).getImm();
5966 if (SetRoundOp) {
5967 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5968 .addImm(ImmVal & 0xf);
5969
5970 // If we also have the denorm mode, get just the denorm mode bits.
5971 ImmVal >>= 4;
5972 }
5973
5974 if (SetDenormOp) {
5975 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5976 .addImm(ImmVal & 0xf);
5977 }
5978
5979 MI.eraseFromParent();
5980 return BB;
5981 }
5982 }
5983 }
5984
5985 // If only FP bits are touched, use the no-side-effects pseudo.
5986 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5987 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5988 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5989
5990 return BB;
5991 }
5992 case AMDGPU::S_INVERSE_BALLOT_U32:
5993 case AMDGPU::S_INVERSE_BALLOT_U64:
5994 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5995 // necessary. After that they are equivalent to a COPY.
5996 MI.setDesc(TII->get(AMDGPU::COPY));
5997 return BB;
5998 case AMDGPU::ENDPGM_TRAP: {
5999 const DebugLoc &DL = MI.getDebugLoc();
6000 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6001 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6002 MI.addOperand(MachineOperand::CreateImm(0));
6003 return BB;
6004 }
6005
6006 // We need a block split to make the real endpgm a terminator. We also don't
6007 // want to break phis in successor blocks, so we can't just delete to the
6008 // end of the block.
6009
6010 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6011 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6012 MF->push_back(TrapBB);
6013 // clang-format off
6014 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6015 .addImm(0);
6016 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6017 .addMBB(TrapBB);
6018 // clang-format on
6019
6020 BB->addSuccessor(TrapBB);
6021 MI.eraseFromParent();
6022 return SplitBB;
6023 }
6024 case AMDGPU::SIMULATED_TRAP: {
6025 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6026 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6027 MachineBasicBlock *SplitBB =
6028 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6029 MI.eraseFromParent();
6030 return SplitBB;
6031 }
6032 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6034
6035 // During ISel, it's difficult to propagate the original EXEC mask to use as
6036 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6037 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6038 Register OriginalExec = Setup->getOperand(0).getReg();
6039 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6040 MF->getRegInfo().clearKillFlags(OriginalExec);
6041 MI.getOperand(0).setReg(OriginalExec);
6042 return BB;
6043 }
6044 default:
6045 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6046 if (!MI.mayStore())
6047 AddMemOpInit(MI);
6048 return BB;
6049 }
6050 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6051 }
6052}
6053
6054bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6055 // This currently forces unfolding various combinations of fsub into fma with
6056 // free fneg'd operands. As long as we have fast FMA (controlled by
6057 // isFMAFasterThanFMulAndFAdd), we should perform these.
6058
6059 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6060 // most of these combines appear to be cycle neutral but save on instruction
6061 // count / code size.
6062 return true;
6063}
6064
6065bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6066
6067EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6068 EVT VT) const {
6069 if (!VT.isVector()) {
6070 return MVT::i1;
6071 }
6072 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6073}
6074
6075MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6076 // TODO: Should i16 be used always if legal? For now it would force VALU
6077 // shifts.
6078 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6079}
6080
6081LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6082 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6083 ? Ty.changeElementSize(16)
6084 : Ty.changeElementSize(32);
6085}
6086
6087// Answering this is somewhat tricky and depends on the specific device, since
6088// different devices have different rates for fma and for f64 operations.
6089//
6090// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6091// regardless of which device (although the number of cycles differs between
6092// devices), so it is always profitable for f64.
6093//
6094// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6095// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6096// which we can always do even without fused FP ops since it returns the same
6097// result as the separate operations and since it is always full
6098// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6099// however does not support denormals, so we do report fma as faster if we have
6100// a fast fma device and require denormals.
6101//
6102bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6103 EVT VT) const {
6104 VT = VT.getScalarType();
6105
6106 switch (VT.getSimpleVT().SimpleTy) {
6107 case MVT::f32: {
6108 // If mad is not available this depends only on if f32 fma is full rate.
6109 if (!Subtarget->hasMadMacF32Insts())
6110 return Subtarget->hasFastFMAF32();
6111
6112 // Otherwise f32 mad is always full rate and returns the same result as
6113 // the separate operations so should be preferred over fma.
6114 // However, mad does not support denormals.
6115 if (!denormalModeIsFlushAllF32(MF))
6116 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6117
6118 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6119 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6120 }
6121 case MVT::f64:
6122 return true;
6123 case MVT::f16:
6124 case MVT::bf16:
6125 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6126 default:
6127 break;
6128 }
6129
6130 return false;
6131}
6132
6133bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6134 LLT Ty) const {
6135 switch (Ty.getScalarSizeInBits()) {
6136 case 16:
6137 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6138 case 32:
6139 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6140 case 64:
6141 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6142 default:
6143 break;
6144 }
6145
6146 return false;
6147}
6148
6149bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
6150 if (!Ty.isScalar())
6151 return false;
6152
6153 if (Ty.getScalarSizeInBits() == 16)
6154 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6155 if (Ty.getScalarSizeInBits() == 32)
6156 return Subtarget->hasMadMacF32Insts() &&
6157 denormalModeIsFlushAllF32(*MI.getMF());
6158
6159 return false;
6160}
6161
6162bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6163 const SDNode *N) const {
6164 // TODO: Check future ftz flag
6165 // v_mad_f32/v_mac_f32 do not support denormals.
6166 EVT VT = N->getValueType(0);
6167 if (VT == MVT::f32)
6168 return Subtarget->hasMadMacF32Insts() &&
6169 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6170 if (VT == MVT::f16) {
6171 return Subtarget->hasMadF16() &&
6172 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6173 }
6174
6175 return false;
6176}
6177
6178//===----------------------------------------------------------------------===//
6179// Custom DAG Lowering Operations
6180//===----------------------------------------------------------------------===//
6181
6182// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6183// wider vector type is legal.
6184SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6185 SelectionDAG &DAG) const {
6186 unsigned Opc = Op.getOpcode();
6187 EVT VT = Op.getValueType();
6188 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6189 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6190 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6191 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6192
6193 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6194
6195 SDLoc SL(Op);
6196 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6197 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6198
6199 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6200}
6201
6202// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6203// wider vector type is legal.
6204SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6205 SelectionDAG &DAG) const {
6206 unsigned Opc = Op.getOpcode();
6207 EVT VT = Op.getValueType();
6208 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6209 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6210 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6211 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6212 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6213 VT == MVT::v32bf16);
6214
6215 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6216 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6217
6218 SDLoc SL(Op);
6219
6220 SDValue OpLo =
6221 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6222 SDValue OpHi =
6223 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6224
6225 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6226}
6227
6228SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6229 SelectionDAG &DAG) const {
6230 unsigned Opc = Op.getOpcode();
6231 EVT VT = Op.getValueType();
6232 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6233 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6234 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6235 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6236 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6237 VT == MVT::v32bf16);
6238
6239 SDValue Op0 = Op.getOperand(0);
6240 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6241 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6242 : std::pair(Op0, Op0);
6243
6244 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6245 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6246
6247 SDLoc SL(Op);
6248 auto ResVT = DAG.GetSplitDestVTs(VT);
6249
6250 SDValue OpLo =
6251 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6252 SDValue OpHi =
6253 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6254
6255 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6256}
6257
6258SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6259 switch (Op.getOpcode()) {
6260 default:
6261 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6262 case ISD::BRCOND:
6263 return LowerBRCOND(Op, DAG);
6264 case ISD::RETURNADDR:
6265 return LowerRETURNADDR(Op, DAG);
6266 case ISD::LOAD: {
6267 SDValue Result = LowerLOAD(Op, DAG);
6268 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6269 "Load should return a value and a chain");
6270 return Result;
6271 }
6272 case ISD::FSQRT: {
6273 EVT VT = Op.getValueType();
6274 if (VT == MVT::f32)
6275 return lowerFSQRTF32(Op, DAG);
6276 if (VT == MVT::f64)
6277 return lowerFSQRTF64(Op, DAG);
6278 return SDValue();
6279 }
6280 case ISD::FSIN:
6281 case ISD::FCOS:
6282 return LowerTrig(Op, DAG);
6283 case ISD::SELECT:
6284 return LowerSELECT(Op, DAG);
6285 case ISD::FDIV:
6286 return LowerFDIV(Op, DAG);
6287 case ISD::FFREXP:
6288 return LowerFFREXP(Op, DAG);
6289 case ISD::ATOMIC_CMP_SWAP:
6290 return LowerATOMIC_CMP_SWAP(Op, DAG);
6291 case ISD::STORE:
6292 return LowerSTORE(Op, DAG);
6293 case ISD::GlobalAddress: {
6294 MachineFunction &MF = DAG.getMachineFunction();
6295 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6296 return LowerGlobalAddress(MFI, Op, DAG);
6297 }
6298 case ISD::INTRINSIC_WO_CHAIN:
6299 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6300 case ISD::INTRINSIC_W_CHAIN:
6301 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6302 case ISD::INTRINSIC_VOID:
6303 return LowerINTRINSIC_VOID(Op, DAG);
6304 case ISD::ADDRSPACECAST:
6305 return lowerADDRSPACECAST(Op, DAG);
6306 case ISD::INSERT_SUBVECTOR:
6307 return lowerINSERT_SUBVECTOR(Op, DAG);
6308 case ISD::INSERT_VECTOR_ELT:
6309 return lowerINSERT_VECTOR_ELT(Op, DAG);
6310 case ISD::EXTRACT_VECTOR_ELT:
6311 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6312 case ISD::VECTOR_SHUFFLE:
6313 return lowerVECTOR_SHUFFLE(Op, DAG);
6314 case ISD::SCALAR_TO_VECTOR:
6315 return lowerSCALAR_TO_VECTOR(Op, DAG);
6316 case ISD::BUILD_VECTOR:
6317 return lowerBUILD_VECTOR(Op, DAG);
6318 case ISD::FP_ROUND:
6319 case ISD::STRICT_FP_ROUND:
6320 return lowerFP_ROUND(Op, DAG);
6321 case ISD::TRAP:
6322 return lowerTRAP(Op, DAG);
6323 case ISD::DEBUGTRAP:
6324 return lowerDEBUGTRAP(Op, DAG);
6325 case ISD::ABS:
6326 case ISD::FABS:
6327 case ISD::FNEG:
6328 case ISD::FCANONICALIZE:
6329 case ISD::BSWAP:
6330 return splitUnaryVectorOp(Op, DAG);
6331 case ISD::FMINNUM:
6332 case ISD::FMAXNUM:
6333 return lowerFMINNUM_FMAXNUM(Op, DAG);
6334 case ISD::FMINIMUMNUM:
6335 case ISD::FMAXIMUMNUM:
6336 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6337 case ISD::FMINIMUM:
6338 case ISD::FMAXIMUM:
6339 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6340 case ISD::FLDEXP:
6341 case ISD::STRICT_FLDEXP:
6342 return lowerFLDEXP(Op, DAG);
6343 case ISD::FMA:
6344 return splitTernaryVectorOp(Op, DAG);
6345 case ISD::FP_TO_SINT:
6346 case ISD::FP_TO_UINT:
6347 return LowerFP_TO_INT(Op, DAG);
6348 case ISD::SHL:
6349 case ISD::SRA:
6350 case ISD::SRL:
6351 case ISD::ADD:
6352 case ISD::SUB:
6353 case ISD::SMIN:
6354 case ISD::SMAX:
6355 case ISD::UMIN:
6356 case ISD::UMAX:
6357 case ISD::FADD:
6358 case ISD::FMUL:
6359 case ISD::FMINNUM_IEEE:
6360 case ISD::FMAXNUM_IEEE:
6361 case ISD::UADDSAT:
6362 case ISD::USUBSAT:
6363 case ISD::SADDSAT:
6364 case ISD::SSUBSAT:
6365 return splitBinaryVectorOp(Op, DAG);
6366 case ISD::FCOPYSIGN:
6367 return lowerFCOPYSIGN(Op, DAG);
6368 case ISD::MUL:
6369 return lowerMUL(Op, DAG);
6370 case ISD::SMULO:
6371 case ISD::UMULO:
6372 return lowerXMULO(Op, DAG);
6373 case ISD::SMUL_LOHI:
6374 case ISD::UMUL_LOHI:
6375 return lowerXMUL_LOHI(Op, DAG);
6376 case ISD::DYNAMIC_STACKALLOC:
6377 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6378 case ISD::STACKSAVE:
6379 return LowerSTACKSAVE(Op, DAG);
6380 case ISD::GET_ROUNDING:
6381 return lowerGET_ROUNDING(Op, DAG);
6382 case ISD::SET_ROUNDING:
6383 return lowerSET_ROUNDING(Op, DAG);
6384 case ISD::PREFETCH:
6385 return lowerPREFETCH(Op, DAG);
6386 case ISD::FP_EXTEND:
6387 case ISD::STRICT_FP_EXTEND:
6388 return lowerFP_EXTEND(Op, DAG);
6389 case ISD::GET_FPENV:
6390 return lowerGET_FPENV(Op, DAG);
6391 case ISD::SET_FPENV:
6392 return lowerSET_FPENV(Op, DAG);
6393 }
6394 return SDValue();
6395}
6396
6397// Used for D16: Casts the result of an instruction into the right vector,
6398// packs values if loads return unpacked values.
6399static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6400 const SDLoc &DL, SelectionDAG &DAG,
6401 bool Unpacked) {
6402 if (!LoadVT.isVector())
6403 return Result;
6404
6405 // Cast back to the original packed type or to a larger type that is a
6406 // multiple of 32 bits for D16. Widening the return type is required for
6407 // legalization.
6408 EVT FittingLoadVT = LoadVT;
6409 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6410 FittingLoadVT =
6411 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6412 LoadVT.getVectorNumElements() + 1);
6413 }
6414
6415 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6416 // Truncate to v2i16/v4i16.
6417 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6418
6419 // Workaround legalizer not scalarizing truncate after vector op
6420 // legalization but not creating intermediate vector trunc.
6421 SmallVector<SDValue, 4> Elts;
6422 DAG.ExtractVectorElements(Result, Elts);
6423 for (SDValue &Elt : Elts)
6424 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6425
6426 // Pad illegal v1i16/v3f16 to v4i16
6427 if ((LoadVT.getVectorNumElements() % 2) == 1)
6428 Elts.push_back(DAG.getPOISON(MVT::i16));
6429
6430 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6431
6432 // Bitcast to original type (v2f16/v4f16).
6433 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6434 }
6435
6436 // Cast back to the original packed type.
6437 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6438}
6439
6440SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6441 SelectionDAG &DAG,
6442 ArrayRef<SDValue> Ops,
6443 bool IsIntrinsic) const {
6444 SDLoc DL(M);
6445
6446 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6447 EVT LoadVT = M->getValueType(0);
6448
6449 EVT EquivLoadVT = LoadVT;
6450 if (LoadVT.isVector()) {
6451 if (Unpacked) {
6452 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6453 LoadVT.getVectorNumElements());
6454 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6455 // Widen v3f16 to legal type
6456 EquivLoadVT =
6457 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6458 LoadVT.getVectorNumElements() + 1);
6459 }
6460 }
6461
6462 // Change from v4f16/v2f16 to EquivLoadVT.
6463 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6464
6465 SDValue Load = DAG.getMemIntrinsicNode(
6466 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6467 M->getMemoryVT(), M->getMemOperand());
6468
6469 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6470
6471 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6472}
6473
6474SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6475 SelectionDAG &DAG,
6476 ArrayRef<SDValue> Ops) const {
6477 SDLoc DL(M);
6478 EVT LoadVT = M->getValueType(0);
6479 EVT EltType = LoadVT.getScalarType();
6480 EVT IntVT = LoadVT.changeTypeToInteger();
6481
6482 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6483
6484 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6485 bool IsTFE = M->getNumValues() == 3;
6486
6487 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6488 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6489 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6490 : AMDGPUISD::BUFFER_LOAD;
6491
6492 if (IsD16) {
6493 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6494 }
6495
6496 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6497 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6498 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6499 IsTFE);
6500
6501 if (isTypeLegal(LoadVT)) {
6502 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6503 M->getMemOperand(), DAG);
6504 }
6505
6506 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6507 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6508 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6509 M->getMemOperand(), DAG);
6510 return DAG.getMergeValues(
6511 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6512 DL);
6513}
6514
6515static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6516 SelectionDAG &DAG) {
6517 EVT VT = N->getValueType(0);
6518 unsigned CondCode = N->getConstantOperandVal(3);
6519 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6520 return DAG.getPOISON(VT);
6521
6522 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6523
6524 SDValue LHS = N->getOperand(1);
6525 SDValue RHS = N->getOperand(2);
6526
6527 SDLoc DL(N);
6528
6529 EVT CmpVT = LHS.getValueType();
6530 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6531 unsigned PromoteOp =
6532 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6533 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6534 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6535 }
6536
6537 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6538
6539 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6540 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6541
6542 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6543 DAG.getCondCode(CCOpcode));
6544 if (VT.bitsEq(CCVT))
6545 return SetCC;
6546 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6547}
6548
6549static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6550 SelectionDAG &DAG) {
6551 EVT VT = N->getValueType(0);
6552
6553 unsigned CondCode = N->getConstantOperandVal(3);
6554 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6555 return DAG.getPOISON(VT);
6556
6557 SDValue Src0 = N->getOperand(1);
6558 SDValue Src1 = N->getOperand(2);
6559 EVT CmpVT = Src0.getValueType();
6560 SDLoc SL(N);
6561
6562 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6563 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6564 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6565 }
6566
6567 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6568 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6569 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6570 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6571 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6572 DAG.getCondCode(CCOpcode));
6573 if (VT.bitsEq(CCVT))
6574 return SetCC;
6575 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6576}
6577
6578static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6579 SelectionDAG &DAG) {
6580 EVT VT = N->getValueType(0);
6581 SDValue Src = N->getOperand(1);
6582 SDLoc SL(N);
6583
6584 if (Src.getOpcode() == ISD::SETCC) {
6585 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6586 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6587 Src.getOperand(1), Src.getOperand(2));
6588 }
6589 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6590 // (ballot 0) -> 0
6591 if (Arg->isZero())
6592 return DAG.getConstant(0, SL, VT);
6593
6594 // (ballot 1) -> EXEC/EXEC_LO
6595 if (Arg->isOne()) {
6596 Register Exec;
6597 if (VT.getScalarSizeInBits() == 32)
6598 Exec = AMDGPU::EXEC_LO;
6599 else if (VT.getScalarSizeInBits() == 64)
6600 Exec = AMDGPU::EXEC;
6601 else
6602 return SDValue();
6603
6604 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6605 }
6606 }
6607
6608 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6609 // ISD::SETNE)
6610 return DAG.getNode(
6611 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6612 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6613}
6614
6615static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6616 SelectionDAG &DAG) {
6617 EVT VT = N->getValueType(0);
6618 unsigned ValSize = VT.getSizeInBits();
6619 unsigned IID = N->getConstantOperandVal(0);
6620 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6621 IID == Intrinsic::amdgcn_permlanex16;
6622 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6623 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6624 SDLoc SL(N);
6625 MVT IntVT = MVT::getIntegerVT(ValSize);
6626 const GCNSubtarget *ST = TLI.getSubtarget();
6627 unsigned SplitSize = 32;
6628 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6629 ST->hasDPALU_DPP() &&
6630 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
6631 SplitSize = 64;
6632
6633 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6634 SDValue Src2, MVT ValT) -> SDValue {
6635 SmallVector<SDValue, 6> Operands;
6636 switch (IID) {
6637 case Intrinsic::amdgcn_permlane16:
6638 case Intrinsic::amdgcn_permlanex16:
6639 case Intrinsic::amdgcn_update_dpp:
6640 Operands.push_back(N->getOperand(6));
6641 Operands.push_back(N->getOperand(5));
6642 Operands.push_back(N->getOperand(4));
6643 [[fallthrough]];
6644 case Intrinsic::amdgcn_writelane:
6645 Operands.push_back(Src2);
6646 [[fallthrough]];
6647 case Intrinsic::amdgcn_readlane:
6648 case Intrinsic::amdgcn_set_inactive:
6649 case Intrinsic::amdgcn_set_inactive_chain_arg:
6650 case Intrinsic::amdgcn_mov_dpp8:
6651 Operands.push_back(Src1);
6652 [[fallthrough]];
6653 case Intrinsic::amdgcn_readfirstlane:
6654 case Intrinsic::amdgcn_permlane64:
6655 Operands.push_back(Src0);
6656 break;
6657 default:
6658 llvm_unreachable("unhandled lane op");
6659 }
6660
6661 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6662 std::reverse(Operands.begin(), Operands.end());
6663
6664 if (SDNode *GL = N->getGluedNode()) {
6665 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6666 GL = GL->getOperand(0).getNode();
6667 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6668 SDValue(GL, 0)));
6669 }
6670
6671 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6672 };
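 // Operands are pushed innermost-first and then reversed, so the final order
 // is IID, Src0, Src1, Src2, the remaining intrinsic operands and, if present,
 // the convergence-control glue.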
6673
6674 SDValue Src0 = N->getOperand(1);
6675 SDValue Src1, Src2;
6676 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6677 IID == Intrinsic::amdgcn_mov_dpp8 ||
6678 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6679 Src1 = N->getOperand(2);
6680 if (IID == Intrinsic::amdgcn_writelane ||
6681 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6682 Src2 = N->getOperand(3);
6683 }
6684
6685 if (ValSize == SplitSize) {
6686 // Already legal
6687 return SDValue();
6688 }
6689
6690 if (ValSize < 32) {
6691 bool IsFloat = VT.isFloatingPoint();
6692 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6693 SL, MVT::i32);
6694
6695 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6696 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6697 SL, MVT::i32);
6698 }
6699
6700 if (IID == Intrinsic::amdgcn_writelane) {
6701 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6702 SL, MVT::i32);
6703 }
6704
6705 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6706 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6707 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6708 }
6709
6710 if (ValSize % SplitSize != 0)
6711 return SDValue();
6712
6713 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6714 EVT VT = N->getValueType(0);
6715 unsigned NE = VT.getVectorNumElements();
6716 EVT EltVT = VT.getVectorElementType();
6717 SmallVector<SDValue, 8> Scalars;
6718 unsigned NumOperands = N->getNumOperands();
6719 SmallVector<SDValue, 4> Operands(NumOperands);
6720 SDNode *GL = N->getGluedNode();
6721
6722 // Only handle convergencectrl_glue.
6723 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6724
6725 for (unsigned i = 0; i != NE; ++i) {
6726 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6727 ++j) {
6728 SDValue Operand = N->getOperand(j);
6729 EVT OperandVT = Operand.getValueType();
6730 if (OperandVT.isVector()) {
6731 // A vector operand; extract a single element.
6732 EVT OperandEltVT = OperandVT.getVectorElementType();
6733 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6734 Operand, DAG.getVectorIdxConstant(i, SL));
6735 } else {
6736 // A scalar operand; just use it as is.
6737 Operands[j] = Operand;
6738 }
6739 }
6740
6741 if (GL)
6742 Operands[NumOperands - 1] =
6743 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6744 SDValue(GL->getOperand(0).getNode(), 0));
6745
6746 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6747 }
6748
6749 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6750 return DAG.getBuildVector(VecVT, SL, Scalars);
6751 };
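 // unrollLaneOp scalarizes a vector lane operation: it extracts each element,
 // re-emits the operation per element (re-attaching the convergence glue),
 // and rebuilds the vector result.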
6752
6753 if (VT.isVector()) {
6754 switch (MVT::SimpleValueType EltTy =
6755 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6756 case MVT::i32:
6757 case MVT::f32:
6758 if (SplitSize == 32) {
6759 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6760 return unrollLaneOp(LaneOp.getNode());
6761 }
6762 [[fallthrough]];
6763 case MVT::i16:
6764 case MVT::f16:
6765 case MVT::bf16: {
6766 unsigned SubVecNumElt =
6767 SplitSize / VT.getVectorElementType().getSizeInBits();
6768 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6769 SmallVector<SDValue, 4> Pieces;
6770 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6771 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6772 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6773 DAG.getConstant(EltIdx, SL, MVT::i32));
6774
6775 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6776 IsPermLane16)
6777 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6778 DAG.getConstant(EltIdx, SL, MVT::i32));
6779
6780 if (IID == Intrinsic::amdgcn_writelane)
6781 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6782 DAG.getConstant(EltIdx, SL, MVT::i32));
6783
6784 Pieces.push_back(
6785 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6786 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6787 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6788 EltIdx += SubVecNumElt;
6789 }
6790 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6791 }
6792 default:
6793 // Handle all other cases by bitcasting to i32 vectors
6794 break;
6795 }
6796 }
6797
6798 MVT VecVT =
6799 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6800 Src0 = DAG.getBitcast(VecVT, Src0);
6801
6802 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6803 Src1 = DAG.getBitcast(VecVT, Src1);
6804
6805 if (IID == Intrinsic::amdgcn_writelane)
6806 Src2 = DAG.getBitcast(VecVT, Src2);
6807
6808 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6809 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6810 return DAG.getBitcast(VT, UnrolledLaneOp);
6811}
6812
6813void SITargetLowering::ReplaceNodeResults(SDNode *N,
6814 SmallVectorImpl<SDValue> &Results,
6815 SelectionDAG &DAG) const {
6816 switch (N->getOpcode()) {
6817 case ISD::INSERT_VECTOR_ELT: {
6818 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6819 Results.push_back(Res);
6820 return;
6821 }
6822 case ISD::EXTRACT_VECTOR_ELT: {
6823 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6824 Results.push_back(Res);
6825 return;
6826 }
6827 case ISD::INTRINSIC_WO_CHAIN: {
6828 unsigned IID = N->getConstantOperandVal(0);
6829 switch (IID) {
6830 case Intrinsic::amdgcn_make_buffer_rsrc:
6831 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6832 return;
6833 case Intrinsic::amdgcn_cvt_pkrtz: {
6834 SDValue Src0 = N->getOperand(1);
6835 SDValue Src1 = N->getOperand(2);
6836 SDLoc SL(N);
6837 SDValue Cvt =
6838 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6839 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6840 return;
6841 }
6842 case Intrinsic::amdgcn_cvt_pknorm_i16:
6843 case Intrinsic::amdgcn_cvt_pknorm_u16:
6844 case Intrinsic::amdgcn_cvt_pk_i16:
6845 case Intrinsic::amdgcn_cvt_pk_u16: {
6846 SDValue Src0 = N->getOperand(1);
6847 SDValue Src1 = N->getOperand(2);
6848 SDLoc SL(N);
6849 unsigned Opcode;
6850
6851 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6852 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6853 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6854 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6855 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6856 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6857 else
6858 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6859
6860 EVT VT = N->getValueType(0);
6861 if (isTypeLegal(VT))
6862 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6863 else {
6864 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6865 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6866 }
6867 return;
6868 }
6869 case Intrinsic::amdgcn_s_buffer_load: {
6870 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6871 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6872 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6873 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6874 // s_buffer_load_i8.
6875 if (!Subtarget->hasScalarSubwordLoads())
6876 return;
6877 SDValue Op = SDValue(N, 0);
6878 SDValue Rsrc = Op.getOperand(1);
6879 SDValue Offset = Op.getOperand(2);
6880 SDValue CachePolicy = Op.getOperand(3);
6881 EVT VT = Op.getValueType();
6882 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6883 SDLoc DL(Op);
6885 const DataLayout &DataLayout = DAG.getDataLayout();
6886 Align Alignment =
6892 VT.getStoreSize(), Alignment);
6893 SDValue LoadVal;
6894 if (!Offset->isDivergent()) {
6895 SDValue Ops[] = {Rsrc, // source register
6896 Offset, CachePolicy};
6897 SDValue BufferLoad =
6899 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6900 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6901 } else {
6902 SDValue Ops[] = {
6903 DAG.getEntryNode(), // Chain
6904 Rsrc, // rsrc
6905 DAG.getConstant(0, DL, MVT::i32), // vindex
6906 {}, // voffset
6907 {}, // soffset
6908 {}, // offset
6909 CachePolicy, // cachepolicy
6910 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6911 };
6912 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6913 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6914 }
6915 Results.push_back(LoadVal);
6916 return;
6917 }
6918 case Intrinsic::amdgcn_dead: {
6919 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6920 Results.push_back(DAG.getPOISON(N->getValueType(I)));
6921 return;
6922 }
6923 }
6924 break;
6925 }
6926 case ISD::INTRINSIC_W_CHAIN: {
6927 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6928 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6929 // FIXME: Hacky
6930 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6931 Results.push_back(Res.getOperand(I));
6932 }
6933 } else {
6934 Results.push_back(Res);
6935 Results.push_back(Res.getValue(1));
6936 }
6937 return;
6938 }
6939
6940 break;
6941 }
6942 case ISD::SELECT: {
6943 SDLoc SL(N);
6944 EVT VT = N->getValueType(0);
6945 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6946 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6947 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6948
6949 EVT SelectVT = NewVT;
6950 if (NewVT.bitsLT(MVT::i32)) {
6951 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6952 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6953 SelectVT = MVT::i32;
6954 }
6955
6956 SDValue NewSelect =
6957 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6958
6959 if (NewVT != SelectVT)
6960 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6961 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6962 return;
6963 }
6964 case ISD::FNEG: {
6965 if (N->getValueType(0) != MVT::v2f16)
6966 break;
6967
6968 SDLoc SL(N);
6969 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6970
6971 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6972 DAG.getConstant(0x80008000, SL, MVT::i32));
6973 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6974 return;
6975 }
6976 case ISD::FABS: {
6977 if (N->getValueType(0) != MVT::v2f16)
6978 break;
6979
6980 SDLoc SL(N);
6981 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6982
6983 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6984 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6985 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6986 return;
6987 }
6988 case ISD::FSQRT: {
6989 if (N->getValueType(0) != MVT::f16)
6990 break;
6991 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6992 break;
6993 }
6994 default:
6995 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6996 break;
6997 }
6998}
6999
7000/// Helper function for LowerBRCOND
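/// Returns the first user of \p Value whose opcode is \p Opcode, or nullptr if
/// no such user exists.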
7001static SDNode *findUser(SDValue Value, unsigned Opcode) {
7002
7003 for (SDUse &U : Value->uses()) {
7004 if (U.get() != Value)
7005 continue;
7006
7007 if (U.getUser()->getOpcode() == Opcode)
7008 return U.getUser();
7009 }
7010 return nullptr;
7011}
7012
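// Map a control-flow intrinsic (amdgcn_if/else/loop) wrapped in an
// INTRINSIC_W_CHAIN node to the corresponding AMDGPUISD opcode, or return 0 if
// the node is not one of these intrinsics.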
7013unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7014 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7015 switch (Intr->getConstantOperandVal(1)) {
7016 case Intrinsic::amdgcn_if:
7017 return AMDGPUISD::IF;
7018 case Intrinsic::amdgcn_else:
7019 return AMDGPUISD::ELSE;
7020 case Intrinsic::amdgcn_loop:
7021 return AMDGPUISD::LOOP;
7022 case Intrinsic::amdgcn_end_cf:
7023 llvm_unreachable("should not occur");
7024 default:
7025 return 0;
7026 }
7027 }
7028
7029 // break, if_break, else_break are all only used as inputs to loop, not
7030 // directly as branch conditions.
7031 return 0;
7032}
7033
7034 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7035 const Triple &TT = getTargetMachine().getTargetTriple();
7036 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7037 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7038 AMDGPU::shouldEmitConstantsToTextSection(TT);
7039 }
7040
7041 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7042 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7043 return false;
7044
7045 // FIXME: Either avoid relying on address space here or change the default
7046 // address space for functions to avoid the explicit check.
7047 return (GV->getValueType()->isFunctionTy() ||
7048 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
7049 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
7050 }
7051
7052 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7053 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7054}
7055
7056 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7057 if (!GV->hasExternalLinkage())
7058 return true;
7059
7060 const auto OS = getTargetMachine().getTargetTriple().getOS();
7061 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7062}
7063
7064 /// This transforms the control flow intrinsics to get the branch destination as
7065 /// the last parameter, and also switches the branch target with BR if the need arises.
7066SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7067 SDLoc DL(BRCOND);
7068
7069 SDNode *Intr = BRCOND.getOperand(1).getNode();
7070 SDValue Target = BRCOND.getOperand(2);
7071 SDNode *BR = nullptr;
7072 SDNode *SetCC = nullptr;
7073
7074 if (Intr->getOpcode() == ISD::SETCC) {
7075 // As long as we negate the condition everything is fine
7076 SetCC = Intr;
7077 Intr = SetCC->getOperand(0).getNode();
7078
7079 } else {
7080 // Get the target from BR if we don't negate the condition
7081 BR = findUser(BRCOND, ISD::BR);
7082 assert(BR && "brcond missing unconditional branch user");
7083 Target = BR->getOperand(1);
7084 }
7085
7086 unsigned CFNode = isCFIntrinsic(Intr);
7087 if (CFNode == 0) {
7088 // This is a uniform branch so we don't need to legalize.
7089 return BRCOND;
7090 }
7091
7092 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7093 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7094
7095 assert(!SetCC ||
7096 (SetCC->getConstantOperandVal(1) == 1 &&
7097 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7098 ISD::SETNE));
7099
7100 // operands of the new intrinsic call
7101 SmallVector<SDValue, 8> Ops;
7102 if (HaveChain)
7103 Ops.push_back(BRCOND.getOperand(0));
7104
7105 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7106 Ops.push_back(Target);
7107
7108 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7109
7110 // build the new intrinsic call
7111 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7112
7113 if (!HaveChain) {
7114 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7115
7116 Result = DAG.getMergeValues(Ops, DL).getNode();
7117 }
7118
7119 if (BR) {
7120 // Give the branch instruction our target
7121 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7122 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7123 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7124 }
7125
7126 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7127
7128 // Copy the intrinsic results to registers
7129 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7130 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7131 if (!CopyToReg)
7132 continue;
7133
7134 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7135 SDValue(Result, i - 1), SDValue());
7136
7137 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7138 }
7139
7140 // Remove the old intrinsic from the chain
7141 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7142 Intr->getOperand(0));
7143
7144 return Chain;
7145}
7146
7147SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7148 MVT VT = Op.getSimpleValueType();
7149 SDLoc DL(Op);
7150 // Checking the depth
7151 if (Op.getConstantOperandVal(0) != 0)
7152 return DAG.getConstant(0, DL, VT);
7153
7154 MachineFunction &MF = DAG.getMachineFunction();
7155 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7156 // Check for kernel and shader functions
7157 if (Info->isEntryFunction())
7158 return DAG.getConstant(0, DL, VT);
7159
7160 MachineFrameInfo &MFI = MF.getFrameInfo();
7161 // There is a call to @llvm.returnaddress in this function
7162 MFI.setReturnAddressIsTaken(true);
7163
7165 // Get the return address reg and mark it as an implicit live-in
7166 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7167 getRegClassFor(VT, Op.getNode()->isDivergent()));
7168
7169 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7170}
7171
7172SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7173 const SDLoc &DL, EVT VT) const {
7174 return Op.getValueType().bitsLE(VT)
7175 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7176 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7177 DAG.getTargetConstant(0, DL, MVT::i32));
7178}
7179
7180SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7181 SelectionDAG &DAG) const {
7182 EVT DstVT = Op.getValueType();
7183 unsigned NumElts = DstVT.getVectorNumElements();
7184 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7185
7186 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7187
7188 SDLoc DL(Op);
7189 unsigned Opc = Op.getOpcode();
7190 SDValue Flags = Op.getOperand(1);
7191 EVT HalfDstVT =
7192 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7193 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7194 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7195
7196 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7197}
7198
7199SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7200 SDValue Src = Op.getOperand(0);
7201 EVT SrcVT = Src.getValueType();
7202 EVT DstVT = Op.getValueType();
7203
7204 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7205 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7206 if (SrcVT.getScalarType() != MVT::f32)
7207 return SDValue();
7208 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7209 }
7210
7211 if (SrcVT.getScalarType() != MVT::f64)
7212 return Op;
7213
7214 SDLoc DL(Op);
7215 if (DstVT == MVT::f16) {
7216 // TODO: Handle strictfp
7217 if (Op.getOpcode() != ISD::FP_ROUND)
7218 return Op;
7219
7220 if (!Subtarget->has16BitInsts()) {
7221 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7222 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7223 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7224 }
7225 if (Op->getFlags().hasApproximateFuncs()) {
7226 SDValue Flags = Op.getOperand(1);
7227 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7228 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7229 }
7230 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7231 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7232 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7233 }
7234
7235 assert(DstVT.getScalarType() == MVT::bf16 &&
7236 "custom lower FP_ROUND for f16 or bf16");
7237 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7238
7239 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7240 // hardware f32 -> bf16 instruction.
7241 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7242 MVT::f32;
7243 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7244 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7245 DAG.getTargetConstant(0, DL, MVT::i32));
7246}
7247
7248SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7249 SelectionDAG &DAG) const {
7250 EVT VT = Op.getValueType();
7251 const MachineFunction &MF = DAG.getMachineFunction();
7252 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7253 bool IsIEEEMode = Info->getMode().IEEE;
7254
7255 // FIXME: Assert during selection that this is only selected for
7256 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7257 // mode functions, but this happens to be OK since it's only done in cases
7258 // where it is known that no sNaN can occur.
7259 if (IsIEEEMode)
7260 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7261
7262 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7263 VT == MVT::v16bf16)
7264 return splitBinaryVectorOp(Op, DAG);
7265 return Op;
7266}
7267
7268SDValue
7269SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7270 SelectionDAG &DAG) const {
7271 EVT VT = Op.getValueType();
7272 const MachineFunction &MF = DAG.getMachineFunction();
7273 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7274 bool IsIEEEMode = Info->getMode().IEEE;
7275
7276 if (IsIEEEMode)
7277 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7278
7279 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7280 VT == MVT::v16bf16)
7281 return splitBinaryVectorOp(Op, DAG);
7282 return Op;
7283}
7284
7285SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7286 SelectionDAG &DAG) const {
7287 EVT VT = Op.getValueType();
7288 if (VT.isVector())
7289 return splitBinaryVectorOp(Op, DAG);
7290
7291 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7292 !Subtarget->hasMinimum3Maximum3F16() &&
7293 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7294 "should not need to widen f16 minimum/maximum to v2f16");
7295
7296 // Widen f16 operation to v2f16
7297
7298 // fminimum f16:x, f16:y ->
7299 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7300 // (v2f16 (scalar_to_vector y))), 0
7301 SDLoc SL(Op);
7302 SDValue WideSrc0 =
7303 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7304 SDValue WideSrc1 =
7305 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7306
7307 SDValue Widened =
7308 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7309
7310 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7311 DAG.getConstant(0, SL, MVT::i32));
7312}
7313
7314SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7315 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7316 EVT VT = Op.getValueType();
7317 assert(VT == MVT::f16);
7318
7319 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7320 EVT ExpVT = Exp.getValueType();
7321 if (ExpVT == MVT::i16)
7322 return Op;
7323
7324 SDLoc DL(Op);
7325
7326 // Correct the exponent type for f16 to i16.
7327 // Clamp the range of the exponent to the instruction's range.
7328
7329 // TODO: This should be a generic narrowing legalization, and can easily be
7330 // done for GlobalISel as well.
7331
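// Illustrative sketch of the clamp-and-truncate below for an i32 exponent:
//   fldexp x, exp  ->  fldexp x, (i16 (trunc (smin (smax exp, -32768), 32767)))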
7332 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7333 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7334
7335 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7336 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7337
7338 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7339
7340 if (IsStrict) {
7341 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7342 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7343 }
7344
7345 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7346}
7347
7348 static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7349 switch (Op->getOpcode()) {
7350 case ISD::SRA:
7351 case ISD::SMIN:
7352 case ISD::SMAX:
7353 return ISD::SIGN_EXTEND;
7354 case ISD::SRL:
7355 case ISD::UMIN:
7356 case ISD::UMAX:
7357 return ISD::ZERO_EXTEND;
7358 case ISD::ADD:
7359 case ISD::SUB:
7360 case ISD::AND:
7361 case ISD::OR:
7362 case ISD::XOR:
7363 case ISD::SHL:
7364 case ISD::SELECT:
7365 case ISD::MUL:
7366 // operation result won't be influenced by garbage high bits.
7367 // TODO: are all of those cases correct, and are there more?
7368 return ISD::ANY_EXTEND;
7369 case ISD::SETCC: {
7370 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7371 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7372 }
7373 default:
7374 llvm_unreachable("unexpected opcode!");
7375 }
7376}
7377
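// Promote a uniform operation on a narrow type to i32 so it can be selected to
// the scalar ALU. A rough sketch of the transform for a uniform i16 add:
//   (i16 (add x, y)) -> (i16 (trunc (i32 (add (any_extend x), (any_extend y)))))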
7378SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7379 DAGCombinerInfo &DCI) const {
7380 const unsigned Opc = Op.getOpcode();
7381 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7382 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7383 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7384 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7385 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7386
7387 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7388 : Op->getOperand(0).getValueType();
7389 auto ExtTy = OpTy.changeElementType(MVT::i32);
7390
7391 if (DCI.isBeforeLegalizeOps() ||
7392 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7393 return SDValue();
7394
7395 auto &DAG = DCI.DAG;
7396
7397 SDLoc DL(Op);
7398 SDValue LHS;
7399 SDValue RHS;
7400 if (Opc == ISD::SELECT) {
7401 LHS = Op->getOperand(1);
7402 RHS = Op->getOperand(2);
7403 } else {
7404 LHS = Op->getOperand(0);
7405 RHS = Op->getOperand(1);
7406 }
7407
7408 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7409 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7410
7411 // Special case: for shifts, the RHS always needs a zext.
7412 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7413 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7414 else
7415 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7416
7417 // setcc always returns an i1 or an i1 vector, so there is no need to truncate afterwards.
7418 if (Opc == ISD::SETCC) {
7419 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7420 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7421 }
7422
7423 // For other ops, we extend the operation's return type as well so we need to
7424 // truncate back to the original type.
7425 SDValue NewVal;
7426 if (Opc == ISD::SELECT)
7427 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7428 else
7429 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7430
7431 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7432}
7433
7434SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7435 SDValue Mag = Op.getOperand(0);
7436 EVT MagVT = Mag.getValueType();
7437
7438 if (MagVT.getVectorNumElements() > 2)
7439 return splitBinaryVectorOp(Op, DAG);
7440
7441 SDValue Sign = Op.getOperand(1);
7442 EVT SignVT = Sign.getValueType();
7443
7444 if (MagVT == SignVT)
7445 return Op;
7446
7447 // fcopysign v2f16:mag, v2f32:sign ->
7448 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7449
7450 SDLoc SL(Op);
7451 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7452 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7453
7454 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7455
7456 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7457}
7458
7459// Custom lowering for vector multiplications and s_mul_u64.
7460SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7461 EVT VT = Op.getValueType();
7462
7463 // Split vector operands.
7464 if (VT.isVector())
7465 return splitBinaryVectorOp(Op, DAG);
7466
7467 assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
7468
7469 // There are four ways to lower s_mul_u64:
7470 //
7471 // 1. If all the operands are uniform, then we lower it as it is.
7472 //
7473 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7474 // multiplications because there is not a vector equivalent of s_mul_u64.
7475 //
7476 // 3. If the cost model decides that it is more efficient to use vector
7477 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7478 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7479 //
7480 // 4. If the cost model decides to use vector registers and both of the
7481 // operands are zero-extended/sign-extended from 32-bits, then we split the
7482 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7483 // possible to check if the operands are zero-extended or sign-extended in
7484 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7485 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7486 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7487 // If the cost model decides that we have to use vector registers, then
7488 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7489 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7490 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7491 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7492 // SIInstrInfo.cpp .
7493
7494 if (Op->isDivergent())
7495 return SDValue();
7496
7497 SDValue Op0 = Op.getOperand(0);
7498 SDValue Op1 = Op.getOperand(1);
7499 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7500 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7501 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7502 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7503 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7504 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7505 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7506 SDLoc SL(Op);
7507 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7508 return SDValue(
7509 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7510 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7511 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7512 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7513 return SDValue(
7514 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7515 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7516 return Op;
7517}
7518
7519SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7520 EVT VT = Op.getValueType();
7521 SDLoc SL(Op);
7522 SDValue LHS = Op.getOperand(0);
7523 SDValue RHS = Op.getOperand(1);
7524 bool isSigned = Op.getOpcode() == ISD::SMULO;
7525
7526 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7527 const APInt &C = RHSC->getAPIntValue();
7528 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7529 if (C.isPowerOf2()) {
7530 // smulo(x, signed_min) is same as umulo(x, signed_min).
7531 bool UseArithShift = isSigned && !C.isMinSignedValue();
7532 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7533 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7534 SDValue Overflow =
7535 DAG.getSetCC(SL, MVT::i1,
7536 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7537 Result, ShiftAmt),
7538 LHS, ISD::SETNE);
7539 return DAG.getMergeValues({Result, Overflow}, SL);
7540 }
7541 }
7542
7543 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7544 SDValue Top =
7545 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7546
7547 SDValue Sign = isSigned
7548 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7549 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7550 SL, MVT::i32))
7551 : DAG.getConstant(0, SL, VT);
7552 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7553
7554 return DAG.getMergeValues({Result, Overflow}, SL);
7555}
7556
7557SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7558 if (Op->isDivergent()) {
7559 // Select to V_MAD_[IU]64_[IU]32.
7560 return Op;
7561 }
7562 if (Subtarget->hasSMulHi()) {
7563 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7564 return SDValue();
7565 }
7566 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7567 // calculate the high part, so we might as well do the whole thing with
7568 // V_MAD_[IU]64_[IU]32.
7569 return Op;
7570}
7571
7572SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7573 if (!Subtarget->isTrapHandlerEnabled() ||
7574 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7575 return lowerTrapEndpgm(Op, DAG);
7576
7577 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7578 : lowerTrapHsaQueuePtr(Op, DAG);
7579}
7580
7581SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7582 SDLoc SL(Op);
7583 SDValue Chain = Op.getOperand(0);
7584 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7585}
7586
7587SDValue
7588SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7589 const SDLoc &DL, Align Alignment,
7590 ImplicitParameter Param) const {
7591 MachineFunction &MF = DAG.getMachineFunction();
7592 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7593 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7594 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7595 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7596 MachineMemOperand::MODereferenceable |
7597 MachineMemOperand::MOInvariant);
7598 }
7599
7600SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7601 SelectionDAG &DAG) const {
7602 SDLoc SL(Op);
7603 SDValue Chain = Op.getOperand(0);
7604
7605 SDValue QueuePtr;
7606 // For code object version 5, QueuePtr is passed through implicit kernarg.
7607 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7608 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7609 QueuePtr =
7610 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7611 } else {
7612 const MachineFunction &MF = DAG.getMachineFunction();
7613 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7614 Register UserSGPR = Info->getQueuePtrUserSGPR();
7615
7616 if (UserSGPR == AMDGPU::NoRegister) {
7617 // We probably are in a function incorrectly marked with
7618 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7619 // trap, so just use a null pointer.
7620 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7621 } else {
7622 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7623 MVT::i64);
7624 }
7625 }
7626
7627 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7628 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7629
7630 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7631 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7632 ToReg.getValue(1)};
7633 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7634}
7635
7636SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7637 SDLoc SL(Op);
7638 SDValue Chain = Op.getOperand(0);
7639
7640 // We need to simulate the 's_trap 2' instruction on targets that run in
7641 // PRIV=1 (where it is treated as a nop).
7642 if (Subtarget->hasPrivEnabledTrap2NopBug())
7643 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7644
7646 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7647 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7648}
7649
7650SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7651 SDLoc SL(Op);
7652 SDValue Chain = Op.getOperand(0);
7653 MachineFunction &MF = DAG.getMachineFunction();
7654
7655 if (!Subtarget->isTrapHandlerEnabled() ||
7656 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7657 LLVMContext &Ctx = MF.getFunction().getContext();
7658 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7659 "debugtrap handler not supported",
7660 Op.getDebugLoc(), DS_Warning));
7661 return Chain;
7662 }
7663
7664 uint64_t TrapID =
7665 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7666 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7667 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7668}
7669
7670SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7671 SelectionDAG &DAG) const {
7672 if (Subtarget->hasApertureRegs()) {
7673 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7674 ? AMDGPU::SRC_SHARED_BASE
7675 : AMDGPU::SRC_PRIVATE_BASE;
7676 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7677 !Subtarget->hasGloballyAddressableScratch()) &&
7678 "Cannot use src_private_base with globally addressable scratch!");
7679 // Note: this feature (register) is broken. When used as a 32-bit operand,
7680 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7681 // bits.
7682 //
7683 // To work around the issue, directly emit a 64 bit mov from this register
7684 // then extract the high bits. Note that this shouldn't even result in a
7685 // shift being emitted and simply become a pair of registers (e.g.):
7686 // s_mov_b64 s[6:7], src_shared_base
7687 // v_mov_b32_e32 v1, s7
7688 //
7689 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7690 // coalescing would kick in and it would think it's okay to use the "HI"
7691 // subregister directly (instead of extracting the HI 32 bits) which is an
7692 // artificial (unusable) register.
7693 // Register TableGen definitions would need an overhaul to get rid of the
7694 // artificial "HI" aperture registers and prevent this kind of issue from
7695 // happening.
7696 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7697 DAG.getRegister(ApertureRegNo, MVT::i64));
7698 return DAG.getNode(
7699 ISD::TRUNCATE, DL, MVT::i32,
7700 DAG.getNode(ISD::SRL, DL, MVT::i64,
7701 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7702 }
7703
7704 // For code object version 5, private_base and shared_base are passed through
7705 // implicit kernargs.
7706 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7707 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7708 ImplicitParameter Param =
7709 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7710 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7711 }
7712
7713 MachineFunction &MF = DAG.getMachineFunction();
7714 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7715 Register UserSGPR = Info->getQueuePtrUserSGPR();
7716 if (UserSGPR == AMDGPU::NoRegister) {
7717 // We probably are in a function incorrectly marked with
7718 // amdgpu-no-queue-ptr. This is undefined.
7719 return DAG.getPOISON(MVT::i32);
7720 }
7721
7722 SDValue QueuePtr =
7723 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7724
7725 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7726 // private_segment_aperture_base_hi.
7727 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7728
7729 SDValue Ptr =
7730 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7731
7732 // TODO: Use custom target PseudoSourceValue.
7733 // TODO: We should use the value from the IR intrinsic call, but it might not
7734 // be available and how do we get it?
7735 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7736 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7737 commonAlignment(Align(64), StructOffset),
7738 MachineMemOperand::MODereferenceable |
7739 MachineMemOperand::MOInvariant);
7740 }
7741
7742/// Return true if the value is a known valid address, such that a null check is
7743/// not necessary.
7744 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7745 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7746 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7747 return true;
7748
7749 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7750 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7751
7752 // TODO: Search through arithmetic, handle arguments and loads
7753 // marked nonnull.
7754 return false;
7755}
7756
7757SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7758 SelectionDAG &DAG) const {
7759 SDLoc SL(Op);
7760
7761 const AMDGPUTargetMachine &TM =
7762 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7763
7764 unsigned DestAS, SrcAS;
7765 SDValue Src;
7766 bool IsNonNull = false;
7767 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7768 SrcAS = ASC->getSrcAddressSpace();
7769 Src = ASC->getOperand(0);
7770 DestAS = ASC->getDestAddressSpace();
7771 } else {
7772 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7773 Op.getConstantOperandVal(0) ==
7774 Intrinsic::amdgcn_addrspacecast_nonnull);
7775 Src = Op->getOperand(1);
7776 SrcAS = Op->getConstantOperandVal(2);
7777 DestAS = Op->getConstantOperandVal(3);
7778 IsNonNull = true;
7779 }
7780
7781 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7782
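// Unless the source is known non-null, the cast below is guarded against the
// null pointer of the source space, roughly:
//   select (setne src, src_space_null), converted_ptr, dest_space_null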
7783 // flat -> local/private
7784 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7785 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7786 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7787 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7788
7789 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
7790 Subtarget->hasGloballyAddressableScratch()) {
7791 // flat -> private with globally addressable scratch: subtract
7792 // src_flat_scratch_base_lo.
7793 SDValue FlatScratchBaseLo(
7794 DAG.getMachineNode(
7795 AMDGPU::S_MOV_B32, SL, MVT::i32,
7796 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7797 0);
7798 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
7799 }
7800
7801 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7802 return Ptr;
7803
7804 unsigned NullVal = TM.getNullPointerValue(DestAS);
7805 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7806 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7807
7808 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7809 SegmentNullPtr);
7810 }
7811 }
7812
7813 // local/private -> flat
7814 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7815 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7816 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7817 SDValue CvtPtr;
7818 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
7819 Subtarget->hasGloballyAddressableScratch()) {
7820 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
7821 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
7822 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
7823 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
7824 ThreadID = DAG.getNode(
7825 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7826 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
7827 AllOnes, ThreadID);
7828 if (Subtarget->isWave64())
7829 ThreadID = DAG.getNode(
7830 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7831 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
7832 AllOnes, ThreadID);
7833 SDValue ShAmt = DAG.getShiftAmountConstant(
7834 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
7835 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
7836 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
7837 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7838 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
7839 // 64-bit hi:lo value.
7840 SDValue FlatScratchBase = {
7841 DAG.getMachineNode(
7842 AMDGPU::S_MOV_B64, SL, MVT::i64,
7843 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7844 0};
7845 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7846 } else {
7847 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7848 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7849 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7850 }
7851
7852 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7853 return CvtPtr;
7854
7855 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7856 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7857
7858 SDValue NonNull =
7859 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7860
7861 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7862 FlatNullPtr);
7863 }
7864 }
7865
7866 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7867 Op.getValueType() == MVT::i64) {
7868 const SIMachineFunctionInfo *Info =
7869 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7870 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7871 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7872 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7873 }
7874
7875 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7876 Src.getValueType() == MVT::i64)
7877 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7878
7879 // global <-> flat are no-ops and never emitted.
7880
7881 // Invalid casts are poison.
7882 return DAG.getPOISON(Op->getValueType(0));
7883}
7884
7885// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7886// the small vector and inserting them into the big vector. That is better than
7887// the default expansion of doing it via a stack slot. Even though the use of
7888// the stack slot would be optimized away afterwards, the stack slot itself
7889// remains.
7890SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7891 SelectionDAG &DAG) const {
7892 SDValue Vec = Op.getOperand(0);
7893 SDValue Ins = Op.getOperand(1);
7894 SDValue Idx = Op.getOperand(2);
7895 EVT VecVT = Vec.getValueType();
7896 EVT InsVT = Ins.getValueType();
7897 EVT EltVT = VecVT.getVectorElementType();
7898 unsigned InsNumElts = InsVT.getVectorNumElements();
7899 unsigned IdxVal = Idx->getAsZExtVal();
7900 SDLoc SL(Op);
7901
7902 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7903 // Insert 32-bit registers at a time.
7904 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7905
7906 unsigned VecNumElts = VecVT.getVectorNumElements();
7907 EVT NewVecVT =
7908 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7909 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7910 : EVT::getVectorVT(*DAG.getContext(),
7911 MVT::i32, InsNumElts / 2);
7912
7913 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7914 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7915
7916 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7917 SDValue Elt;
7918 if (InsNumElts == 2) {
7919 Elt = Ins;
7920 } else {
7921 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7922 DAG.getConstant(I, SL, MVT::i32));
7923 }
7924 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7925 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7926 }
7927
7928 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7929 }
7930
7931 for (unsigned I = 0; I != InsNumElts; ++I) {
7932 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7933 DAG.getConstant(I, SL, MVT::i32));
7934 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7935 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7936 }
7937 return Vec;
7938}
7939
7940SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7941 SelectionDAG &DAG) const {
7942 SDValue Vec = Op.getOperand(0);
7943 SDValue InsVal = Op.getOperand(1);
7944 SDValue Idx = Op.getOperand(2);
7945 EVT VecVT = Vec.getValueType();
7946 EVT EltVT = VecVT.getVectorElementType();
7947 unsigned VecSize = VecVT.getSizeInBits();
7948 unsigned EltSize = EltVT.getSizeInBits();
7949 SDLoc SL(Op);
7950
7951 // Specially handle the case of v4i16 with static indexing.
7952 unsigned NumElts = VecVT.getVectorNumElements();
7953 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7954 if (NumElts == 4 && EltSize == 16 && KIdx) {
7955 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7956
7957 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7958 DAG.getConstant(0, SL, MVT::i32));
7959 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7960 DAG.getConstant(1, SL, MVT::i32));
7961
7962 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7963 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7964
7965 unsigned Idx = KIdx->getZExtValue();
7966 bool InsertLo = Idx < 2;
7967 SDValue InsHalf = DAG.getNode(
7968 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7969 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7970 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7971
7972 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7973
7974 SDValue Concat =
7975 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7976 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7977
7978 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7979 }
7980
7981 // Static indexing does not lower to stack access, and hence there is no need
7982 // for special custom lowering to avoid stack access.
7983 if (isa<ConstantSDNode>(Idx))
7984 return SDValue();
7985
7986 // Avoid stack access for dynamic indexing by custom lowering to
7987 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7988
7989 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7990
7991 MVT IntVT = MVT::getIntegerVT(VecSize);
7992
7993 // Convert vector index to bit-index and get the required bit mask.
7994 assert(isPowerOf2_32(EltSize));
7995 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7996 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7997 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7998 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7999 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8000
8001 // 1. Create a congruent vector with the target value in each element.
8002 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8003 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8004
8005 // 2. Mask off all other indices except the required index within (1).
8006 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8007
8008 // 3. Mask off the required index within the target vector.
8009 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8010 SDValue RHS =
8011 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8012
8013 // 4. Get (2) and (3) ORed into the target vector.
8014 SDValue BFI =
8015 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8016
8017 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8018}
8019
8020SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8021 SelectionDAG &DAG) const {
8022 SDLoc SL(Op);
8023
8024 EVT ResultVT = Op.getValueType();
8025 SDValue Vec = Op.getOperand(0);
8026 SDValue Idx = Op.getOperand(1);
8027 EVT VecVT = Vec.getValueType();
8028 unsigned VecSize = VecVT.getSizeInBits();
8029 EVT EltVT = VecVT.getVectorElementType();
8030
8031 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8032
8033 // Make sure we do any optimizations that will make it easier to fold
8034 // source modifiers before obscuring it with bit operations.
8035
8036 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8037 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8038 return Combined;
8039
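// For 128/256/512-bit vectors the dynamic extract is done by splitting the
// source into two halves, selecting the half that holds the element, and
// extracting within it with a masked index, roughly:
//   extract_vector_elt (idx > NElem/2 - 1 ? Hi : Lo), (idx & (NElem/2 - 1))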
8040 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8041 SDValue Lo, Hi;
8042 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8043
8044 if (VecSize == 128) {
8045 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8046 Lo = DAG.getBitcast(LoVT,
8047 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8048 DAG.getConstant(0, SL, MVT::i32)));
8049 Hi = DAG.getBitcast(HiVT,
8050 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8051 DAG.getConstant(1, SL, MVT::i32)));
8052 } else if (VecSize == 256) {
8053 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8054 SDValue Parts[4];
8055 for (unsigned P = 0; P < 4; ++P) {
8056 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8057 DAG.getConstant(P, SL, MVT::i32));
8058 }
8059
8060 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8061 Parts[0], Parts[1]));
8062 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8063 Parts[2], Parts[3]));
8064 } else {
8065 assert(VecSize == 512);
8066
8067 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8068 SDValue Parts[8];
8069 for (unsigned P = 0; P < 8; ++P) {
8070 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8071 DAG.getConstant(P, SL, MVT::i32));
8072 }
8073
8074 Lo = DAG.getBitcast(LoVT,
8075 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8076 Parts[0], Parts[1], Parts[2], Parts[3]));
8077 Hi = DAG.getBitcast(HiVT,
8078 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8079 Parts[4], Parts[5], Parts[6], Parts[7]));
8080 }
8081
8082 EVT IdxVT = Idx.getValueType();
8083 unsigned NElem = VecVT.getVectorNumElements();
8084 assert(isPowerOf2_32(NElem));
8085 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8086 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8087 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8088 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8089 }
8090
8091 assert(VecSize <= 64);
8092
8093 MVT IntVT = MVT::getIntegerVT(VecSize);
8094
8095 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8096 SDValue VecBC = peekThroughBitcasts(Vec);
8097 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8098 SDValue Src = VecBC.getOperand(0);
8099 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8100 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8101 }
8102
8103 unsigned EltSize = EltVT.getSizeInBits();
8104 assert(isPowerOf2_32(EltSize));
8105
8106 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8107
8108 // Convert vector index to bit-index (* EltSize)
8109 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8110
8111 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8112 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8113
8114 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8115 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8116 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8117 }
8118
8119 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8120}
8121
8122static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8123 assert(Elt % 2 == 0);
8124 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8125}
8126
8127static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8128 assert(Elt % 2 == 0);
8129 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8130 !(Mask[Elt + 1] & 1);
8131}
8132
8133SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8134 SelectionDAG &DAG) const {
8135 SDLoc SL(Op);
8136 EVT ResultVT = Op.getValueType();
8137 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8138 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8139 const int NewSrcNumElts = 2;
8140 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8141 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8142
8143 // Break up the shuffle into registers sized pieces.
8144 //
8145 // We're trying to form sub-shuffles that the register allocation pipeline
8146 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8147 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8148 // pair of copies into a consecutive register copy, so use the ordinary
8149 // extract_vector_elt lowering unless we can use the shuffle.
8150 //
8151 // TODO: This is a bit of hack, and we should probably always use
8152 // extract_subvector for the largest possible subvector we can (or at least
8153 // use it for PackVT aligned pieces). However, we have worse support for
8154 // combines on them and don't directly treat extract_subvector / insert_subvector
8155 // as legal. The DAG scheduler also ends up doing a worse job with the
8156 // extract_subvectors.
8157 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8158
8159 // vector_shuffle <0,1,6,7> lhs, rhs
8160 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8161 //
8162 // vector_shuffle <6,7,2,3> lhs, rhs
8163 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8164 //
8165 // vector_shuffle <6,7,0,1> lhs, rhs
8166 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8167
8168 // Avoid scalarizing when both halves are reading from consecutive elements.
8169
8170 // If we're treating 2 element shuffles as legal, also create odd-to-even
8171 // shuffles of neighboring pairs.
8172 //
8173 // vector_shuffle <3,2,7,6> lhs, rhs
8174 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8175 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8176
8177 SmallVector<SDValue, 16> Pieces;
8178 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8179 if (ShouldUseConsecutiveExtract &&
8180 elementPairIsContiguous(SVN->getMask(), I)) {
8181 const int Idx = SVN->getMaskElt(I);
8182 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8183 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8184 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8185 SVN->getOperand(VecIdx),
8186 DAG.getConstant(EltIdx, SL, MVT::i32));
8187 Pieces.push_back(SubVec);
8188 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8189 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8190 int Idx0 = SVN->getMaskElt(I);
8191 int Idx1 = SVN->getMaskElt(I + 1);
8192
8193 SDValue SrcOp0 = SVN->getOperand(0);
8194 SDValue SrcOp1 = SrcOp0;
8195 if (Idx0 >= SrcNumElts) {
8196 SrcOp0 = SVN->getOperand(1);
8197 Idx0 -= SrcNumElts;
8198 }
8199
8200 if (Idx1 >= SrcNumElts) {
8201 SrcOp1 = SVN->getOperand(1);
8202 Idx1 -= SrcNumElts;
8203 }
8204
8205 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8206 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8207
8208 // Extract nearest even aligned piece.
8209 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8210 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8211 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8212 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8213
8214 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8215 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8216
8217 SDValue Result0 = SubVec0;
8218 SDValue Result1 = SubVec0;
8219
8220 if (SubVec0 != SubVec1) {
8221 NewMaskIdx1 += NewSrcNumElts;
8222 Result1 = SubVec1;
8223 } else {
8224 Result1 = DAG.getPOISON(PackVT);
8225 }
8226
8227 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8228 {NewMaskIdx0, NewMaskIdx1});
8229 Pieces.push_back(Shuf);
8230 } else {
8231 const int Idx0 = SVN->getMaskElt(I);
8232 const int Idx1 = SVN->getMaskElt(I + 1);
8233 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8234 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8235 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8236 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8237
8238 SDValue Vec0 = SVN->getOperand(VecIdx0);
8239 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8240 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8241
8242 SDValue Vec1 = SVN->getOperand(VecIdx1);
8243 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8244 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8245 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8246 }
8247 }
8248
8249 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8250}
8251
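// Lower scalar_to_vector by building a vector whose element 0 is the scalar
// operand and whose remaining elements are poison.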
8252SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8253 SelectionDAG &DAG) const {
8254 SDValue SVal = Op.getOperand(0);
8255 EVT ResultVT = Op.getValueType();
8256 EVT SValVT = SVal.getValueType();
8257 SDValue UndefVal = DAG.getPOISON(SValVT);
8258 SDLoc SL(Op);
8259
8260 SmallVector<SDValue, 8> VElts;
8261 VElts.push_back(SVal);
8262 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8263 VElts.push_back(UndefVal);
8264
8265 return DAG.getBuildVector(ResultVT, SL, VElts);
8266}
8267
8268SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8269 SelectionDAG &DAG) const {
8270 SDLoc SL(Op);
8271 EVT VT = Op.getValueType();
8272
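// For 2 x 16-bit build_vectors the two elements are packed into an i32 as
// roughly (zext hi << 16) | zext lo and bitcast back to the result type;
// wider vectors are split into 2-element chunks below.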
8273 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8274 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8275
8276 SDValue Lo = Op.getOperand(0);
8277 SDValue Hi = Op.getOperand(1);
8278
8279 // Avoid adding defined bits with the zero_extend.
8280 if (Hi.isUndef()) {
8281 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8282 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8283 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8284 }
8285
8286 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8287 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8288
8289 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8290 DAG.getConstant(16, SL, MVT::i32));
8291 if (Lo.isUndef())
8292 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8293
8294 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8295 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8296
8297 SDValue Or =
8298 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8299 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8300 }
8301
8302 // Split into 2-element chunks.
8303 const unsigned NumParts = VT.getVectorNumElements() / 2;
8304 MVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8305 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8306
8307 SmallVector<SDValue> Casts;
8308 for (unsigned P = 0; P < NumParts; ++P) {
8309 SDValue Vec = DAG.getBuildVector(
8310 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8311 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8312 }
8313
8314 SDValue Blend =
8315 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8316 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8317}
8318
8319 bool SITargetLowering::isOffsetFoldingLegal(
8320 const GlobalAddressSDNode *GA) const {
8321 // OSes that use ELF REL relocations (instead of RELA) can only store a
8322 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8323 // which can create arbitrary 64-bit addends. (This is only a problem for
8324 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8325 // the high 32 bits of the addend.)
8326 //
8327 // This should be kept in sync with how HasRelocationAddend is initialized in
8328 // the constructor of ELFAMDGPUAsmBackend.
8329 if (!Subtarget->isAmdHsaOS())
8330 return false;
8331
8332 // We can fold offsets for anything that doesn't require a GOT relocation.
8333 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8334 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8335 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8336 !shouldEmitGOTReloc(GA->getGlobal());
8337 }
8338
8339static SDValue
8340 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8341 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8342 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8343 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8344 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8345 // lowered to the following code sequence:
8346 //
8347 // For constant address space:
8348 // s_getpc_b64 s[0:1]
8349 // s_add_u32 s0, s0, $symbol
8350 // s_addc_u32 s1, s1, 0
8351 //
8352 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8353 // a fixup or relocation is emitted to replace $symbol with a literal
8354 // constant, which is a pc-relative offset from the encoding of the $symbol
8355 // operand to the global variable.
8356 //
8357 // For global address space:
8358 // s_getpc_b64 s[0:1]
8359 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8360 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8361 //
8362 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8363 // fixups or relocations are emitted to replace $symbol@*@lo and
8364 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8365 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8366 // operand to the global variable.
8367 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8368 assert(GAFlags != SIInstrInfo::MO_NONE);
8369
8370 SDValue Ptr =
8371 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8372 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8373 }
8374
8375 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8376 SDValue PtrHi;
8377 if (GAFlags == SIInstrInfo::MO_NONE)
8378 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8379 else
8380 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8381 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8382}
8383
8384SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8385 SDValue Op,
8386 SelectionDAG &DAG) const {
8387 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8388 SDLoc DL(GSD);
8389 EVT PtrVT = Op.getValueType();
8390
8391 const GlobalValue *GV = GSD->getGlobal();
8392 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8393 shouldUseLDSConstAddress(GV)) ||
8394 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8395 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8396 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8397 GV->hasExternalLinkage()) {
8398 Type *Ty = GV->getValueType();
8399 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8400 // zero-sized type in other languages to declare the dynamic shared
8401 // memory which size is not known at the compile time. They will be
8402 // allocated by the runtime and placed directly after the static
8403 // allocated ones. They all share the same offset.
8404 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8405 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8406 // Adjust alignment for that dynamic shared memory array.
8407 Function &F = DAG.getMachineFunction().getFunction();
8408 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8409 MFI->setUsesDynamicLDS(true);
8410 return SDValue(
8411 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8412 }
8413 }
8414 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8415 }
8416
8417 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8418 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8419 SIInstrInfo::MO_ABS32_LO);
8420 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8421 }
8422
8423 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8424 if (Subtarget->has64BitLiterals()) {
8425 SDValue Addr = DAG.getTargetGlobalAddress(
8426 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8427 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8428 0);
8429 }
8430
8431 SDValue AddrLo = DAG.getTargetGlobalAddress(
8432 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8433 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8434
8435 SDValue AddrHi = DAG.getTargetGlobalAddress(
8436 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8437 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8438
8439 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8440 }
8441
8442 if (shouldEmitFixup(GV))
8443 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8444
8445 if (shouldEmitPCReloc(GV))
8446 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8447 SIInstrInfo::MO_REL32);
8448
8449 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8450 SIInstrInfo::MO_GOTPCREL32);
8451 PointerType *PtrTy =
8452 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8453 const DataLayout &DataLayout = DAG.getDataLayout();
8454 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8455 MachinePointerInfo PtrInfo =
8456 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8457
8458 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8459 MachineMemOperand::MODereferenceable |
8460 MachineMemOperand::MOInvariant);
8461 }
8462
8463 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8464 const SDLoc &DL, SDValue V) const {
8465 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8466 // the destination register.
8467 //
8468 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8469 // so we will end up with redundant moves to m0.
8470 //
8471 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8472
8473 // A Null SDValue creates a glue result.
8474 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8475 V, Chain);
8476 return SDValue(M0, 0);
8477}
8478
8479SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8480 MVT VT,
8481 unsigned Offset) const {
8482 SDLoc SL(Op);
8483 SDValue Param = lowerKernargMemParameter(
8484 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8485 // The local size values will have the hi 16-bits as zero.
8486 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8487 DAG.getValueType(VT));
8488}
8489
8490 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8491 EVT VT) {
8492 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8493 DAG.getMachineFunction().getFunction(),
8494 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8495 return DAG.getPOISON(VT);
8496}
8497
8498 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8499 EVT VT) {
8500 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8501 DAG.getMachineFunction().getFunction(),
8502 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8503 return DAG.getPOISON(VT);
8504}
8505
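// Bitcast the given elements to f32 and assemble them into one f32 vector,
// padding with poison; more than 12 elements are widened to a v16f32.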
8506 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8507 ArrayRef<SDValue> Elts) {
8508 assert(!Elts.empty());
8509 MVT Type;
8510 unsigned NumElts = Elts.size();
8511
8512 if (NumElts <= 12) {
8513 Type = MVT::getVectorVT(MVT::f32, NumElts);
8514 } else {
8515 assert(Elts.size() <= 16);
8516 Type = MVT::v16f32;
8517 NumElts = 16;
8518 }
8519
8520 SmallVector<SDValue, 16> VecElts(NumElts);
8521 for (unsigned i = 0; i < Elts.size(); ++i) {
8522 SDValue Elt = Elts[i];
8523 if (Elt.getValueType() != MVT::f32)
8524 Elt = DAG.getBitcast(MVT::f32, Elt);
8525 VecElts[i] = Elt;
8526 }
8527 for (unsigned i = Elts.size(); i < NumElts; ++i)
8528 VecElts[i] = DAG.getPOISON(MVT::f32);
8529
8530 if (NumElts == 1)
8531 return VecElts[0];
8532 return DAG.getBuildVector(Type, DL, VecElts);
8533}
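// Note: getBuildDwordsVector always yields f32-typed dwords. Up to 12
// elements keep their exact width (e.g. 5 address dwords become a v5f32);
// larger counts are padded with poison up to v16f32, and a single element is
// returned unwrapped.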
8534
8535static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8536 SDValue Src, int ExtraElts) {
8537 EVT SrcVT = Src.getValueType();
8538
  SmallVector<SDValue, 8> Elts;
8540
8541 if (SrcVT.isVector())
8542 DAG.ExtractVectorElements(Src, Elts);
8543 else
8544 Elts.push_back(Src);
8545
8546 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8547 while (ExtraElts--)
8548 Elts.push_back(Undef);
8549
8550 return DAG.getBuildVector(CastVT, DL, Elts);
8551}
8552
8553 // Re-construct the required return value for an image load intrinsic.
8554 // This is more complicated due to the optional use of TexFailCtrl, which means
8555 // the required return type is an aggregate.
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8557 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8558 bool Unpacked, bool IsD16, int DMaskPop,
8559 int NumVDataDwords, bool IsAtomicPacked16Bit,
8560 const SDLoc &DL) {
8561 // Determine the required return type. This is the same regardless of
8562 // IsTexFail flag
8563 EVT ReqRetVT = ResultTypes[0];
8564 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8565 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8566 ? (ReqRetNumElts + 1) / 2
8567 : ReqRetNumElts;
8568
8569 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8570
8571 MVT DataDwordVT =
8572 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8573
8574 MVT MaskPopVT =
8575 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8576
8577 SDValue Data(Result, 0);
8578 SDValue TexFail;
8579
8580 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8581 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8582 if (MaskPopVT.isVector()) {
8583 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8584 SDValue(Result, 0), ZeroIdx);
8585 } else {
8586 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8587 SDValue(Result, 0), ZeroIdx);
8588 }
8589 }
8590
8591 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8592 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8593 NumDataDwords - MaskPopDwords);
8594
8595 if (IsD16)
8596 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8597
8598 EVT LegalReqRetVT = ReqRetVT;
8599 if (!ReqRetVT.isVector()) {
8600 if (!Data.getValueType().isInteger())
8601 Data = DAG.getNode(ISD::BITCAST, DL,
8602 Data.getValueType().changeTypeToInteger(), Data);
8603 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8604 } else {
8605 // We need to widen the return vector to a legal type
8606 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8607 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8608 LegalReqRetVT =
          EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
8610 ReqRetVT.getVectorNumElements() + 1);
8611 }
8612 }
8613 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8614
8615 if (IsTexFail) {
8616 TexFail =
8617 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8618 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8619
8620 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8621 }
8622
8623 if (Result->getNumValues() == 1)
8624 return Data;
8625
8626 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8627}
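// Worked example: a packed-d16 load returning <3 x half> needs
// (3 + 1) / 2 = 2 data dwords; when TexFailCtrl is enabled the caller
// requests one extra dword, which constructRetValue extracts separately as
// the TexFail value after the data dwords.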
8628
8629static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8630 SDValue *LWE, bool &IsTexFail) {
8631 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8632
8633 uint64_t Value = TexFailCtrlConst->getZExtValue();
8634 if (Value) {
8635 IsTexFail = true;
8636 }
8637
8638 SDLoc DL(TexFailCtrlConst);
8639 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8640 Value &= ~(uint64_t)0x1;
8641 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8642 Value &= ~(uint64_t)0x2;
8643
8644 return Value == 0;
8645}
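// Note: bit 0 of the TexFailCtrl immediate selects TFE and bit 1 selects LWE;
// if any other bit is set parseTexFail returns false and the caller leaves
// the intrinsic unlowered.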
8646
static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8648 MVT PackVectorVT,
8649 SmallVectorImpl<SDValue> &PackedAddrs,
8650 unsigned DimIdx, unsigned EndIdx,
8651 unsigned NumGradients) {
8652 SDLoc DL(Op);
8653 for (unsigned I = DimIdx; I < EndIdx; I++) {
8654 SDValue Addr = Op.getOperand(I);
8655
8656 // Gradients are packed with undef for each coordinate.
8657 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8658 // 1D: undef,dx/dh; undef,dx/dv
8659 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8660 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8661 if (((I + 1) >= EndIdx) ||
8662 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8663 I == DimIdx + NumGradients - 1))) {
8664 if (Addr.getValueType() != MVT::i16)
8665 Addr = DAG.getBitcast(MVT::i16, Addr);
8666 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8667 } else {
8668 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8669 I++;
8670 }
8671 Addr = DAG.getBitcast(MVT::f32, Addr);
8672 PackedAddrs.push_back(Addr);
8673 }
8674}
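// Worked example: for 2D derivatives the four 16-bit gradients pack into two
// dwords, <dy/dh, dx/dh> and <dy/dv, dx/dv>. For 1D and 3D the per-direction
// count is odd, so the trailing component of each direction gets its own
// dword with the high half left undefined, as the layout comment above shows.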
8675
8676SDValue SITargetLowering::lowerImage(SDValue Op,
                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
8678 SelectionDAG &DAG, bool WithChain) const {
8679 SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
8681 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8682 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8684 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8685 unsigned IntrOpcode = Intr->BaseOpcode;
8686 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8687 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8688 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8689
8690 SmallVector<EVT, 3> ResultTypes(Op->values());
8691 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8692 bool IsD16 = false;
8693 bool IsG16 = false;
8694 bool IsA16 = false;
8695 SDValue VData;
8696 int NumVDataDwords = 0;
8697 bool AdjustRetType = false;
8698 bool IsAtomicPacked16Bit = false;
8699
8700 // Offset of intrinsic arguments
8701 const unsigned ArgOffset = WithChain ? 2 : 1;
8702
8703 unsigned DMask;
8704 unsigned DMaskLanes = 0;
8705
8706 if (BaseOpcode->Atomic) {
8707 VData = Op.getOperand(2);
8708
8709 IsAtomicPacked16Bit =
8710 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8711 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8712
8713 bool Is64Bit = VData.getValueSizeInBits() == 64;
8714 if (BaseOpcode->AtomicX2) {
8715 SDValue VData2 = Op.getOperand(3);
8716 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8717 {VData, VData2});
8718 if (Is64Bit)
8719 VData = DAG.getBitcast(MVT::v4i32, VData);
8720
8721 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8722 DMask = Is64Bit ? 0xf : 0x3;
8723 NumVDataDwords = Is64Bit ? 4 : 2;
8724 } else {
8725 DMask = Is64Bit ? 0x3 : 0x1;
8726 NumVDataDwords = Is64Bit ? 2 : 1;
8727 }
8728 } else {
8729 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8730 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8731
8732 if (BaseOpcode->Store) {
8733 VData = Op.getOperand(2);
8734
8735 MVT StoreVT = VData.getSimpleValueType();
8736 if (StoreVT.getScalarType() == MVT::f16) {
8737 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8738 return Op; // D16 is unsupported for this instruction
8739
8740 IsD16 = true;
8741 VData = handleD16VData(VData, DAG, true);
8742 }
8743
8744 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8745 } else if (!BaseOpcode->NoReturn) {
8746 // Work out the num dwords based on the dmask popcount and underlying type
8747 // and whether packing is supported.
8748 MVT LoadVT = ResultTypes[0].getSimpleVT();
8749 if (LoadVT.getScalarType() == MVT::f16) {
8750 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8751 return Op; // D16 is unsupported for this instruction
8752
8753 IsD16 = true;
8754 }
8755
8756 // Confirm that the return type is large enough for the dmask specified
8757 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8758 (!LoadVT.isVector() && DMaskLanes > 1))
8759 return Op;
8760
8761 // The sq block of gfx8 and gfx9 do not estimate register use correctly
8762 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8763 // instructions.
8764 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8765 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8766 NumVDataDwords = (DMaskLanes + 1) / 2;
8767 else
8768 NumVDataDwords = DMaskLanes;
8769
8770 AdjustRetType = true;
8771 }
8772 }
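  // Note: at this point DMask/DMaskLanes and NumVDataDwords describe how many
  // result dwords the MIMG instruction really produces: gather4 always uses 4
  // lanes, and packed d16 loads need only half as many dwords (rounded up)
  // unless the gfx8/gfx9 gather4 d16 bug forces the unpacked count.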
8773
8774 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  SmallVector<SDValue, 4> VAddrs;
8776
8777 // Check for 16 bit addresses or derivatives and pack if true.
8778 MVT VAddrVT =
8779 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8780 MVT VAddrScalarVT = VAddrVT.getScalarType();
8781 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8782 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8783
8784 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8785 VAddrScalarVT = VAddrVT.getScalarType();
8786 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8787 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8788
8789 // Push back extra arguments.
8790 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8791 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8792 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8793       // Special handling of bias when A16 is on. Bias is of type half but
8794       // occupies a full 32 bits.
8795 SDValue Bias = DAG.getBuildVector(
8796 MVT::v2f16, DL,
8797 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
8798 VAddrs.push_back(Bias);
8799 } else {
8800 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8801 "Bias needs to be converted to 16 bit in A16 mode");
8802 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8803 }
8804 }
8805
8806 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8807     // 16 bit gradients are supported, but are tied to the A16 control,
8808     // so both gradients and addresses must be 16 bit.
8809 LLVM_DEBUG(
8810 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8811 "require 16 bit args for both gradients and addresses");
8812 return Op;
8813 }
8814
8815 if (IsA16) {
8816 if (!ST->hasA16()) {
8817 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8818 "support 16 bit addresses\n");
8819 return Op;
8820 }
8821 }
8822
8823   // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
8824   // set then we have to compress/pack operands (either address, gradient,
8825   // or both).
8826   // In the case where A16 and gradients are tied (no G16 support), we have
8827   // already verified that both IsA16 and IsG16 are true.
8828 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8829 // Activate g16
8830 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8832 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8833 }
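  // Note: with G16 support, A16 and G16 are independent: A16 packs the
  // coordinates into 16-bit pairs while G16 packs only the derivatives, which
  // is why only the opcode's _g16 variant is selected here.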
8834
8835 // Add gradients (packed or unpacked)
8836 if (IsG16) {
8837 // Pack the gradients
8838 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8839 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8840 ArgOffset + Intr->GradientStart,
8841 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8842 } else {
8843 for (unsigned I = ArgOffset + Intr->GradientStart;
8844 I < ArgOffset + Intr->CoordStart; I++)
8845 VAddrs.push_back(Op.getOperand(I));
8846 }
8847
8848 // Add addresses (packed or unpacked)
8849 if (IsA16) {
8850 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8851 ArgOffset + Intr->CoordStart, VAddrEnd,
8852 0 /* No gradients */);
8853 } else {
8854 // Add uncompressed address
8855 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8856 VAddrs.push_back(Op.getOperand(I));
8857 }
8858
8859 // If the register allocator cannot place the address registers contiguously
8860 // without introducing moves, then using the non-sequential address encoding
8861 // is always preferable, since it saves VALU instructions and is usually a
8862 // wash in terms of code size or even better.
8863 //
8864 // However, we currently have no way of hinting to the register allocator that
8865 // MIMG addresses should be placed contiguously when it is possible to do so,
8866 // so force non-NSA for the common 2-address case as a heuristic.
8867 //
8868 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8869 // allocation when possible.
8870 //
8871 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8872 // set of the remaining addresses.
8873 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8874 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8875 const bool UseNSA = ST->hasNSAEncoding() &&
8876 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8877 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8878 const bool UsePartialNSA =
8879 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
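  // Rough example: with an NSA limit of 5 and 7 address dwords, partial NSA
  // keeps the first 4 dwords as individual operands and packs the remaining 3
  // into one contiguous vector, matching the take_front/drop_front of
  // NSAMaxSize - 1 used below.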
8880
8881 SDValue VAddr;
8882 if (UsePartialNSA) {
8883 VAddr = getBuildDwordsVector(DAG, DL,
8884 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8885 } else if (!UseNSA) {
8886 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8887 }
8888
8889 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8890 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8891 SDValue Unorm;
8892 if (!BaseOpcode->Sampler) {
8893 Unorm = True;
8894 } else {
8895 uint64_t UnormConst =
8896 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8897
8898 Unorm = UnormConst ? True : False;
8899 }
8900
8901 SDValue TFE;
8902 SDValue LWE;
8903 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8904 bool IsTexFail = false;
8905 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8906 return Op;
8907
8908 if (IsTexFail) {
8909 if (!DMaskLanes) {
8910       // Expecting to get an error flag since TFC is on and dmask is 0.
8911       // Force dmask to be at least 1, otherwise the instruction will fail.
8912 DMask = 0x1;
8913 DMaskLanes = 1;
8914 NumVDataDwords = 1;
8915 }
8916 NumVDataDwords += 1;
8917 AdjustRetType = true;
8918 }
8919
8920   // Something earlier has tagged the return type as needing adjustment.
8921   // This happens if the instruction is a load or has TexFailCtrl flags set.
8922 if (AdjustRetType) {
8923 // NumVDataDwords reflects the true number of dwords required in the return
8924 // type
8925 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8926 // This is a no-op load. This can be eliminated
8927 SDValue Undef = DAG.getPOISON(Op.getValueType());
8928 if (isa<MemSDNode>(Op))
8929 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8930 return Undef;
8931 }
8932
8933 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8934 MVT::i32, NumVDataDwords)
8935 : MVT::i32;
8936
8937 ResultTypes[0] = NewVT;
8938 if (ResultTypes.size() == 3) {
8939 // Original result was aggregate type used for TexFailCtrl results
8940 // The actual instruction returns as a vector type which has now been
8941 // created. Remove the aggregate result.
8942 ResultTypes.erase(&ResultTypes[1]);
8943 }
8944 }
8945
8946 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8947 if (BaseOpcode->Atomic)
8948 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8949 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
8951 return Op;
  SmallVector<SDValue, 26> Ops;
8952
8954 if (BaseOpcode->Store || BaseOpcode->Atomic)
8955 Ops.push_back(VData); // vdata
8956 if (UsePartialNSA) {
8957 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8958 Ops.push_back(VAddr);
8959 } else if (UseNSA)
8960 append_range(Ops, VAddrs);
8961 else
8962 Ops.push_back(VAddr);
8963 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8964 EVT RsrcVT = Rsrc.getValueType();
8965 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8966 return Op;
8967 Ops.push_back(Rsrc);
8968 if (BaseOpcode->Sampler) {
8969 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8970 if (Samp.getValueType() != MVT::v4i32)
8971 return Op;
8972 Ops.push_back(Samp);
8973 }
8974 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8975 if (IsGFX10Plus)
8976 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8977 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8978 Ops.push_back(Unorm);
8979 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8980 Ops.push_back(IsA16 && // r128, a16 for gfx9
8981 ST->hasFeature(AMDGPU::FeatureR128A16)
8982 ? True
8983 : False);
8984 if (IsGFX10Plus)
8985 Ops.push_back(IsA16 ? True : False);
8986
8987 if (!Subtarget->hasGFX90AInsts())
8988 Ops.push_back(TFE); // tfe
8989 else if (TFE->getAsZExtVal()) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        MF.getFunction(),
8992 "TFE is not supported on this GPU", DL.getDebugLoc()));
8993 }
8994
8995 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8996 Ops.push_back(LWE); // lwe
8997 if (!IsGFX10Plus)
8998 Ops.push_back(DimInfo->DA ? True : False);
8999 if (BaseOpcode->HasD16)
9000 Ops.push_back(IsD16 ? True : False);
9001 if (isa<MemSDNode>(Op))
9002 Ops.push_back(Op.getOperand(0)); // chain
9003
9004 int NumVAddrDwords =
9005 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9006 int Opcode = -1;
9007
9008 if (IsGFX12Plus) {
9009 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9010 NumVDataDwords, NumVAddrDwords);
9011 } else if (IsGFX11Plus) {
9012 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9013 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9014 : AMDGPU::MIMGEncGfx11Default,
9015 NumVDataDwords, NumVAddrDwords);
9016 } else if (IsGFX10Plus) {
9017 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9018 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9019 : AMDGPU::MIMGEncGfx10Default,
9020 NumVDataDwords, NumVAddrDwords);
9021 } else {
9022 if (Subtarget->hasGFX90AInsts()) {
9023 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9024 NumVDataDwords, NumVAddrDwords);
9025 if (Opcode == -1) {
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          MF.getFunction(),
9028 "requested image instruction is not supported on this GPU",
9029 DL.getDebugLoc()));
9030
9031 unsigned Idx = 0;
9032 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9033 for (EVT VT : OrigResultTypes) {
9034 if (VT == MVT::Other)
9035 RetValues[Idx++] = Op.getOperand(0); // Chain
9036 else
9037 RetValues[Idx++] = DAG.getPOISON(VT);
9038 }
9039
9040 return DAG.getMergeValues(RetValues, DL);
9041 }
9042 }
9043 if (Opcode == -1 &&
      Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9045 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9046 NumVDataDwords, NumVAddrDwords);
9047 if (Opcode == -1)
9048 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9049 NumVDataDwords, NumVAddrDwords);
9050 }
9051 if (Opcode == -1)
9052 return Op;
9053
9054 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9055 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9056 MachineMemOperand *MemRef = MemOp->getMemOperand();
9057 DAG.setNodeMemRefs(NewNode, {MemRef});
9058 }
9059
9060 if (BaseOpcode->AtomicX2) {
    SmallVector<SDValue, 1> Elt;
9062 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9063 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9064 }
9065 if (BaseOpcode->NoReturn)
9066 return SDValue(NewNode, 0);
9067 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9068 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9069 NumVDataDwords, IsAtomicPacked16Bit, DL);
9070}
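// Note: opcode selection above tries the newest matching MIMG encoding first
// (gfx12, then gfx11 and gfx10 in NSA or default form, then gfx90a) before
// falling back to the gfx8 and gfx6 encodings; if no encoding supports the
// requested dword counts, the intrinsic is returned unlowered.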
9071
9072SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9073 SDValue Offset, SDValue CachePolicy,
9074 SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
9076
9077 const DataLayout &DataLayout = DAG.getDataLayout();
9078 Align Alignment =
      DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9080
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
9085 VT.getStoreSize(), Alignment);
9086
9087 if (!Offset->isDivergent()) {
9088 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9089
9090 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9091 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9092 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9093 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9094 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9095 SDValue BufferLoad =
          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9097 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9098 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9099 }
9100
9101 // Widen vec3 load to vec4.
9102 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9103 !Subtarget->hasScalarDwordx3Loads()) {
9104 EVT WidenedVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9106 auto WidenedOp = DAG.getMemIntrinsicNode(
9107 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9108 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9109 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9110 DAG.getVectorIdxConstant(0, DL));
9111 return Subvector;
9112 }
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9113
9115 DAG.getVTList(VT), Ops, VT, MMO);
9116 }
9117
9118 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9119 // assume that the buffer is unswizzled.
9120 SDValue Ops[] = {
9121 DAG.getEntryNode(), // Chain
9122 Rsrc, // rsrc
9123 DAG.getConstant(0, DL, MVT::i32), // vindex
9124 {}, // voffset
9125 {}, // soffset
9126 {}, // offset
9127 CachePolicy, // cachepolicy
9128 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9129 };
9130 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9131 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9132 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9133 }
9134
  SmallVector<SDValue, 4> Loads;
9136 unsigned NumLoads = 1;
9137 MVT LoadVT = VT.getSimpleVT();
9138 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9139 assert((LoadVT.getScalarType() == MVT::i32 ||
9140 LoadVT.getScalarType() == MVT::f32));
9141
9142 if (NumElts == 8 || NumElts == 16) {
9143 NumLoads = NumElts / 4;
9144 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9145 }
9146
9147 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9148
9149 // Use the alignment to ensure that the required offsets will fit into the
9150 // immediate offsets.
9151 setBufferOffsets(Offset, DAG, &Ops[3],
9152 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9153
9154 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9155 for (unsigned i = 0; i < NumLoads; ++i) {
9156 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9157 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9158 LoadVT, MMO, DAG));
9159 }
9160
9161 if (NumElts == 8 || NumElts == 16)
9162 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9163
9164 return Loads[0];
9165}
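// Note: in the divergent-offset path above, 8- and 16-element results are
// split into v4 pieces loaded 16 bytes apart (the immediate offset is bumped
// by 16 * i per piece) and reassembled with CONCAT_VECTORS.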
9166
9167SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9168 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9169 if (!Subtarget->hasArchitectedSGPRs())
9170 return {};
9171 SDLoc SL(Op);
9172 MVT VT = MVT::i32;
9173 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9174 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9175 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9176}
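// Note: the BFE_U32 above takes (source, offset, width), so it extracts the
// 5-bit field at bit 25, i.e. TTMP8[29:25], which holds the wave ID within
// the workgroup on targets with architected SGPRs.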
9177
9178SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9179 unsigned Dim,
9180 const ArgDescriptor &Arg) const {
9181 SDLoc SL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
9183 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9184 if (MaxID == 0)
9185 return DAG.getConstant(0, SL, MVT::i32);
9186
9187 // It's undefined behavior if a function marked with the amdgpu-no-*
9188 // attributes uses the corresponding intrinsic.
9189 if (!Arg)
9190 return DAG.getPOISON(Op->getValueType(0));
9191
9192 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9193 SDLoc(DAG.getEntryNode()), Arg);
9194
9195 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9196 // masking operations anyway.
9197 //
9198 // TODO: We could assert the top bit is 0 for the source copy.
9199 if (Arg.isMasked())
9200 return Val;
9201
9202 // Preserve the known bits after expansion to a copy.
  EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9204 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9205 DAG.getValueType(SmallVT));
9206}
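// Note: the AssertZext above records that only the low bits needed to
// represent MaxID can be nonzero, so later combines keep the known-zero high
// bits of the unmasked workitem ID.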
9207
9208SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9209 SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
9211 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9212
9213 EVT VT = Op.getValueType();
9214 SDLoc DL(Op);
9215 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9216
9217 // TODO: Should this propagate fast-math-flags?
9218
9219 switch (IntrinsicID) {
9220 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9221 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9222 return emitNonHSAIntrinsicError(DAG, DL, VT);
9223 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9225 }
9226 case Intrinsic::amdgcn_dispatch_ptr:
9227 case Intrinsic::amdgcn_queue_ptr: {
9228 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9230 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9231 DL.getDebugLoc()));
9232 return DAG.getPOISON(VT);
9233 }
9234
9235 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                      ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                      : AMDGPUFunctionArgInfo::QUEUE_PTR;
9238 return getPreloadedValue(DAG, *MFI, VT, RegID);
9239 }
9240 case Intrinsic::amdgcn_implicitarg_ptr: {
9241 if (MFI->isEntryFunction())
9242 return getImplicitArgPtr(DAG, DL);
9243 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9245 }
9246 case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9248 // This only makes sense to call in a kernel, so just lower to null.
9249 return DAG.getConstant(0, DL, VT);
9250 }
9251
9252 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9254 }
9255 case Intrinsic::amdgcn_dispatch_id: {
9256 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9257 }
9258 case Intrinsic::amdgcn_rcp:
9259 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9260 case Intrinsic::amdgcn_rsq:
9261 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9262 case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9264 return emitRemovedIntrinsicError(DAG, DL, VT);
9265 return SDValue();
9266 case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9268 return emitRemovedIntrinsicError(DAG, DL, VT);
9269 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9270 case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9272 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9273
9274 Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9277
9278 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9279 SDValue Tmp =
9280 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9281 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9282 DAG.getConstantFP(Min, DL, VT));
9283 }
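  // Note: when the clamped rsq is unavailable, the sequence above approximates
  // it by clamping the plain RSQ result between the type's largest negative
  // and positive finite values using FMINNUM/FMAXNUM.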
9284 case Intrinsic::r600_read_ngroups_x:
9285 if (Subtarget->isAmdHsaOS())
9286 return emitNonHSAIntrinsicError(DAG, DL, VT);
9287
9288 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                     SI::KernelInputOffsets::NGROUPS_X, Align(4),
9290 false);
9291 case Intrinsic::r600_read_ngroups_y:
9292 if (Subtarget->isAmdHsaOS())
9293 return emitNonHSAIntrinsicError(DAG, DL, VT);
9294
9295 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                     SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9297 false);
9298 case Intrinsic::r600_read_ngroups_z:
9299 if (Subtarget->isAmdHsaOS())
9300 return emitNonHSAIntrinsicError(DAG, DL, VT);
9301
9302 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                     SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9304 false);
9305 case Intrinsic::r600_read_local_size_x:
9306 if (Subtarget->isAmdHsaOS())
9307 return emitNonHSAIntrinsicError(DAG, DL, VT);
9308
9309 return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
9311 case Intrinsic::r600_read_local_size_y:
9312 if (Subtarget->isAmdHsaOS())
9313 return emitNonHSAIntrinsicError(DAG, DL, VT);
9314
9315 return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
9317 case Intrinsic::r600_read_local_size_z:
9318 if (Subtarget->isAmdHsaOS())
9319 return emitNonHSAIntrinsicError(DAG, DL, VT);
9320
9321 return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
9323 case Intrinsic::amdgcn_workgroup_id_x:
9324 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9326 case Intrinsic::amdgcn_workgroup_id_y:
9327 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9329 case Intrinsic::amdgcn_workgroup_id_z:
9330 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9332 case Intrinsic::amdgcn_wave_id:
9333 return lowerWaveID(DAG, Op);
9334 case Intrinsic::amdgcn_lds_kernel_id: {
9335 if (MFI->isEntryFunction())
9336 return getLDSKernelId(DAG, DL);
9337 return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9339 }
9340 case Intrinsic::amdgcn_workitem_id_x:
9341 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9342 case Intrinsic::amdgcn_workitem_id_y:
9343 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9344 case Intrinsic::amdgcn_workitem_id_z:
9345 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9346 case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9348 SDLoc(Op), MVT::i32);
9349 case Intrinsic::amdgcn_s_buffer_load: {
9350 unsigned CPol = Op.getConstantOperandVal(3);
9351 // s_buffer_load, because of how it's optimized, can't be volatile
9352 // so reject ones with the volatile bit set.
9353 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9356 return Op;
9357 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9358 Op.getOperand(3), DAG);
9359 }
9360 case Intrinsic::amdgcn_fdiv_fast:
9361 return lowerFDIV_FAST(Op, DAG);
9362 case Intrinsic::amdgcn_sin:
9363 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9364
9365 case Intrinsic::amdgcn_cos:
9366 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9367
9368 case Intrinsic::amdgcn_mul_u24:
9369 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9370 Op.getOperand(2));
9371 case Intrinsic::amdgcn_mul_i24:
9372 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9373 Op.getOperand(2));
9374
9375 case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9377 return SDValue();
9378
9379 return emitRemovedIntrinsicError(DAG, DL, VT);
9380 }
9381 case Intrinsic::amdgcn_fract:
9382 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9383
9384 case Intrinsic::amdgcn_class:
9385 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9386 Op.getOperand(2));
9387 case Intrinsic::amdgcn_div_fmas:
9388 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9389 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9390
9391 case Intrinsic::amdgcn_div_fixup:
9392 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9393 Op.getOperand(2), Op.getOperand(3));
9394
9395 case Intrinsic::amdgcn_div_scale: {
9396 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9397
9398 // Translate to the operands expected by the machine instruction. The
9399 // first parameter must be the same as the first instruction.
9400 SDValue Numerator = Op.getOperand(1);
9401 SDValue Denominator = Op.getOperand(2);
9402
9403 // Note this order is opposite of the machine instruction's operations,
9404 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9405 // intrinsic has the numerator as the first operand to match a normal
9406 // division operation.
9407
9408 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9409
9410 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9411 Denominator, Numerator);
9412 }
9413 case Intrinsic::amdgcn_icmp: {
9414 // There is a Pat that handles this variant, so return it as-is.
9415 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9416 Op.getConstantOperandVal(2) == 0 &&
9417 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9418 return Op;
9419 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9420 }
9421 case Intrinsic::amdgcn_fcmp: {
9422 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9423 }
9424 case Intrinsic::amdgcn_ballot:
9425 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9426 case Intrinsic::amdgcn_fmed3:
9427 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9428 Op.getOperand(2), Op.getOperand(3));
9429 case Intrinsic::amdgcn_fdot2:
9430 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9431 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9432 case Intrinsic::amdgcn_fmul_legacy:
9433 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9434 Op.getOperand(2));
9435 case Intrinsic::amdgcn_sffbh:
9436 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9437 case Intrinsic::amdgcn_sbfe:
9438 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9439 Op.getOperand(2), Op.getOperand(3));
9440 case Intrinsic::amdgcn_ubfe:
9441 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9442 Op.getOperand(2), Op.getOperand(3));
9443 case Intrinsic::amdgcn_cvt_pkrtz:
9444 case Intrinsic::amdgcn_cvt_pknorm_i16:
9445 case Intrinsic::amdgcn_cvt_pknorm_u16:
9446 case Intrinsic::amdgcn_cvt_pk_i16:
9447 case Intrinsic::amdgcn_cvt_pk_u16: {
9448 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9449 EVT VT = Op.getValueType();
9450 unsigned Opcode;
9451
9452     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9454     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9456     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9458     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
9460     else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;
9462
9463 if (isTypeLegal(VT))
9464 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9465
9466 SDValue Node =
9467 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9468 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9469 }
9470 case Intrinsic::amdgcn_fmad_ftz:
9471 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9472 Op.getOperand(2), Op.getOperand(3));
9473
9474 case Intrinsic::amdgcn_if_break:
9475 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9476 Op->getOperand(1), Op->getOperand(2)),
9477 0);
9478
9479 case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9481 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9482 return Op;
9483
9484 const Module *M = MF.getFunction().getParent();
9485 const GlobalValue *GV =
9486 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9487 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                             SIInstrInfo::MO_ABS32_LO);
9489 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9490 }
9491 case Intrinsic::amdgcn_is_shared:
9492 case Intrinsic::amdgcn_is_private: {
9493 SDLoc SL(Op);
9494 SDValue SrcVec =
9495 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9496 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9497 DAG.getConstant(1, SL, MVT::i32));
9498
9499 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9502 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9503 Subtarget->hasGloballyAddressableScratch()) {
9504 SDValue FlatScratchBaseHi(
9505 DAG.getMachineNode(
9506 AMDGPU::S_MOV_B32, DL, MVT::i32,
9507 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9508 0);
9509 // Test bits 63..58 against the aperture address.
9510 return DAG.getSetCC(
9511 SL, MVT::i1,
9512 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9513 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9514 }
9515
9516 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9517 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9518 }
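  // Note: both queries inspect only the high 32 bits of the flat pointer. They
  // are normally compared for equality against the LDS or private aperture
  // base; with globally addressable scratch, the XOR/SETULT test above instead
  // checks that bits 63..58 match SRC_FLAT_SCRATCH_BASE_HI.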
9519 case Intrinsic::amdgcn_perm:
9520 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9521 Op.getOperand(2), Op.getOperand(3));
9522 case Intrinsic::amdgcn_reloc_constant: {
9523 Module *M = MF.getFunction().getParent();
9524 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9525 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9526 auto *RelocSymbol = cast<GlobalVariable>(
9527 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
9528 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
                                             SIInstrInfo::MO_ABS32_LO);
9530 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9531 }
9532 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9533 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9534 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9535 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9536 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9537 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9538 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9539 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9540 if (Op.getOperand(4).getValueType() == MVT::i32)
9541 return SDValue();
9542
9543 SDLoc SL(Op);
9544 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9545 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9546 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9547 Op.getOperand(3), IndexKeyi32);
9548 }
9549 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9550 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9551 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9552 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9553 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9554 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9555 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9556 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9557 if (Op.getOperand(4).getValueType() == MVT::i64)
9558 return SDValue();
9559
9560 SDLoc SL(Op);
9561 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
9562 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9563 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9564 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9565 Op.getOperand(6)});
9566 }
9567 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9568 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9569 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9570 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9571 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9572 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9573 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9574 ? MVT::i64
9575 : MVT::i32;
9576 if (Op.getOperand(6).getValueType() == IndexKeyTy)
9577 return SDValue();
9578
9579 SDLoc SL(Op);
9580 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
9581 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9582 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9583 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9584 IndexKey, Op.getOperand(7),
9585 Op.getOperand(8)}); // No clamp operand
9586 }
9587 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9588 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9589 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9590 if (Op.getOperand(6).getValueType() == MVT::i32)
9591 return SDValue();
9592
9593 SDLoc SL(Op);
9594 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9595 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9596 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9597 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9598 IndexKeyi32, Op.getOperand(7)});
9599 }
9600 case Intrinsic::amdgcn_addrspacecast_nonnull:
9601 return lowerADDRSPACECAST(Op, DAG);
9602 case Intrinsic::amdgcn_readlane:
9603 case Intrinsic::amdgcn_readfirstlane:
9604 case Intrinsic::amdgcn_writelane:
9605 case Intrinsic::amdgcn_permlane16:
9606 case Intrinsic::amdgcn_permlanex16:
9607 case Intrinsic::amdgcn_permlane64:
9608 case Intrinsic::amdgcn_set_inactive:
9609 case Intrinsic::amdgcn_set_inactive_chain_arg:
9610 case Intrinsic::amdgcn_mov_dpp8:
9611 case Intrinsic::amdgcn_update_dpp:
9612 return lowerLaneOp(*this, Op.getNode(), DAG);
9613 case Intrinsic::amdgcn_dead: {
    SmallVector<SDValue, 8> Poisons;
9615 for (const EVT ValTy : Op.getNode()->values())
9616 Poisons.push_back(DAG.getPOISON(ValTy));
9617 return DAG.getMergeValues(Poisons, SDLoc(Op));
9618 }
9619 default:
9620 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9622 return lowerImage(Op, ImageDimIntr, DAG, false);
9623
9624 return Op;
9625 }
9626}
9627
9628// On targets not supporting constant in soffset field, turn zero to
9629// SGPR_NULL to avoid generating an extra s_mov with zero.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9631 const GCNSubtarget *Subtarget) {
9632 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9633 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9634 return SOffset;
9635}
9636
9637SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9638 SelectionDAG &DAG,
9639 unsigned NewOpcode) const {
9640 SDLoc DL(Op);
9641
9642 SDValue VData = Op.getOperand(2);
9643 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9644 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9645 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9646 SDValue Ops[] = {
9647 Op.getOperand(0), // Chain
9648 VData, // vdata
9649 Rsrc, // rsrc
9650 DAG.getConstant(0, DL, MVT::i32), // vindex
9651 VOffset, // voffset
9652 SOffset, // soffset
9653 Offset, // offset
9654 Op.getOperand(6), // cachepolicy
9655 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9656 };
9657
9658 auto *M = cast<MemSDNode>(Op);
9659
9660 EVT MemVT = VData.getValueType();
9661 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9662 M->getMemOperand());
9663}
9664
9665SDValue
9666SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9667 unsigned NewOpcode) const {
9668 SDLoc DL(Op);
9669
9670 SDValue VData = Op.getOperand(2);
9671 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9672 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9673 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9674 SDValue Ops[] = {
9675 Op.getOperand(0), // Chain
9676 VData, // vdata
9677 Rsrc, // rsrc
9678 Op.getOperand(4), // vindex
9679 VOffset, // voffset
9680 SOffset, // soffset
9681 Offset, // offset
9682 Op.getOperand(7), // cachepolicy
9683 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9684 };
9685
9686 auto *M = cast<MemSDNode>(Op);
9687
9688 EVT MemVT = VData.getValueType();
9689 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9690 M->getMemOperand());
9691}
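// Note: the raw and struct buffer atomic helpers build the same operand list;
// they differ only in the vindex operand (constant 0 vs. the intrinsic's
// vindex argument) and the idxen immediate (0 vs. 1), plus the shifted
// operand positions that follow from the extra argument.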
9692
9693SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9694 SelectionDAG &DAG) const {
9695 unsigned IntrID = Op.getConstantOperandVal(1);
9696 SDLoc DL(Op);
9697
9698 switch (IntrID) {
9699 case Intrinsic::amdgcn_ds_ordered_add:
9700 case Intrinsic::amdgcn_ds_ordered_swap: {
9701 MemSDNode *M = cast<MemSDNode>(Op);
9702 SDValue Chain = M->getOperand(0);
9703 SDValue M0 = M->getOperand(2);
9704 SDValue Value = M->getOperand(3);
9705 unsigned IndexOperand = M->getConstantOperandVal(7);
9706 unsigned WaveRelease = M->getConstantOperandVal(8);
9707 unsigned WaveDone = M->getConstantOperandVal(9);
9708
9709 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9710 IndexOperand &= ~0x3f;
9711 unsigned CountDw = 0;
9712
9713 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9714 CountDw = (IndexOperand >> 24) & 0xf;
9715 IndexOperand &= ~(0xf << 24);
9716
9717 if (CountDw < 1 || CountDw > 4) {
9718 const Function &Fn = DAG.getMachineFunction().getFunction();
        DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9720 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9721 DL.getDebugLoc()));
9722 CountDw = 1;
9723 }
9724 }
9725
9726 if (IndexOperand) {
9727 const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9729 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9730 }
9731
9732 if (WaveDone && !WaveRelease) {
9733 // TODO: Move this to IR verifier
9734 const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9736 Fn, "ds_ordered_count: wave_done requires wave_release",
9737 DL.getDebugLoc()));
9738 }
9739
9740 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9741 unsigned ShaderType =
        SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9743 unsigned Offset0 = OrderedCountIndex << 2;
9744 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9745
9746 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9747 Offset1 |= (CountDw - 1) << 6;
9748
9749 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9750 Offset1 |= ShaderType << 2;
9751
9752 unsigned Offset = Offset0 | (Offset1 << 8);
9753
9754 SDValue Ops[] = {
9755 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9756 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9757 };
    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9759 M->getVTList(), Ops, M->getMemoryVT(),
9760 M->getMemOperand());
9761 }
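  // Worked example: the ds_ordered_count immediate packs Offset0 = index * 4
  // into bits [7:0] and Offset1 into bits [15:8], where Offset1 holds
  // wave_release (bit 0), wave_done (bit 1), the shader type (bits 3:2,
  // pre-gfx11 only), the add/swap selector (bit 4) and, on gfx10+, the dword
  // count minus 1 (bits 7:6).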
9762 case Intrinsic::amdgcn_raw_buffer_load:
9763 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9764 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9765 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9766 case Intrinsic::amdgcn_raw_buffer_load_format:
9767 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9768 const bool IsFormat =
9769 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9770 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9771
9772 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9773 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9774 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9775 SDValue Ops[] = {
9776 Op.getOperand(0), // Chain
9777 Rsrc, // rsrc
9778 DAG.getConstant(0, DL, MVT::i32), // vindex
9779 VOffset, // voffset
9780 SOffset, // soffset
9781 Offset, // offset
9782 Op.getOperand(5), // cachepolicy, swizzled buffer
9783 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9784 };
9785
9786 auto *M = cast<MemSDNode>(Op);
9787 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9788 }
9789 case Intrinsic::amdgcn_struct_buffer_load:
9790 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9791 case Intrinsic::amdgcn_struct_buffer_load_format:
9792 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9793 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9794 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9795 const bool IsFormat =
9796 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9797 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9798
9799 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9800 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9801 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9802 SDValue Ops[] = {
9803 Op.getOperand(0), // Chain
9804 Rsrc, // rsrc
9805 Op.getOperand(3), // vindex
9806 VOffset, // voffset
9807 SOffset, // soffset
9808 Offset, // offset
9809 Op.getOperand(6), // cachepolicy, swizzled buffer
9810 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9811 };
9812
9813 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9814 }
9815 case Intrinsic::amdgcn_raw_tbuffer_load:
9816 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9817 MemSDNode *M = cast<MemSDNode>(Op);
9818 EVT LoadVT = Op.getValueType();
9819 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9820 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9821 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9822
9823 SDValue Ops[] = {
9824 Op.getOperand(0), // Chain
9825 Rsrc, // rsrc
9826 DAG.getConstant(0, DL, MVT::i32), // vindex
9827 VOffset, // voffset
9828 SOffset, // soffset
9829 Offset, // offset
9830 Op.getOperand(5), // format
9831 Op.getOperand(6), // cachepolicy, swizzled buffer
9832 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9833 };
9834
9835 if (LoadVT.getScalarType() == MVT::f16)
9836 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9837 Ops);
9838 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9839 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9840 DAG);
9841 }
9842 case Intrinsic::amdgcn_struct_tbuffer_load:
9843 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9844 MemSDNode *M = cast<MemSDNode>(Op);
9845 EVT LoadVT = Op.getValueType();
9846 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9847 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9848 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9849
9850 SDValue Ops[] = {
9851 Op.getOperand(0), // Chain
9852 Rsrc, // rsrc
9853 Op.getOperand(3), // vindex
9854 VOffset, // voffset
9855 SOffset, // soffset
9856 Offset, // offset
9857 Op.getOperand(6), // format
9858 Op.getOperand(7), // cachepolicy, swizzled buffer
9859 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9860 };
9861
9862 if (LoadVT.getScalarType() == MVT::f16)
9863 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9864 Ops);
9865 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9866 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9867 DAG);
9868 }
9869 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9870 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9871 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9872 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9873 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9874 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_FADD);
9876 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9878 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9879 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9880 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9881 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_FMIN);
9883 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9885 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9886 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9887 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9888 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_FMAX);
9890 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9892 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9893 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9895 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9896 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9898 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9899 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9901 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9902 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9904 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9905 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9907 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9908 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9910 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9911 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9913 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9914 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9916 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9917 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9919 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9920 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9922 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9923 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9925 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9926 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9927 return lowerRawBufferAtomicIntrin(Op, DAG,
                                       AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9929 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9930 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9931 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
9933 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9935 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9936 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9937 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9938 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9939 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9941 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_SMIN);
9943 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9945 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_UMIN);
9947 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9949 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_SMAX);
9951 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9953 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_UMAX);
9955 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9957 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9958 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9960 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9961 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9963 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9964 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9965 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9966 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9967 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9969 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9970 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9971 return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9973
9974 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9975 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9976 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9977 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9978 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9979 SDValue Ops[] = {
9980 Op.getOperand(0), // Chain
9981 Op.getOperand(2), // src
9982 Op.getOperand(3), // cmp
9983 Rsrc, // rsrc
9984 DAG.getConstant(0, DL, MVT::i32), // vindex
9985 VOffset, // voffset
9986 SOffset, // soffset
9987 Offset, // offset
9988 Op.getOperand(7), // cachepolicy
9989 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9990 };
9991 EVT VT = Op.getValueType();
9992 auto *M = cast<MemSDNode>(Op);
9993
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9995 Op->getVTList(), Ops, VT,
9996 M->getMemOperand());
9997 }
9998 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9999 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10000 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10001 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10002 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10003 SDValue Ops[] = {
10004 Op.getOperand(0), // Chain
10005 Op.getOperand(2), // src
10006 Op.getOperand(3), // cmp
10007 Rsrc, // rsrc
10008 Op.getOperand(5), // vindex
10009 VOffset, // voffset
10010 SOffset, // soffset
10011 Offset, // offset
10012 Op.getOperand(8), // cachepolicy
10013 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10014 };
10015 EVT VT = Op.getValueType();
10016 auto *M = cast<MemSDNode>(Op);
10017
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10019 Op->getVTList(), Ops, VT,
10020 M->getMemOperand());
10021 }
10022 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10023 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10024 MemSDNode *M = cast<MemSDNode>(Op);
10025 SDValue NodePtr = M->getOperand(2);
10026 SDValue RayExtent = M->getOperand(3);
10027 SDValue InstanceMask = M->getOperand(4);
10028 SDValue RayOrigin = M->getOperand(5);
10029 SDValue RayDir = M->getOperand(6);
10030 SDValue Offsets = M->getOperand(7);
10031 SDValue TDescr = M->getOperand(8);
10032
10033 assert(NodePtr.getValueType() == MVT::i64);
10034 assert(RayDir.getValueType() == MVT::v3f32);
10035
10036 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10037 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10038 return SDValue();
10039 }
10040
10041 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10042 const unsigned NumVDataDwords = 10;
10043 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10044 int Opcode = AMDGPU::getMIMGOpcode(
10045 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10046 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10047 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10048 assert(Opcode != -1);
10049
    SmallVector<SDValue, 7> Ops;
10051 Ops.push_back(NodePtr);
10052 Ops.push_back(DAG.getBuildVector(
10053 MVT::v2i32, DL,
10054 {DAG.getBitcast(MVT::i32, RayExtent),
10055 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10056 Ops.push_back(RayOrigin);
10057 Ops.push_back(RayDir);
10058 Ops.push_back(Offsets);
10059 Ops.push_back(TDescr);
10060 Ops.push_back(M->getChain());
10061
10062 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10063 MachineMemOperand *MemRef = M->getMemOperand();
10064 DAG.setNodeMemRefs(NewNode, {MemRef});
10065 return SDValue(NewNode, 0);
10066 }
10067 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10068 MemSDNode *M = cast<MemSDNode>(Op);
10069 SDValue NodePtr = M->getOperand(2);
10070 SDValue RayExtent = M->getOperand(3);
10071 SDValue RayOrigin = M->getOperand(4);
10072 SDValue RayDir = M->getOperand(5);
10073 SDValue RayInvDir = M->getOperand(6);
10074 SDValue TDescr = M->getOperand(7);
10075
10076 assert(NodePtr.getValueType() == MVT::i32 ||
10077 NodePtr.getValueType() == MVT::i64);
10078 assert(RayDir.getValueType() == MVT::v3f16 ||
10079 RayDir.getValueType() == MVT::v3f32);
10080
10081 if (!Subtarget->hasGFX10_AEncoding()) {
10082 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10083 return SDValue();
10084 }
10085
10086 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10087 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10088 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10089 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10090 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10091 const unsigned NumVDataDwords = 4;
10092 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10093 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10094 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10095 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10096 IsGFX12Plus;
10097 const unsigned BaseOpcodes[2][2] = {
10098 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10099 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10100 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10101 int Opcode;
10102 if (UseNSA) {
10103 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10104 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10105 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10106 : AMDGPU::MIMGEncGfx10NSA,
10107 NumVDataDwords, NumVAddrDwords);
10108 } else {
10109 assert(!IsGFX12Plus);
10110 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10111 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10112 : AMDGPU::MIMGEncGfx10Default,
10113 NumVDataDwords, NumVAddrDwords);
10114 }
10115 assert(Opcode != -1);
10116
10117    SmallVector<SDValue, 16> Ops;
10118
10119 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10120      SmallVector<SDValue, 3> Lanes;
10121      DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10122 if (Lanes[0].getValueSizeInBits() == 32) {
10123 for (unsigned I = 0; I < 3; ++I)
10124 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10125 } else {
10126 if (IsAligned) {
10127 Ops.push_back(DAG.getBitcast(
10128 MVT::i32,
10129 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10130 Ops.push_back(Lanes[2]);
10131 } else {
10132 SDValue Elt0 = Ops.pop_back_val();
10133 Ops.push_back(DAG.getBitcast(
10134 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10135 Ops.push_back(DAG.getBitcast(
10136 MVT::i32,
10137 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10138 }
10139 }
10140 };
10141
10142 if (UseNSA && IsGFX11Plus) {
10143 Ops.push_back(NodePtr);
10144 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10145 Ops.push_back(RayOrigin);
10146 if (IsA16) {
10147 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10148 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10149 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10150 for (unsigned I = 0; I < 3; ++I) {
10151 MergedLanes.push_back(DAG.getBitcast(
10152 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10153 {DirLanes[I], InvDirLanes[I]})));
10154 }
10155 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10156 } else {
10157 Ops.push_back(RayDir);
10158 Ops.push_back(RayInvDir);
10159 }
10160 } else {
10161 if (Is64)
10162 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10163 2);
10164 else
10165 Ops.push_back(NodePtr);
10166
10167 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10168 packLanes(RayOrigin, true);
10169 packLanes(RayDir, true);
10170 packLanes(RayInvDir, false);
10171 }
10172
10173 if (!UseNSA) {
10174 // Build a single vector containing all the operands so far prepared.
10175 if (NumVAddrDwords > 12) {
10176 SDValue Undef = DAG.getPOISON(MVT::i32);
10177 Ops.append(16 - Ops.size(), Undef);
10178 }
10179 assert(Ops.size() >= 8 && Ops.size() <= 12);
10180 SDValue MergedOps =
10181 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10182 Ops.clear();
10183 Ops.push_back(MergedOps);
10184 }
10185
10186 Ops.push_back(TDescr);
10187 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10188 Ops.push_back(M->getChain());
10189
10190 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10191 MachineMemOperand *MemRef = M->getMemOperand();
10192 DAG.setNodeMemRefs(NewNode, {MemRef});
10193 return SDValue(NewNode, 0);
10194 }
10195 case Intrinsic::amdgcn_global_atomic_fmin_num:
10196 case Intrinsic::amdgcn_global_atomic_fmax_num:
10197 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10198 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10199 MemSDNode *M = cast<MemSDNode>(Op);
10200 SDValue Ops[] = {
10201 M->getOperand(0), // Chain
10202 M->getOperand(2), // Ptr
10203 M->getOperand(3) // Value
10204 };
10205 unsigned Opcode = 0;
10206 switch (IntrID) {
10207 case Intrinsic::amdgcn_global_atomic_fmin_num:
10208 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10209 Opcode = ISD::ATOMIC_LOAD_FMIN;
10210 break;
10211 }
10212 case Intrinsic::amdgcn_global_atomic_fmax_num:
10213 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10214 Opcode = ISD::ATOMIC_LOAD_FMAX;
10215 break;
10216 }
10217 default:
10218 llvm_unreachable("unhandled atomic opcode");
10219 }
10220 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10221 Ops, M->getMemOperand());
10222 }
10223 case Intrinsic::amdgcn_s_get_barrier_state:
10224 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10225 SDValue Chain = Op->getOperand(0);
10226    SmallVector<SDValue, 2> Ops;
10227    unsigned Opc;
10228
10229 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10230 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10231 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10232 BarID = (BarID >> 4) & 0x3F;
10233 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10234 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10235 Ops.push_back(K);
10236 Ops.push_back(Chain);
10237 } else {
10238 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10239 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10240 SDValue M0Val;
10241 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10242 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10243 M0Val = SDValue(
10244 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10245 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10246 0);
10247 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10248 } else
10249 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10250 }
10251
10252 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10253 return SDValue(NewMI, 0);
10254 }
10255 default:
10256
10257 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10258            AMDGPU::getImageDimIntrinsicInfo(IntrID))
10259      return lowerImage(Op, ImageDimIntr, DAG, true);
10260
10261 return SDValue();
10262 }
10263}
10264
10265// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10266// dwordx4 if on SI and handle TFE loads.
10267SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10268 SDVTList VTList,
10269 ArrayRef<SDValue> Ops, EVT MemVT,
10270 MachineMemOperand *MMO,
10271 SelectionDAG &DAG) const {
10272 LLVMContext &C = *DAG.getContext();
10273  MachineFunction &MF = DAG.getMachineFunction();
10274  EVT VT = VTList.VTs[0];
10275
10276 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10277 bool IsTFE = VTList.NumVTs == 3;
10278 if (IsTFE) {
10279 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10280 unsigned NumOpDWords = NumValueDWords + 1;
10281 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10282 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10283 MachineMemOperand *OpDWordsMMO =
10284 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10285 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10286 OpDWordsVT, OpDWordsMMO, DAG);
10287    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10288                                 DAG.getVectorIdxConstant(NumValueDWords, DL));
10289 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10290 SDValue ValueDWords =
10291 NumValueDWords == 1
10292 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10293            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10294                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10295 ZeroIdx);
10296 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10297 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10298 }
10299
10300 if (!Subtarget->hasDwordx3LoadStores() &&
10301 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10302 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10303 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10304 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10305 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10306 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10307 WidenedMemVT, WidenedMMO);
10308    SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10309                                DAG.getVectorIdxConstant(0, DL));
10310 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10311 }
10312
10313 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10314}
10315
10316SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10317 bool ImageStore) const {
10318 EVT StoreVT = VData.getValueType();
10319
10320 // No change for f16 and legal vector D16 types.
10321 if (!StoreVT.isVector())
10322 return VData;
10323
10324 SDLoc DL(VData);
10325 unsigned NumElements = StoreVT.getVectorNumElements();
10326
10327 if (Subtarget->hasUnpackedD16VMem()) {
10328 // We need to unpack the packed data to store.
10329 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10330 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10331
10332 EVT EquivStoreVT =
10333 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10334 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10335 return DAG.UnrollVectorOp(ZExt.getNode());
10336 }
10337
10338 // The sq block of gfx8.1 does not estimate register use correctly for d16
10339 // image store instructions. The data operand is computed as if it were not a
10340 // d16 image instruction.
10341 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10342 // Bitcast to i16
10343 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10344 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10345
10346 // Decompose into scalars
10347    SmallVector<SDValue, 4> Elts;
10348    DAG.ExtractVectorElements(IntVData, Elts);
10349
10350 // Group pairs of i16 into v2i16 and bitcast to i32
10351 SmallVector<SDValue, 4> PackedElts;
10352 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10353 SDValue Pair =
10354 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10355 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10356 PackedElts.push_back(IntPair);
10357 }
10358 if ((NumElements % 2) == 1) {
10359 // Handle v3i16
10360 unsigned I = Elts.size() / 2;
10361 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10362 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10363 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10364 PackedElts.push_back(IntPair);
10365 }
10366
10367 // Pad using UNDEF
10368 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10369
10370 // Build final vector
10371 EVT VecVT =
10372 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10373 return DAG.getBuildVector(VecVT, DL, PackedElts);
10374 }
10375
10376 if (NumElements == 3) {
10377 EVT IntStoreVT =
10378        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10379    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10380
10381 EVT WidenedStoreVT = EVT::getVectorVT(
10382 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10383 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10384 WidenedStoreVT.getStoreSizeInBits());
10385 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10386 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10387 }
10388
10389 assert(isTypeLegal(StoreVT));
10390 return VData;
10391}
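For reference, a minimal standalone sketch (not part of this lowering) of the pair-and-bitcast layout that the hasImageStoreD16Bug() workaround above builds: each pair of 16-bit lanes lands in one i32 dword, low lane in the low half.

#include <cstdint>

// Illustrative only: pack four 16-bit lanes into two dwords the same way the
// v2i16 build_vector + i32 bitcast sequence above does (lane 0 in the low
// half of dword 0). On subtargets with unpacked D16 VMEM, each lane would
// instead be zero-extended into its own dword.
static void packD16x4(const uint16_t Lanes[4], uint32_t Out[2]) {
  Out[0] = uint32_t(Lanes[0]) | (uint32_t(Lanes[1]) << 16);
  Out[1] = uint32_t(Lanes[2]) | (uint32_t(Lanes[3]) << 16);
}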
10392
10393SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10394 SelectionDAG &DAG) const {
10395 SDLoc DL(Op);
10396 SDValue Chain = Op.getOperand(0);
10397 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10398  MachineFunction &MF = DAG.getMachineFunction();
10399
10400 switch (IntrinsicID) {
10401 case Intrinsic::amdgcn_exp_compr: {
10402 if (!Subtarget->hasCompressedExport()) {
10403      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10404          DAG.getMachineFunction().getFunction(),
10405          "intrinsic not supported on subtarget", DL.getDebugLoc()));
10406 }
10407 SDValue Src0 = Op.getOperand(4);
10408 SDValue Src1 = Op.getOperand(5);
10409 // Hack around illegal type on SI by directly selecting it.
10410 if (isTypeLegal(Src0.getValueType()))
10411 return SDValue();
10412
10413 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10414 SDValue Undef = DAG.getPOISON(MVT::f32);
10415 const SDValue Ops[] = {
10416 Op.getOperand(2), // tgt
10417 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10418 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10419 Undef, // src2
10420 Undef, // src3
10421 Op.getOperand(7), // vm
10422 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10423 Op.getOperand(3), // en
10424 Op.getOperand(0) // Chain
10425 };
10426
10427 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10428 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10429 }
10430
10431 case Intrinsic::amdgcn_struct_tbuffer_store:
10432 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10433 SDValue VData = Op.getOperand(2);
10434 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10435 if (IsD16)
10436 VData = handleD16VData(VData, DAG);
10437 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10438 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10439 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10440 SDValue Ops[] = {
10441 Chain,
10442 VData, // vdata
10443 Rsrc, // rsrc
10444 Op.getOperand(4), // vindex
10445 VOffset, // voffset
10446 SOffset, // soffset
10447 Offset, // offset
10448 Op.getOperand(7), // format
10449 Op.getOperand(8), // cachepolicy, swizzled buffer
10450 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10451 };
10452 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10453                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
10454    MemSDNode *M = cast<MemSDNode>(Op);
10455 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10456 M->getMemoryVT(), M->getMemOperand());
10457 }
10458
10459 case Intrinsic::amdgcn_raw_tbuffer_store:
10460 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10461 SDValue VData = Op.getOperand(2);
10462 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10463 if (IsD16)
10464 VData = handleD16VData(VData, DAG);
10465 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10466 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10467 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10468 SDValue Ops[] = {
10469 Chain,
10470 VData, // vdata
10471 Rsrc, // rsrc
10472 DAG.getConstant(0, DL, MVT::i32), // vindex
10473 VOffset, // voffset
10474 SOffset, // soffset
10475 Offset, // offset
10476 Op.getOperand(6), // format
10477 Op.getOperand(7), // cachepolicy, swizzled buffer
10478 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10479 };
10480 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10481                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
10482    MemSDNode *M = cast<MemSDNode>(Op);
10483 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10484 M->getMemoryVT(), M->getMemOperand());
10485 }
10486
10487 case Intrinsic::amdgcn_raw_buffer_store:
10488 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10489 case Intrinsic::amdgcn_raw_buffer_store_format:
10490 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10491 const bool IsFormat =
10492 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10493 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10494
10495 SDValue VData = Op.getOperand(2);
10496 EVT VDataVT = VData.getValueType();
10497 EVT EltType = VDataVT.getScalarType();
10498 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10499 if (IsD16) {
10500 VData = handleD16VData(VData, DAG);
10501 VDataVT = VData.getValueType();
10502 }
10503
10504 if (!isTypeLegal(VDataVT)) {
10505 VData =
10506 DAG.getNode(ISD::BITCAST, DL,
10507 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10508 }
10509
10510 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10511 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10512 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10513 SDValue Ops[] = {
10514 Chain,
10515 VData,
10516 Rsrc,
10517 DAG.getConstant(0, DL, MVT::i32), // vindex
10518 VOffset, // voffset
10519 SOffset, // soffset
10520 Offset, // offset
10521 Op.getOperand(6), // cachepolicy, swizzled buffer
10522 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10523 };
10524 unsigned Opc =
10525        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10526    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10527    MemSDNode *M = cast<MemSDNode>(Op);
10528
10529 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10530 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10531 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10532
10533 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10534 M->getMemoryVT(), M->getMemOperand());
10535 }
10536
10537 case Intrinsic::amdgcn_struct_buffer_store:
10538 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10539 case Intrinsic::amdgcn_struct_buffer_store_format:
10540 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10541 const bool IsFormat =
10542 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10543 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10544
10545 SDValue VData = Op.getOperand(2);
10546 EVT VDataVT = VData.getValueType();
10547 EVT EltType = VDataVT.getScalarType();
10548 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10549
10550 if (IsD16) {
10551 VData = handleD16VData(VData, DAG);
10552 VDataVT = VData.getValueType();
10553 }
10554
10555 if (!isTypeLegal(VDataVT)) {
10556 VData =
10557 DAG.getNode(ISD::BITCAST, DL,
10558 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10559 }
10560
10561 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10562 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10563 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10564 SDValue Ops[] = {
10565 Chain,
10566 VData,
10567 Rsrc,
10568 Op.getOperand(4), // vindex
10569 VOffset, // voffset
10570 SOffset, // soffset
10571 Offset, // offset
10572 Op.getOperand(7), // cachepolicy, swizzled buffer
10573 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10574 };
10575 unsigned Opc =
10576        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10577    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10578    MemSDNode *M = cast<MemSDNode>(Op);
10579
10580 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10581 EVT VDataType = VData.getValueType().getScalarType();
10582 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10583 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10584
10585 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10586 M->getMemoryVT(), M->getMemOperand());
10587 }
10588 case Intrinsic::amdgcn_raw_buffer_load_lds:
10589 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10590 case Intrinsic::amdgcn_struct_buffer_load_lds:
10591 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10592 if (!Subtarget->hasVMemToLDSLoad())
10593 return SDValue();
10594 unsigned Opc;
10595 bool HasVIndex =
10596 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10597 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10598 unsigned OpOffset = HasVIndex ? 1 : 0;
10599 SDValue VOffset = Op.getOperand(5 + OpOffset);
10600 bool HasVOffset = !isNullConstant(VOffset);
10601 unsigned Size = Op->getConstantOperandVal(4);
10602
10603 switch (Size) {
10604 default:
10605 return SDValue();
10606 case 1:
10607 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10608 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10609 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10610 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10611 break;
10612 case 2:
10613 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10614 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10615 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10616 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10617 break;
10618 case 4:
10619 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10620 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10621 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10622 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10623 break;
10624 case 12:
10625 if (!Subtarget->hasLDSLoadB96_B128())
10626 return SDValue();
10627 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10628 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10629 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10630 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10631 break;
10632 case 16:
10633 if (!Subtarget->hasLDSLoadB96_B128())
10634 return SDValue();
10635 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10636 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10637 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10638 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10639 break;
10640 }
10641
10642 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10643
10644    SmallVector<SDValue, 8> Ops;
10645
10646 if (HasVIndex && HasVOffset)
10647 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10648 {Op.getOperand(5), // VIndex
10649 VOffset}));
10650 else if (HasVIndex)
10651 Ops.push_back(Op.getOperand(5));
10652 else if (HasVOffset)
10653 Ops.push_back(VOffset);
10654
10655 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10656 Ops.push_back(Rsrc);
10657 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10658 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10659 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10660 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10661    Ops.push_back(DAG.getTargetConstant(
10662        Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10663 DL, MVT::i8)); // cpol
10664    Ops.push_back(DAG.getTargetConstant(
10665        Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10666 ? 1
10667 : 0,
10668 DL, MVT::i8)); // swz
10669 Ops.push_back(M0Val.getValue(0)); // Chain
10670 Ops.push_back(M0Val.getValue(1)); // Glue
10671
10672 auto *M = cast<MemSDNode>(Op);
10673 MachineMemOperand *LoadMMO = M->getMemOperand();
10674 // Don't set the offset value here because the pointer points to the base of
10675 // the buffer.
10676 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10677
10678 MachinePointerInfo StorePtrI = LoadPtrI;
10679 LoadPtrI.V = PoisonValue::get(
10680        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10681    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10682    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10683
10684 auto F = LoadMMO->getFlags() &
10685             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10686    LoadMMO =
10687        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10688                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10689
10690    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10691        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10692 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10693
10694 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10695 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10696
10697 return SDValue(Load, 0);
10698 }
10699 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10700 // for "trust me" that the remaining cases are global pointers until
10701 // such time as we can put two mem operands on an intrinsic.
10702 case Intrinsic::amdgcn_load_to_lds:
10703 case Intrinsic::amdgcn_global_load_lds: {
10704 if (!Subtarget->hasVMemToLDSLoad())
10705 return SDValue();
10706
10707 unsigned Opc;
10708 unsigned Size = Op->getConstantOperandVal(4);
10709 switch (Size) {
10710 default:
10711 return SDValue();
10712 case 1:
10713 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10714 break;
10715 case 2:
10716 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10717 break;
10718 case 4:
10719 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10720 break;
10721 case 12:
10722 if (!Subtarget->hasLDSLoadB96_B128())
10723 return SDValue();
10724 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10725 break;
10726 case 16:
10727 if (!Subtarget->hasLDSLoadB96_B128())
10728 return SDValue();
10729 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10730 break;
10731 }
10732
10733 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10734
10735    SmallVector<SDValue, 6> Ops;
10736
10737 SDValue Addr = Op.getOperand(2); // Global ptr
10738 SDValue VOffset;
10739 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10740 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10741 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10742 SDValue LHS = Addr.getOperand(0);
10743 SDValue RHS = Addr.getOperand(1);
10744
10745 if (LHS->isDivergent())
10746 std::swap(LHS, RHS);
10747
10748 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10749 RHS.getOperand(0).getValueType() == MVT::i32) {
10750 // add (i64 sgpr), (zero_extend (i32 vgpr))
10751 Addr = LHS;
10752 VOffset = RHS.getOperand(0);
10753 }
10754 }
10755
10756 Ops.push_back(Addr);
10757 if (!Addr->isDivergent()) {
10758      Opc = AMDGPU::getGlobalSaddrOp(Opc);
10759      if (!VOffset)
10760 VOffset =
10761 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10762 DAG.getTargetConstant(0, DL, MVT::i32)),
10763 0);
10764 Ops.push_back(VOffset);
10765 }
10766
10767 Ops.push_back(Op.getOperand(5)); // Offset
10768 Ops.push_back(Op.getOperand(6)); // CPol
10769 Ops.push_back(M0Val.getValue(0)); // Chain
10770 Ops.push_back(M0Val.getValue(1)); // Glue
10771
10772 auto *M = cast<MemSDNode>(Op);
10773 MachineMemOperand *LoadMMO = M->getMemOperand();
10774 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10775 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10776 MachinePointerInfo StorePtrI = LoadPtrI;
10777 LoadPtrI.V = PoisonValue::get(
10778        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10779    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10780    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10781    auto F = LoadMMO->getFlags() &
10782             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10783    LoadMMO =
10784        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10785                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10786    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10787        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10788 LoadMMO->getAAInfo());
10789
10790 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10791 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10792
10793 return SDValue(Load, 0);
10794 }
10795 case Intrinsic::amdgcn_end_cf:
10796 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10797 Op->getOperand(2), Chain),
10798 0);
10799 case Intrinsic::amdgcn_s_barrier_init:
10800 case Intrinsic::amdgcn_s_barrier_signal_var: {
10801 // these two intrinsics have two operands: barrier pointer and member count
10802 SDValue Chain = Op->getOperand(0);
10803    SmallVector<SDValue, 2> Ops;
10804    SDValue BarOp = Op->getOperand(2);
10805 SDValue CntOp = Op->getOperand(3);
10806 SDValue M0Val;
10807 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10808 ? AMDGPU::S_BARRIER_INIT_M0
10809 : AMDGPU::S_BARRIER_SIGNAL_M0;
10810 // extract the BarrierID from bits 4-9 of BarOp
10811 SDValue BarID;
10812 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10813 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10814 BarID =
10815 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10816 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10817 0);
10818 // Member count should be put into M0[ShAmt:+6]
10819 // Barrier ID should be put into M0[5:0]
10820 M0Val =
10821 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10822 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10823 0);
10824 constexpr unsigned ShAmt = 16;
10825 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10826 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10827
10828 M0Val = SDValue(
10829 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10830
10831 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10832
10833 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10834 return SDValue(NewMI, 0);
10835 }
10836 case Intrinsic::amdgcn_s_barrier_join: {
10837    // This intrinsic has one operand: the barrier pointer.
10838 SDValue Chain = Op->getOperand(0);
10839    SmallVector<SDValue, 2> Ops;
10840    SDValue BarOp = Op->getOperand(2);
10841 unsigned Opc;
10842
10843 if (isa<ConstantSDNode>(BarOp)) {
10844 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10845 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10846
10847 // extract the BarrierID from bits 4-9 of the immediate
10848 unsigned BarID = (BarVal >> 4) & 0x3F;
10849 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10850 Ops.push_back(K);
10851 Ops.push_back(Chain);
10852 } else {
10853 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10854
10855 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10856 SDValue M0Val;
10857 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10858 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10859 M0Val =
10860 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10861 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10862 0);
10863 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10864 }
10865
10866 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10867 return SDValue(NewMI, 0);
10868 }
10869 case Intrinsic::amdgcn_s_prefetch_data: {
10870 // For non-global address space preserve the chain and remove the call.
10871 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10872 return Op.getOperand(0);
10873 return Op;
10874 }
10875 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10876 SDValue Ops[] = {
10877 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10878 Op.getOperand(3), // offset
10879 Op.getOperand(4), // length
10880 };
10881
10882 MemSDNode *M = cast<MemSDNode>(Op);
10883    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10884                                   Op->getVTList(), Ops, M->getMemoryVT(),
10885 M->getMemOperand());
10886 }
10887 default: {
10888 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10889            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10890      return lowerImage(Op, ImageDimIntr, DAG, true);
10891
10892 return Op;
10893 }
10894 }
10895}
10896
10897// Return whether the operation has NoUnsignedWrap property.
10898 static bool isNoUnsignedWrap(SDValue Addr) {
10899   return (Addr.getOpcode() == ISD::ADD &&
10900 Addr->getFlags().hasNoUnsignedWrap()) ||
10901 Addr->getOpcode() == ISD::OR;
10902}
10903
10904 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10905                                               EVT PtrVT) const {
10906 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
10907}
10908
10909// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10910// offset (the offset that is included in bounds checking and swizzling, to be
10911// split between the instruction's voffset and immoffset fields) and soffset
10912// (the offset that is excluded from bounds checking and swizzling, to go in
10913// the instruction's soffset field). This function takes the first kind of
10914// offset and figures out how to split it between voffset and immoffset.
10915std::pair<SDValue, SDValue>
10916SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10917 SDLoc DL(Offset);
10918 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10919 SDValue N0 = Offset;
10920 ConstantSDNode *C1 = nullptr;
10921
10922 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10923 N0 = SDValue();
10924 else if (DAG.isBaseWithConstantOffset(N0)) {
10925 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
10926 // being added, so we can only safely match a 32-bit addition with no
10927 // unsigned overflow.
10928 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
10929 if (!CheckNUW || isNoUnsignedWrap(N0)) {
10930 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10931 N0 = N0.getOperand(0);
10932 }
10933 }
10934
10935 if (C1) {
10936 unsigned ImmOffset = C1->getZExtValue();
10937 // If the immediate value is too big for the immoffset field, put only bits
10938 // that would normally fit in the immoffset field. The remaining value that
10939 // is copied/added for the voffset field is a large power of 2, and it
10940 // stands more chance of being CSEd with the copy/add for another similar
10941 // load/store.
10942 // However, do not do that rounding down if that is a negative
10943 // number, as it appears to be illegal to have a negative offset in the
10944 // vgpr, even if adding the immediate offset makes it positive.
10945 unsigned Overflow = ImmOffset & ~MaxImm;
10946 ImmOffset -= Overflow;
10947 if ((int32_t)Overflow < 0) {
10948 Overflow += ImmOffset;
10949 ImmOffset = 0;
10950 }
10951 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10952 if (Overflow) {
10953 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10954 if (!N0)
10955 N0 = OverflowVal;
10956 else {
10957 SDValue Ops[] = {N0, OverflowVal};
10958 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10959 }
10960 }
10961 }
10962 if (!N0)
10963 N0 = DAG.getConstant(0, DL, MVT::i32);
10964 if (!C1)
10965 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10966 return {N0, SDValue(C1, 0)};
10967}
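To make the split above concrete, here is a standalone sketch of the same arithmetic (illustrative only; it assumes a MaxImm of 4095, the usual MUBUF immediate limit, instead of querying SIInstrInfo::getMaxMUBUFImmOffset):

#include <cstdint>
#include <utility>

// Returns {voffset add, instruction immoffset} for a combined constant offset,
// mirroring the rounding-down logic in splitBufferOffsets.
static std::pair<uint32_t, uint32_t> splitImmOffset(uint32_t Combined,
                                                    uint32_t MaxImm = 4095) {
  uint32_t ImmOffset = Combined;
  uint32_t Overflow = ImmOffset & ~MaxImm; // large power-of-2 part for voffset
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) { // don't leave a negative value in the vgpr
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}

// Example: splitImmOffset(5000) == {4096, 904}; splitImmOffset(100) == {0, 100}.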
10968
10969// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10970// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10971// pointed to by Offsets.
10972void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10973 SelectionDAG &DAG, SDValue *Offsets,
10974 Align Alignment) const {
10975  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10976  SDLoc DL(CombinedOffset);
10977 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10978 uint32_t Imm = C->getZExtValue();
10979 uint32_t SOffset, ImmOffset;
10980 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10981 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10982 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10983 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10984 return;
10985 }
10986 }
10987 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10988 SDValue N0 = CombinedOffset.getOperand(0);
10989 SDValue N1 = CombinedOffset.getOperand(1);
10990 uint32_t SOffset, ImmOffset;
10991 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10992 if (Offset >= 0 &&
10993 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10994 Offsets[0] = N0;
10995 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10996 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10997 return;
10998 }
10999 }
11000
11001 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11002 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11003 : DAG.getConstant(0, DL, MVT::i32);
11004
11005 Offsets[0] = CombinedOffset;
11006 Offsets[1] = SOffsetZero;
11007 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11008}
11009
11010SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11011 SelectionDAG &DAG) const {
11012 if (!MaybePointer.getValueType().isScalarInteger())
11013 return MaybePointer;
11014
11015 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11016 return Rsrc;
11017}
11018
11019// Wrap a global or flat pointer into a buffer intrinsic using the flags
11020// specified in the intrinsic.
11021SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11022 SelectionDAG &DAG) const {
11023 SDLoc Loc(Op);
11024
11025 SDValue Pointer = Op->getOperand(1);
11026 SDValue Stride = Op->getOperand(2);
11027 SDValue NumRecords = Op->getOperand(3);
11028 SDValue Flags = Op->getOperand(4);
11029
11030 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11031 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11032 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11033 std::optional<uint32_t> ConstStride = std::nullopt;
11034 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11035 ConstStride = ConstNode->getZExtValue();
11036
11037 SDValue NewHighHalf = Masked;
11038 if (!ConstStride || *ConstStride != 0) {
11039 SDValue ShiftedStride;
11040 if (ConstStride) {
11041 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11042 } else {
11043 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11044 ShiftedStride =
11045 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11046 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11047 }
11048 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11049 }
11050
11051 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11052 NewHighHalf, NumRecords, Flags);
11053 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11054 return RsrcPtr;
11055}
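A minimal sketch of the word-1 packing performed above (illustrative only): the high half of the 64-bit pointer is masked to its low 16 bits and the stride is shifted into bits [31:16], while words 0, 2 and 3 carry the pointer's low half, NumRecords and the flags operand.

#include <cstdint>

// Same masking/shifting as the AND/SHL/OR sequence in lowerPointerAsRsrcIntrin.
static uint32_t packRsrcWord1(uint64_t BasePtr, uint16_t Stride) {
  uint32_t HighHalf = uint32_t(BasePtr >> 32);
  uint32_t Masked = HighHalf & 0x0000ffffu;  // keep pointer bits [47:32]
  return Masked | (uint32_t(Stride) << 16);  // stride in bits [31:16]
}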
11056
11057// Handle 8 bit and 16 bit buffer loads
11058SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11059 EVT LoadVT, SDLoc DL,
11060                                                      ArrayRef<SDValue> Ops,
11061                                                      MachineMemOperand *MMO,
11062 bool IsTFE) const {
11063 EVT IntVT = LoadVT.changeTypeToInteger();
11064
11065 if (IsTFE) {
11066 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11067                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11068                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11069    MachineFunction &MF = DAG.getMachineFunction();
11070    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11071 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11072 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11073    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11074                                 DAG.getConstant(1, DL, MVT::i32));
11075 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11076 DAG.getConstant(0, DL, MVT::i32));
11077 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11078 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11079 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11080 }
11081
11082 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11083                       ? AMDGPUISD::BUFFER_LOAD_UBYTE
11084                       : AMDGPUISD::BUFFER_LOAD_USHORT;
11085
11086 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11087 SDValue BufferLoad =
11088 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11089 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11090 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11091
11092 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11093}
11094
11095// Handle 8 bit and 16 bit buffer stores
11096SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11097 EVT VDataType, SDLoc DL,
11098 SDValue Ops[],
11099 MemSDNode *M) const {
11100 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11101 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11102
11103 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11104 Ops[1] = BufferStoreExt;
11105 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11106                                          : AMDGPUISD::BUFFER_STORE_SHORT;
11107    ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11108 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11109 M->getMemOperand());
11110}
11111
11112 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11113                                  SDValue Op, const SDLoc &SL, EVT VT) {
11114 if (VT.bitsLT(Op.getValueType()))
11115 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11116
11117 switch (ExtType) {
11118 case ISD::SEXTLOAD:
11119 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11120 case ISD::ZEXTLOAD:
11121 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11122 case ISD::EXTLOAD:
11123 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11124 case ISD::NON_EXTLOAD:
11125 return Op;
11126 }
11127
11128 llvm_unreachable("invalid ext type");
11129}
11130
11131// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11132// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11133SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11134 DAGCombinerInfo &DCI) const {
11135 SelectionDAG &DAG = DCI.DAG;
11136 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11137 return SDValue();
11138
11139 // FIXME: Constant loads should all be marked invariant.
11140 unsigned AS = Ld->getAddressSpace();
11141 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11142      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11143      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11144 return SDValue();
11145
11146 // Don't do this early, since it may interfere with adjacent load merging for
11147 // illegal types. We can avoid losing alignment information for exotic types
11148 // pre-legalize.
11149 EVT MemVT = Ld->getMemoryVT();
11150 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11151 MemVT.getSizeInBits() >= 32)
11152 return SDValue();
11153
11154 SDLoc SL(Ld);
11155
11156 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11157 "unexpected vector extload");
11158
11159 // TODO: Drop only high part of range.
11160 SDValue Ptr = Ld->getBasePtr();
11161 SDValue NewLoad = DAG.getLoad(
11162 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11163 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11164 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11165 nullptr); // Drop ranges
11166
11167 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11168 if (MemVT.isFloatingPoint()) {
11169    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11170           "unexpected fp extload");
11171 TruncVT = MemVT.changeTypeToInteger();
11172 }
11173
11174 SDValue Cvt = NewLoad;
11175 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11176 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11177 DAG.getValueType(TruncVT));
11178 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11179             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11180    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11181 } else {
11182    assert(Ld->getExtensionType() == ISD::EXTLOAD);
11183  }
11184
11185 EVT VT = Ld->getValueType(0);
11186 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11187
11188 DCI.AddToWorklist(Cvt.getNode());
11189
11190 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11191 // the appropriate extension from the 32-bit load.
11192 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11193 DCI.AddToWorklist(Cvt.getNode());
11194
11195 // Handle conversion back to floating point if necessary.
11196 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11197
11198 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11199}
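As a concrete illustration (not part of the combine), the value computed for a sign-extending 16-bit load that has been widened to a dword load is just the low 16 bits of the loaded dword, sign-extended in register; the zero- and any-extending cases mask instead, and any wider destination type is produced afterwards by getLoadExtOrTrunc.

#include <cstdint>

// Scalar equivalent of the SIGN_EXTEND_INREG applied to the widened i32 load.
static int32_t sextI16FromWidenedLoad(uint32_t LoadedDword) {
  return int32_t(int16_t(LoadedDword & 0xffffu));
}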
11200
11201 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11202                                           const SIMachineFunctionInfo &Info) {
11203 // TODO: Should check if the address can definitely not access stack.
11204 if (Info.isEntryFunction())
11205 return Info.getUserSGPRInfo().hasFlatScratchInit();
11206 return true;
11207}
11208
11209SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11210 SDLoc DL(Op);
11211 LoadSDNode *Load = cast<LoadSDNode>(Op);
11212 ISD::LoadExtType ExtType = Load->getExtensionType();
11213 EVT MemVT = Load->getMemoryVT();
11214 MachineMemOperand *MMO = Load->getMemOperand();
11215
11216 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11217 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11218 return SDValue();
11219
11220 // FIXME: Copied from PPC
11221 // First, load into 32 bits, then truncate to 1 bit.
11222
11223 SDValue Chain = Load->getChain();
11224 SDValue BasePtr = Load->getBasePtr();
11225
11226 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11227
11228 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11229 RealMemVT, MMO);
11230
11231 if (!MemVT.isVector()) {
11232 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11233 NewLD.getValue(1)};
11234
11235 return DAG.getMergeValues(Ops, DL);
11236 }
11237
11238    SmallVector<SDValue, 3> Elts;
11239    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11240 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11241 DAG.getConstant(I, DL, MVT::i32));
11242
11243 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11244 }
11245
11246 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11247
11248 return DAG.getMergeValues(Ops, DL);
11249 }
11250
11251 if (!MemVT.isVector())
11252 return SDValue();
11253
11254 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11255 "Custom lowering for non-i32 vectors hasn't been implemented.");
11256
11257 Align Alignment = Load->getAlign();
11258 unsigned AS = Load->getAddressSpace();
11259 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11260 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11261 return SplitVectorLoad(Op, DAG);
11262 }
11263
11264  MachineFunction &MF = DAG.getMachineFunction();
11265  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11266  // If there is a possibility that flat instructions access scratch memory
11267 // then we need to use the same legalization rules we use for private.
11268 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11269      !Subtarget->hasMultiDwordFlatScratchAddressing())
11270    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11271             ? AMDGPUAS::PRIVATE_ADDRESS
11272             : AMDGPUAS::GLOBAL_ADDRESS;
11273
11274 unsigned NumElements = MemVT.getVectorNumElements();
11275
11276 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11277      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11278      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11279 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11280       isMemOpHasNoClobberedMemOperand(Load))) {
11281    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11282 Alignment >= Align(4) && NumElements < 32) {
11283 if (MemVT.isPow2VectorType() ||
11284 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11285 return SDValue();
11286 return WidenOrSplitVectorLoad(Op, DAG);
11287 }
11288 // Non-uniform loads will be selected to MUBUF instructions, so they
11289 // have the same legalization requirements as global and private
11290 // loads.
11291 //
11292 }
11293 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11294      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11295      AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11296    if (NumElements > 4)
11297 return SplitVectorLoad(Op, DAG);
11298 // v3 loads not supported on SI.
11299 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11300 return WidenOrSplitVectorLoad(Op, DAG);
11301
11302 // v3 and v4 loads are supported for private and global memory.
11303 return SDValue();
11304 }
11305 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11306 // Depending on the setting of the private_element_size field in the
11307 // resource descriptor, we can only make private accesses up to a certain
11308 // size.
11309 switch (Subtarget->getMaxPrivateElementSize()) {
11310 case 4: {
11311 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11312 return DAG.getMergeValues({Op0, Op1}, DL);
11313 }
11314 case 8:
11315 if (NumElements > 2)
11316 return SplitVectorLoad(Op, DAG);
11317 return SDValue();
11318 case 16:
11319 // Same as global/flat
11320 if (NumElements > 4)
11321 return SplitVectorLoad(Op, DAG);
11322 // v3 loads not supported on SI.
11323 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11324 return WidenOrSplitVectorLoad(Op, DAG);
11325
11326 return SDValue();
11327 default:
11328 llvm_unreachable("unsupported private_element_size");
11329 }
11330 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11331 unsigned Fast = 0;
11332 auto Flags = Load->getMemOperand()->getFlags();
11333    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11334                                           Load->getAlign(), Flags, &Fast) &&
11335 Fast > 1)
11336 return SDValue();
11337
11338 if (MemVT.isVector())
11339 return SplitVectorLoad(Op, DAG);
11340 }
11341
11342  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11343                                      MemVT, *Load->getMemOperand())) {
11344 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11345 return DAG.getMergeValues({Op0, Op1}, DL);
11346 }
11347
11348 return SDValue();
11349}
11350
11351SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11352 EVT VT = Op.getValueType();
11353 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11354 VT.getSizeInBits() == 512)
11355 return splitTernaryVectorOp(Op, DAG);
11356
11357 assert(VT.getSizeInBits() == 64);
11358
11359 SDLoc DL(Op);
11360 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11361
11362 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11363 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11364
11365 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11366 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11367
11368 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11369 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11370
11371 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11372
11373 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11374 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11375
11376 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11377
11378 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11379 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11380}
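The 64-bit select above is decomposed into two independent 32-bit selects on the low and high halves; a scalar sketch of the equivalence (illustrative only):

#include <cstdint>

// select cond, a, b on 64 bits == two 32-bit selects recombined, which is what
// LowerSELECT emits through the v2i32 bitcasts and extracts above.
static uint64_t select64(bool Cond, uint64_t A, uint64_t B) {
  uint32_t Lo = Cond ? uint32_t(A) : uint32_t(B);
  uint32_t Hi = Cond ? uint32_t(A >> 32) : uint32_t(B >> 32);
  return (uint64_t(Hi) << 32) | Lo;
}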
11381
11382// Catch division cases where we can use shortcuts with rcp and rsq
11383// instructions.
11384SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11385 SelectionDAG &DAG) const {
11386 SDLoc SL(Op);
11387 SDValue LHS = Op.getOperand(0);
11388 SDValue RHS = Op.getOperand(1);
11389 EVT VT = Op.getValueType();
11390 const SDNodeFlags Flags = Op->getFlags();
11391
11392 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11393
11394 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11395 // Without !fpmath accuracy information, we can't do more because we don't
11396 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11397 // f16 is always accurate enough
11398 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11399 return SDValue();
11400
11401 if (CLHS->isExactlyValue(1.0)) {
11402 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11403        // the CI documentation have a worst-case error of 1 ulp.
11404 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11405 // use it as long as we aren't trying to use denormals.
11406 //
11407 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
11408
11409 // 1.0 / sqrt(x) -> rsq(x)
11410
11411 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11412 // error seems really high at 2^29 ULP.
11413 // 1.0 / x -> rcp(x)
11414 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11415 }
11416
11417 // Same as for 1.0, but expand the sign out of the constant.
11418 if (CLHS->isExactlyValue(-1.0)) {
11419 // -1.0 / x -> rcp (fneg x)
11420 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11421 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11422 }
11423 }
11424
11425 // For f16 and bf16 require afn or arcp.
11426 // For f32 require afn.
11427 if (!AllowInaccurateRcp &&
11428 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11429 return SDValue();
11430
11431 // Turn into multiply by the reciprocal.
11432 // x / y -> x * (1.0 / y)
11433 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11434 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11435}
11436
11437SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11438 SelectionDAG &DAG) const {
11439 SDLoc SL(Op);
11440 SDValue X = Op.getOperand(0);
11441 SDValue Y = Op.getOperand(1);
11442 EVT VT = Op.getValueType();
11443 const SDNodeFlags Flags = Op->getFlags();
11444
11445 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11446 if (!AllowInaccurateDiv)
11447 return SDValue();
11448
11449 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11450 SDValue One = DAG.getConstantFP(1.0, SL, VT);
11451
11452 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11453 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11454
11455 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11456 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11457 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11458 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11459 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11460 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11461}
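The node sequence above is two Newton-Raphson refinements of the reciprocal estimate followed by one residual correction of the quotient. A scalar sketch of the same recurrence (illustrative; rcpApprox stands in for the hardware reciprocal estimate):

#include <cmath>

static double rcpApprox(double Y) { return 1.0 / Y; } // stand-in for RCP

static double fastDiv64(double X, double Y) {
  double NegY = -Y;
  double R = rcpApprox(Y);
  double Tmp0 = std::fma(NegY, R, 1.0); // 1 - y*r
  R = std::fma(Tmp0, R, R);             // first refinement
  double Tmp1 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp1, R, R);             // second refinement
  double Ret = X * R;                   // initial quotient
  double Tmp2 = std::fma(NegY, Ret, X); // residual x - y*q
  return std::fma(Tmp2, R, Ret);        // corrected quotient
}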
11462
11463static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11464 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11465 SDNodeFlags Flags) {
11466 if (GlueChain->getNumValues() <= 1) {
11467 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11468 }
11469
11470 assert(GlueChain->getNumValues() == 3);
11471
11472 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11473 switch (Opcode) {
11474 default:
11475 llvm_unreachable("no chain equivalent for opcode");
11476 case ISD::FMUL:
11477 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11478 break;
11479 }
11480
11481 return DAG.getNode(Opcode, SL, VTList,
11482 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11483 Flags);
11484}
11485
11486static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11487 EVT VT, SDValue A, SDValue B, SDValue C,
11488 SDValue GlueChain, SDNodeFlags Flags) {
11489 if (GlueChain->getNumValues() <= 1) {
11490 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11491 }
11492
11493 assert(GlueChain->getNumValues() == 3);
11494
11495 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11496 switch (Opcode) {
11497 default:
11498 llvm_unreachable("no chain equivalent for opcode");
11499 case ISD::FMA:
11500 Opcode = AMDGPUISD::FMA_W_CHAIN;
11501 break;
11502 }
11503
11504 return DAG.getNode(Opcode, SL, VTList,
11505 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11506 Flags);
11507}
11508
11509SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11510 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11511 return FastLowered;
11512
11513 SDLoc SL(Op);
11514 EVT VT = Op.getValueType();
11515 SDValue LHS = Op.getOperand(0);
11516 SDValue RHS = Op.getOperand(1);
11517
11518 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11519 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11520
11521 if (VT == MVT::bf16) {
11522 SDValue ExtDiv =
11523 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
11524 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
11525 DAG.getTargetConstant(0, SL, MVT::i32));
11526 }
11527
11528 assert(VT == MVT::f16);
11529
11530 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11531 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11532 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11533 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11534 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11535 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11536 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11537 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11538 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11539 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11540 // q16.u = opx(V_CVT_F16_F32, q32.u);
11541 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11542
11543 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11544 unsigned FMADOpCode =
11545      isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
11546  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11547 SDValue Rcp =
11548 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11549 SDValue Quot =
11550 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11551 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11552 Op->getFlags());
11553 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11554 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11555 Op->getFlags());
11556 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11557 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11558 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11559 DAG.getConstant(0xff800000, SL, MVT::i32));
11560 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11561 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11562 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11563 DAG.getTargetConstant(0, SL, MVT::i32));
11564 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11565 Op->getFlags());
11566}
11567
11568// Faster 2.5 ULP division that does not support denormals.
11569SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11570 SDNodeFlags Flags = Op->getFlags();
11571 SDLoc SL(Op);
11572 SDValue LHS = Op.getOperand(1);
11573 SDValue RHS = Op.getOperand(2);
11574
11575 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11576
11577 const APFloat K0Val(0x1p+96f);
11578 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11579
11580 const APFloat K1Val(0x1p-32f);
11581 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11582
11583 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11584
11585 EVT SetCCVT =
11586 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11587
11588 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11589
11590 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11591
11592 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11593
11594 // rcp does not support denormals.
11595 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11596
11597 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11598
11599 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11600}
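In scalar terms, the sequence above pre-scales very large denominators so the reciprocal does not underflow: when |RHS| exceeds 2^96 the denominator is multiplied by 2^-32 before the rcp and the same factor is re-applied to the product. A sketch under that reading (illustrative; rcpApproxF32 stands in for the hardware rcp):

#include <cmath>

static float rcpApproxF32(float Y) { return 1.0f / Y; } // stand-in for RCP

static float fastDiv32(float LHS, float RHS) {
  const float K0 = 0x1p+96f; // threshold
  const float K1 = 0x1p-32f; // scale factor
  float Scale = (std::fabs(RHS) > K0) ? K1 : 1.0f;
  float Rcp = rcpApproxF32(RHS * Scale);
  return Scale * (LHS * Rcp); // (x * 1/(y*s)) * s == x / y
}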
11601
11602// Returns immediate value for setting the F32 denorm mode when using the
11603// S_DENORM_MODE instruction.
11604 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11605                                     const SIMachineFunctionInfo *Info,
11606 const GCNSubtarget *ST) {
11607 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11608 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11609 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11610 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11611}
11612
11613SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11614 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11615 return FastLowered;
11616
11617 // The selection matcher assumes anything with a chain selects to a
11618 // mayRaiseFPException machine instruction. Since we're introducing a chain
11619 // here, we need to explicitly report nofpexcept for the regular fdiv
11620 // lowering.
11621 SDNodeFlags Flags = Op->getFlags();
11622 Flags.setNoFPExcept(true);
11623
11624 SDLoc SL(Op);
11625 SDValue LHS = Op.getOperand(0);
11626 SDValue RHS = Op.getOperand(1);
11627
11628 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11629
11630 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11631
11632 SDValue DenominatorScaled =
11633 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11634 SDValue NumeratorScaled =
11635 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11636
11637 // Denominator is scaled to not be denormal, so using rcp is ok.
11638 SDValue ApproxRcp =
11639 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11640 SDValue NegDivScale0 =
11641 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11642
11643 using namespace AMDGPU::Hwreg;
11644 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11645 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11646
11647 const MachineFunction &MF = DAG.getMachineFunction();
11648 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11649 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11650
11651 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11652 const bool HasDynamicDenormals =
11653 (DenormMode.Input == DenormalMode::Dynamic) ||
11654 (DenormMode.Output == DenormalMode::Dynamic);
11655
11656 SDValue SavedDenormMode;
11657
11658 if (!PreservesDenormals) {
11659 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11660 // lowering. The chain dependence is insufficient, and we need glue. We do
11661 // not need the glue variants in a strictfp function.
11662
11663 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11664
11665 SDValue Glue = DAG.getEntryNode();
11666 if (HasDynamicDenormals) {
11667 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11668 DAG.getVTList(MVT::i32, MVT::Glue),
11669 {BitField, Glue});
11670 SavedDenormMode = SDValue(GetReg, 0);
11671
11672 Glue = DAG.getMergeValues(
11673 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11674 }
11675
11676 SDNode *EnableDenorm;
11677 if (Subtarget->hasDenormModeInst()) {
11678 const SDValue EnableDenormValue =
11679 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11680
11681 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11682 EnableDenormValue)
11683 .getNode();
11684 } else {
11685 const SDValue EnableDenormValue =
11686 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11687 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11688 {EnableDenormValue, BitField, Glue});
11689 }
11690
11691 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11692 SDValue(EnableDenorm, 1)};
11693
11694 NegDivScale0 = DAG.getMergeValues(Ops, SL);
11695 }
11696
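// Iterative refinement feeding V_DIV_FMAS: Fma0 = 1 - d*r is the reciprocal
// error, Fma1 the refined reciprocal, Mul = n*Fma1 the initial quotient,
// Fma2 its remainder, Fma3 the refined quotient and Fma4 the final remainder.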
11697 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11698 ApproxRcp, One, NegDivScale0, Flags);
11699
11700 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11701 ApproxRcp, Fma0, Flags);
11702
11703 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11704 Fma1, Flags);
11705
11706 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11707 NumeratorScaled, Mul, Flags);
11708
11709 SDValue Fma3 =
11710 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11711
11712 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11713 NumeratorScaled, Fma3, Flags);
11714
11715 if (!PreservesDenormals) {
11716 SDNode *DisableDenorm;
11717 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11718 const SDValue DisableDenormValue = getSPDenormModeValue(
11719 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11720
11721 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11722 DisableDenorm =
11723 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
11724 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
11725 .getNode();
11726 } else {
11727 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11728 const SDValue DisableDenormValue =
11729 HasDynamicDenormals
11730 ? SavedDenormMode
11731 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11732
11733 DisableDenorm = DAG.getMachineNode(
11734 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11735 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11736 }
11737
11738 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11739 SDValue(DisableDenorm, 0), DAG.getRoot());
11740 DAG.setRoot(OutputChain);
11741 }
11742
11743 SDValue Scale = NumeratorScaled.getValue(1);
11744 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11745 {Fma4, Fma1, Fma3, Scale}, Flags);
11746
11747 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11748}
11749
11750SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11751 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11752 return FastLowered;
11753
11754 SDLoc SL(Op);
11755 SDValue X = Op.getOperand(0);
11756 SDValue Y = Op.getOperand(1);
11757
11758 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11759
11760 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11761
11762 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11763
11764 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11765
11766 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11767
11768 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11769
11770 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11771
11772 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11773
11774 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11775
11776 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11777 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11778
11779 SDValue Fma4 =
11780 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11781
11782 SDValue Scale;
11783
11784 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11785 // Workaround a hardware bug on SI where the condition output from div_scale
11786 // is not usable.
11787
11788 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11789
11790 // Figure out the scale to use for div_fmas.
11791 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11792 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11793 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11794 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11795
11796 SDValue NumHi =
11797 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11798 SDValue DenHi =
11799 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11800
11801 SDValue Scale0Hi =
11802 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11803 SDValue Scale1Hi =
11804 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11805
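// Compare the high dwords of the scaled values against the original operands
// to see whether scaling changed them, and XOR the two checks to recover the
// condition that div_scale should have produced.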
11806 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11807 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11808 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11809 } else {
11810 Scale = DivScale1.getValue(1);
11811 }
11812
11813 SDValue Fmas =
11814 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11815
11816 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11817}
11818
11819SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11820 EVT VT = Op.getValueType();
11821
11822 if (VT == MVT::f32)
11823 return LowerFDIV32(Op, DAG);
11824
11825 if (VT == MVT::f64)
11826 return LowerFDIV64(Op, DAG);
11827
11828 if (VT == MVT::f16 || VT == MVT::bf16)
11829 return LowerFDIV16(Op, DAG);
11830
11831 llvm_unreachable("Unexpected type for fdiv");
11832}
11833
11834SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11835 SDLoc dl(Op);
11836 SDValue Val = Op.getOperand(0);
11837 EVT VT = Val.getValueType();
11838 EVT ResultExpVT = Op->getValueType(1);
11839 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11840
11841 SDValue Mant = DAG.getNode(
11842 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11843 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11844
11845 SDValue Exp = DAG.getNode(
11846 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11847 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11848
11849 if (Subtarget->hasFractBug()) {
11850 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11851 SDValue Inf =
11852 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11853
11854 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11855 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11856 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11857 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11858 }
11859
11860 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11861 return DAG.getMergeValues({Mant, CastExp}, dl);
11862}
11863
11864SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11865 SDLoc DL(Op);
11866 StoreSDNode *Store = cast<StoreSDNode>(Op);
11867 EVT VT = Store->getMemoryVT();
11868
11869 if (VT == MVT::i1) {
11870 return DAG.getTruncStore(
11871 Store->getChain(), DL,
11872 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11873 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11874 }
11875
11876 assert(VT.isVector() &&
11877 Store->getValue().getValueType().getScalarType() == MVT::i32);
11878
11879 unsigned AS = Store->getAddressSpace();
11880 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11881 Store->getAlign().value() < VT.getStoreSize() &&
11882 VT.getSizeInBits() > 32) {
11883 return SplitVectorStore(Op, DAG);
11884 }
11885
11886 MachineFunction &MF = DAG.getMachineFunction();
11887 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11888 // If there is a possibility that flat instruction access scratch memory
11889 // then we need to use the same legalization rules we use for private.
11890 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11891 !Subtarget->hasMultiDwordFlatScratchAddressing())
11892 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11893 ? AMDGPUAS::PRIVATE_ADDRESS
11894 : AMDGPUAS::GLOBAL_ADDRESS;
11895
11896 unsigned NumElements = VT.getVectorNumElements();
11897 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11898 if (NumElements > 4)
11899 return SplitVectorStore(Op, DAG);
11900 // v3 stores not supported on SI.
11901 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11902 return SplitVectorStore(Op, DAG);
11903
11904 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11905 VT, *Store->getMemOperand()))
11906 return expandUnalignedStore(Store, DAG);
11907
11908 return SDValue();
11909 }
11910 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11911 switch (Subtarget->getMaxPrivateElementSize()) {
11912 case 4:
11913 return scalarizeVectorStore(Store, DAG);
11914 case 8:
11915 if (NumElements > 2)
11916 return SplitVectorStore(Op, DAG);
11917 return SDValue();
11918 case 16:
11919 if (NumElements > 4 ||
11920 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11921 return SplitVectorStore(Op, DAG);
11922 return SDValue();
11923 default:
11924 llvm_unreachable("unsupported private_element_size");
11925 }
11926 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11927 unsigned Fast = 0;
11928 auto Flags = Store->getMemOperand()->getFlags();
11929 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11930 Store->getAlign(), Flags, &Fast) &&
11931 Fast > 1)
11932 return SDValue();
11933
11934 if (VT.isVector())
11935 return SplitVectorStore(Op, DAG);
11936
11937 return expandUnalignedStore(Store, DAG);
11938 }
11939
11940 // Probably an invalid store. If so we'll end up emitting a selection error.
11941 return SDValue();
11942}
11943
11944// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11945SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11946 SDLoc SL(Op);
11947 assert(!Subtarget->has16BitInsts());
11948 SDNodeFlags Flags = Op->getFlags();
11949 SDValue Ext =
11950 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11951
11952 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11953 SDValue Sqrt =
11954 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11955
11956 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11957 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11958}
11959
11960SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11961 SDLoc DL(Op);
11962 SDNodeFlags Flags = Op->getFlags();
11963 MVT VT = Op.getValueType().getSimpleVT();
11964 const SDValue X = Op.getOperand(0);
11965
11966 if (allowApproxFunc(DAG, Flags)) {
11967 // Instruction is 1ulp but ignores denormals.
11968 return DAG.getNode(
11969 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11970 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11971 }
11972
11973 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11974 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11975
11976 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11977
11978 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11979
11980 SDValue SqrtX =
11981 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11982
11983 SDValue SqrtS;
11984 if (needsDenormHandlingF32(DAG, X, Flags)) {
11985 SDValue SqrtID =
11986 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11987 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11988
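// Tighten the 1ulp hardware result: form the residuals x - s'*s for the
// neighbours s' one ulp below and one ulp above s, and step the result down
// or up when a residual's sign shows that neighbour is the better answer.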
11989 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11990 SDValue SqrtSNextDownInt =
11991 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11992 DAG.getAllOnesConstant(DL, MVT::i32));
11993 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11994
11995 SDValue NegSqrtSNextDown =
11996 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11997
11998 SDValue SqrtVP =
11999 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12000
12001 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12002 DAG.getConstant(1, DL, MVT::i32));
12003 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12004
12005 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12006 SDValue SqrtVS =
12007 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12008
12009 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12010 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12011
12012 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12013 Flags);
12014
12015 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12016 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12017 Flags);
12018 } else {
12019 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12020
12021 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12022
12023 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12024 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12025 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12026
12027 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12028 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12029 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12030
12031 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12032 SDValue SqrtD =
12033 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12034 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12035 }
12036
12037 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12038
12039 SDValue ScaledDown =
12040 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12041
12042 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12043 SDValue IsZeroOrInf =
12044 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12045 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12046
12047 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12048}
12049
12050SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12051 // For double type, the SQRT and RSQ instructions don't have the required
12052 // precision, so we apply Goldschmidt's algorithm to improve the result:
12053 //
12054 // y0 = rsq(x)
12055 // g0 = x * y0
12056 // h0 = 0.5 * y0
12057 //
12058 // r0 = 0.5 - h0 * g0
12059 // g1 = g0 * r0 + g0
12060 // h1 = h0 * r0 + h0
12061 //
12062 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12063 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12064 // h2 = h1 * r1 + h1
12065 //
12066 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12067 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12068 //
12069 // sqrt(x) = g3
12070
12071 SDNodeFlags Flags = Op->getFlags();
12072
12073 SDLoc DL(Op);
12074
12075 SDValue X = Op.getOperand(0);
12076 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12077
12078 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12079
12080 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12081
12082 // Scale up input if it is too small.
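// The input is scaled by 2^+256 via ldexp; since sqrt halves the exponent,
// the result is scaled back by 2^-128 at the end (see ScaleDownFactor).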
12083 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12084 SDValue ScaleUp =
12085 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12086 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12087
12088 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12089
12090 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12091
12092 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12093 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12094
12095 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12096 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12097
12098 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12099
12100 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12101
12102 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12103 SDValue SqrtD0 =
12104 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12105
12106 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12107
12108 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12109 SDValue SqrtD1 =
12110 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12111
12112 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12113
12114 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12115 SDValue ScaleDown =
12116 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12117 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12118
12119 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12120 // with finite only or nsz because rsq(+/-0) = +/-inf
12121
12122 // TODO: Check for DAZ and expand to subnormals
12123 SDValue IsZeroOrInf =
12124 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12125 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12126
12127 // If x is +INF, +0, or -0, use its original value
12128 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12129 Flags);
12130}
12131
12132SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12133 SDLoc DL(Op);
12134 EVT VT = Op.getValueType();
12135 SDValue Arg = Op.getOperand(0);
12136 SDValue TrigVal;
12137
12138 // Propagate fast-math flags so that the multiply we introduce can be folded
12139 // if Arg is already the result of a multiply by constant.
12140 auto Flags = Op->getFlags();
12141
12142 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12143
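// The hardware SIN/COS consume an argument already multiplied by 1/(2*pi);
// on subtargets with a reduced trig range the fractional part is also taken
// so that the operand lands in [0, 1).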
12144 if (Subtarget->hasTrigReducedRange()) {
12145 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12146 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12147 } else {
12148 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12149 }
12150
12151 switch (Op.getOpcode()) {
12152 case ISD::FCOS:
12153 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12154 case ISD::FSIN:
12155 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12156 default:
12157 llvm_unreachable("Wrong trig opcode");
12158 }
12159}
12160
12161SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12162 SelectionDAG &DAG) const {
12163 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12164 assert(AtomicNode->isCompareAndSwap());
12165 unsigned AS = AtomicNode->getAddressSpace();
12166
12167 // No custom lowering required for local address space
12168 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12169 return Op;
12170
12171 // Non-local address space requires custom lowering for atomic compare
12172 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12173 SDLoc DL(Op);
12174 SDValue ChainIn = Op.getOperand(0);
12175 SDValue Addr = Op.getOperand(1);
12176 SDValue Old = Op.getOperand(2);
12177 SDValue New = Op.getOperand(3);
12178 EVT VT = Op.getValueType();
12179 MVT SimpleVT = VT.getSimpleVT();
12180 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12181
12182 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12183 SDValue Ops[] = {ChainIn, Addr, NewOld};
12184
12185 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12186 Op->getVTList(), Ops, VT,
12187 AtomicNode->getMemOperand());
12188}
12189
12190//===----------------------------------------------------------------------===//
12191// Custom DAG optimizations
12192//===----------------------------------------------------------------------===//
12193
12194SDValue
12195SITargetLowering::performUCharToFloatCombine(SDNode *N,
12196 DAGCombinerInfo &DCI) const {
12197 EVT VT = N->getValueType(0);
12198 EVT ScalarVT = VT.getScalarType();
12199 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12200 return SDValue();
12201
12202 SelectionDAG &DAG = DCI.DAG;
12203 SDLoc DL(N);
12204
12205 SDValue Src = N->getOperand(0);
12206 EVT SrcVT = Src.getValueType();
12207
12208 // TODO: We could try to match extracting the higher bytes, which would be
12209 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12210 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12211 // about in practice.
12212 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12213 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12214 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12215 DCI.AddToWorklist(Cvt.getNode());
12216
12217 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12218 if (ScalarVT != MVT::f32) {
12219 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12220 DAG.getTargetConstant(0, DL, MVT::i32));
12221 }
12222 return Cvt;
12223 }
12224 }
12225
12226 return SDValue();
12227}
12228
12229SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12230 DAGCombinerInfo &DCI) const {
12231 SDValue MagnitudeOp = N->getOperand(0);
12232 SDValue SignOp = N->getOperand(1);
12233
12234 // The generic combine for fcopysign + fp cast is too conservative with
12235 // vectors, and also gets confused by the splitting we will perform here, so
12236 // peek through FP casts.
12237 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12238 SignOp.getOpcode() == ISD::FP_ROUND)
12239 SignOp = SignOp.getOperand(0);
12240
12241 SelectionDAG &DAG = DCI.DAG;
12242 SDLoc DL(N);
12243 EVT SignVT = SignOp.getValueType();
12244
12245 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12246 // lower half with a copy.
12247 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12248 EVT MagVT = MagnitudeOp.getValueType();
12249
12250 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12251
12252 if (MagVT.getScalarType() == MVT::f64) {
12253 EVT F32VT = MagVT.isVector()
12254 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12255 : MVT::v2f32;
12256
12257 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12258
12259 SmallVector<SDValue, 8> NewElts;
12260 for (unsigned I = 0; I != NumElts; ++I) {
12261 SDValue MagLo =
12262 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12263 DAG.getConstant(2 * I, DL, MVT::i32));
12264 SDValue MagHi =
12265 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12266 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12267
12268 SDValue SignOpElt =
12269 MagVT.isVector()
12270 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12271 SignOp, DAG.getConstant(I, DL, MVT::i32))
12272 : SignOp;
12273
12274 SDValue HiOp =
12275 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12276
12277 SDValue Vector =
12278 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12279
12280 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12281 NewElts.push_back(NewElt);
12282 }
12283
12284 if (NewElts.size() == 1)
12285 return NewElts[0];
12286
12287 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12288 }
12289
12290 if (SignVT.getScalarType() != MVT::f64)
12291 return SDValue();
12292
12293 // Reduce width of sign operand, we only need the highest bit.
12294 //
12295 // fcopysign f64:x, f64:y ->
12296 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12297 // TODO: In some cases it might make sense to go all the way to f16.
12298
12299 EVT F32VT = MagVT.isVector()
12300 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12301 : MVT::v2f32;
12302
12303 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12304
12305 SmallVector<SDValue, 8> F32Signs;
12306 for (unsigned I = 0; I != NumElts; ++I) {
12307 // Take sign from odd elements of cast vector
12308 SDValue SignAsF32 =
12309 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12310 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12311 F32Signs.push_back(SignAsF32);
12312 }
12313
12314 SDValue NewSign =
12315 NumElts == 1
12316 ? F32Signs.back()
12317 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12318 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12319 F32Signs);
12320
12321 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12322 NewSign);
12323}
12324
12325// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12326// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12327// bits
12328
12329// This is a variant of
12330// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12331//
12332// The normal DAG combiner will do this, but only if the add has one use since
12333// that would increase the number of instructions.
12334//
12335// This prevents us from seeing a constant offset that can be folded into a
12336// memory instruction's addressing mode. If we know the resulting add offset of
12337// a pointer can be folded into an addressing offset, we can replace the pointer
12338// operand with the add of new constant offset. This eliminates one of the uses,
12339// and may allow the remaining use to also be simplified.
12340//
12341SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12342 EVT MemVT,
12343 DAGCombinerInfo &DCI) const {
12344 SDValue N0 = N->getOperand(0);
12345 SDValue N1 = N->getOperand(1);
12346
12347 // We only do this to handle cases where it's profitable when there are
12348 // multiple uses of the add, so defer to the standard combine.
12349 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12350 N0->hasOneUse())
12351 return SDValue();
12352
12353 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12354 if (!CN1)
12355 return SDValue();
12356
12357 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12358 if (!CAdd)
12359 return SDValue();
12360
12361 SelectionDAG &DAG = DCI.DAG;
12362
12363 if (N0->getOpcode() == ISD::OR &&
12364 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12365 return SDValue();
12366
12367 // If the resulting offset is too large, we can't fold it into the
12368 // addressing mode offset.
12369 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12370 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12371
12372 AddrMode AM;
12373 AM.HasBaseReg = true;
12374 AM.BaseOffs = Offset.getSExtValue();
12375 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12376 return SDValue();
12377
12378 SDLoc SL(N);
12379 EVT VT = N->getValueType(0);
12380
12381 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12382 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12383
12383
12384 SDNodeFlags Flags;
12385 Flags.setNoUnsignedWrap(
12386 N->getFlags().hasNoUnsignedWrap() &&
12387 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12388
12389 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12390}
12391
12392 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12393 /// index is offset by the chain and intrinsic ID. Theoretically we would also need to check the
12394/// specific intrinsic, but they all place the pointer operand first.
12395static unsigned getBasePtrIndex(const MemSDNode *N) {
12396 switch (N->getOpcode()) {
12397 case ISD::STORE:
12398 case ISD::INTRINSIC_W_CHAIN:
12399 case ISD::INTRINSIC_VOID:
12400 return 2;
12401 default:
12402 return 1;
12403 }
12404}
12405
12406SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12407 DAGCombinerInfo &DCI) const {
12408 SelectionDAG &DAG = DCI.DAG;
12409
12410 unsigned PtrIdx = getBasePtrIndex(N);
12411 SDValue Ptr = N->getOperand(PtrIdx);
12412
12413 // TODO: We could also do this for multiplies.
12414 if (Ptr.getOpcode() == ISD::SHL) {
12415 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12416 N->getMemoryVT(), DCI);
12417 if (NewPtr) {
12418 SmallVector<SDValue, 8> NewOps(N->ops());
12419
12420 NewOps[PtrIdx] = NewPtr;
12421 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12422 }
12423 }
12424
12425 return SDValue();
12426}
12427
12428static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12429 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12430 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12431 (Opc == ISD::XOR && Val == 0);
12432}
12433
12434// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
12435// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12436// integer combine opportunities since most 64-bit operations are decomposed
12437// this way. TODO: We won't want this for SALU especially if it is an inline
12438// immediate.
12439SDValue SITargetLowering::splitBinaryBitConstantOp(
12440 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12441 const ConstantSDNode *CRHS) const {
12442 uint64_t Val = CRHS->getZExtValue();
12443 uint32_t ValLo = Lo_32(Val);
12444 uint32_t ValHi = Hi_32(Val);
12445 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12446
12447 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12448 bitOpWithConstantIsReducible(Opc, ValHi)) ||
12449 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12450 // We have 64-bit scalar and/or/xor, but do not have vector forms.
12451 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12452 !CRHS->user_begin()->isDivergent())
12453 return SDValue();
12454
12455 // If we need to materialize a 64-bit immediate, it will be split up later
12456 // anyway. Avoid creating the harder to understand 64-bit immediate
12457 // materialization.
12458 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12459 }
12460
12461 return SDValue();
12462}
12463
12464 bool llvm::isBoolSGPR(SDValue V) {
12465 if (V.getValueType() != MVT::i1)
12466 return false;
12467 switch (V.getOpcode()) {
12468 default:
12469 break;
12470 case ISD::SETCC:
12471 case ISD::IS_FPCLASS:
12472 case AMDGPUISD::FP_CLASS:
12473 return true;
12474 case ISD::AND:
12475 case ISD::OR:
12476 case ISD::XOR:
12477 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12478 case ISD::SADDO:
12479 case ISD::UADDO:
12480 case ISD::SSUBO:
12481 case ISD::USUBO:
12482 case ISD::SMULO:
12483 case ISD::UMULO:
12484 return V.getResNo() == 1;
12485 case ISD::INTRINSIC_WO_CHAIN: {
12486 unsigned IntrinsicID = V.getConstantOperandVal(0);
12487 switch (IntrinsicID) {
12488 case Intrinsic::amdgcn_is_shared:
12489 case Intrinsic::amdgcn_is_private:
12490 return true;
12491 default:
12492 return false;
12493 }
12494
12495 return false;
12496 }
12497 }
12498 return false;
12499}
12500
12501// If a constant has all zeroes or all ones within each byte return it.
12502// Otherwise return 0.
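// For example, 0x00ff00ff is returned unchanged (bytes 0 and 2 are all ones,
// bytes 1 and 3 are all zeroes), while 0x00f0ff00 returns 0 because byte 2 is
// only partially set.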
12503 static uint32_t getConstantPermuteMask(uint32_t C) {
12504 // 0xff for any zero byte in the mask
12505 uint32_t ZeroByteMask = 0;
12506 if (!(C & 0x000000ff))
12507 ZeroByteMask |= 0x000000ff;
12508 if (!(C & 0x0000ff00))
12509 ZeroByteMask |= 0x0000ff00;
12510 if (!(C & 0x00ff0000))
12511 ZeroByteMask |= 0x00ff0000;
12512 if (!(C & 0xff000000))
12513 ZeroByteMask |= 0xff000000;
12514 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12515 if ((NonZeroByteMask & C) != NonZeroByteMask)
12516 return 0; // Partial bytes selected.
12517 return C;
12518}
12519
12520// Check if a node selects whole bytes from its operand 0 starting at a byte
12521 // boundary while masking the rest. Returns the select mask as used by
12522 // v_perm_b32, or ~0 if it did not succeed.
12523// Note byte select encoding:
12524// value 0-3 selects corresponding source byte;
12525// value 0xc selects zero;
12526// value 0xff selects 0xff.
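// For example, (and x, 0x00ff00ff) yields the mask 0x0c020c00 (keep bytes 2
// and 0, zero the others) and (shl x, 8) yields 0x0201000c (bytes shifted up
// by one, low byte zero).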
12527 static uint32_t getPermuteMask(SDValue V) {
12528 assert(V.getValueSizeInBits() == 32);
12529
12530 if (V.getNumOperands() != 2)
12531 return ~0;
12532
12533 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12534 if (!N1)
12535 return ~0;
12536
12537 uint32_t C = N1->getZExtValue();
12538
12539 switch (V.getOpcode()) {
12540 default:
12541 break;
12542 case ISD::AND:
12543 if (uint32_t ConstMask = getConstantPermuteMask(C))
12544 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12545 break;
12546
12547 case ISD::OR:
12548 if (uint32_t ConstMask = getConstantPermuteMask(C))
12549 return (0x03020100 & ~ConstMask) | ConstMask;
12550 break;
12551
12552 case ISD::SHL:
12553 if (C % 8)
12554 return ~0;
12555
12556 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12557
12558 case ISD::SRL:
12559 if (C % 8)
12560 return ~0;
12561
12562 return uint32_t(0x0c0c0c0c03020100ull >> C);
12563 }
12564
12565 return ~0;
12566}
12567
12568SDValue SITargetLowering::performAndCombine(SDNode *N,
12569 DAGCombinerInfo &DCI) const {
12570 if (DCI.isBeforeLegalize())
12571 return SDValue();
12572
12573 SelectionDAG &DAG = DCI.DAG;
12574 EVT VT = N->getValueType(0);
12575 SDValue LHS = N->getOperand(0);
12576 SDValue RHS = N->getOperand(1);
12577
12578 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12579 if (VT == MVT::i64 && CRHS) {
12580 if (SDValue Split =
12581 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12582 return Split;
12583 }
12584
12585 if (CRHS && VT == MVT::i32) {
12586 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12587 // nb = number of trailing zeroes in mask
12588 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12589 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
12590 uint64_t Mask = CRHS->getZExtValue();
12591 unsigned Bits = llvm::popcount(Mask);
12592 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12593 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12594 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12595 unsigned Shift = CShift->getZExtValue();
12596 unsigned NB = CRHS->getAPIntValue().countr_zero();
12597 unsigned Offset = NB + Shift;
12598 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12599 SDLoc SL(N);
12600 SDValue BFE =
12601 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12602 DAG.getConstant(Offset, SL, MVT::i32),
12603 DAG.getConstant(Bits, SL, MVT::i32));
12604 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12605 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12606 DAG.getValueType(NarrowVT));
12607 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12608 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12609 return Shl;
12610 }
12611 }
12612 }
12613
12614 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12615 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12616 isa<ConstantSDNode>(LHS.getOperand(2))) {
12617 uint32_t Sel = getConstantPermuteMask(Mask);
12618 if (!Sel)
12619 return SDValue();
12620
12621 // Select 0xc for all zero bytes
12622 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12623 SDLoc DL(N);
12624 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12625 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12626 }
12627 }
12628
12629 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12630 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12631 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12632 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12633 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12634
12635 SDValue X = LHS.getOperand(0);
12636 SDValue Y = RHS.getOperand(0);
12637 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12638 !isTypeLegal(X.getValueType()))
12639 return SDValue();
12640
12641 if (LCC == ISD::SETO) {
12642 if (X != LHS.getOperand(1))
12643 return SDValue();
12644
12645 if (RCC == ISD::SETUNE) {
12646 const ConstantFPSDNode *C1 =
12647 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12648 if (!C1 || !C1->isInfinity() || C1->isNegative())
12649 return SDValue();
12650
12651 const uint32_t Mask = SIInstrFlags::N_NORMAL |
12652 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
12653 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
12654 SIInstrFlags::P_NORMAL;
12655
12656 static_assert(
12657 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12658 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12659 0x3ff) == Mask,
12660 "mask not equal");
12661
12662 SDLoc DL(N);
12663 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
12664 DAG.getConstant(Mask, DL, MVT::i32));
12665 }
12666 }
12667 }
12668
12669 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12670 std::swap(LHS, RHS);
12671
12672 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12673 RHS.hasOneUse()) {
12674 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12675 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12676 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12677 // | n_nan)
12678 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12679 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12680 (RHS.getOperand(0) == LHS.getOperand(0) &&
12681 LHS.getOperand(0) == LHS.getOperand(1))) {
12682 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12683 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12684 : Mask->getZExtValue() & OrdMask;
12685
12686 SDLoc DL(N);
12687 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
12688 DAG.getConstant(NewMask, DL, MVT::i32));
12689 }
12690 }
12691
12692 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12693 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12694 // and x, (sext cc from i1) => select cc, x, 0
12695 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12696 std::swap(LHS, RHS);
12697 if (isBoolSGPR(RHS.getOperand(0)))
12698 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
12699 DAG.getConstant(0, SDLoc(N), MVT::i32));
12700 }
12701
12702 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12703 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12704 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12705 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12706 uint32_t LHSMask = getPermuteMask(LHS);
12707 uint32_t RHSMask = getPermuteMask(RHS);
12708 if (LHSMask != ~0u && RHSMask != ~0u) {
12709 // Canonicalize the expression in an attempt to have fewer unique masks
12710 // and therefore fewer registers used to hold the masks.
12711 if (LHSMask > RHSMask) {
12712 std::swap(LHSMask, RHSMask);
12713 std::swap(LHS, RHS);
12714 }
12715
12716 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12717 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12718 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12719 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12720
12721 // Check if we need to combine values from two sources within a byte.
12722 if (!(LHSUsedLanes & RHSUsedLanes) &&
12723 // If we select high and lower word keep it for SDWA.
12724 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12725 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12726 // Each byte in each mask is either selector mask 0-3, or has higher
12727 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
12728 // zero. If 0x0c appears in either mask, that byte must stay 0x0c. Otherwise
12729 // the mask byte that is not 0xff wins. ANDing both masks gives a correct
12730 // result, except that bytes selecting zero must be corrected back to exactly 0x0c.
12731 uint32_t Mask = LHSMask & RHSMask;
12732 for (unsigned I = 0; I < 32; I += 8) {
12733 uint32_t ByteSel = 0xff << I;
12734 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12735 Mask &= (0x0c << I) & 0xffffffff;
12736 }
12737
12738 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12739 // or 0x0c.
12740 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12741 SDLoc DL(N);
12742
12743 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12744 RHS.getOperand(0),
12745 DAG.getConstant(Sel, DL, MVT::i32));
12746 }
12747 }
12748 }
12749
12750 return SDValue();
12751}
12752
12753// A key component of v_perm is a mapping between byte position of the src
12754// operands, and the byte position of the dest. To provide such, we need: 1. the
12755// node that provides x byte of the dest of the OR, and 2. the byte of the node
12756// used to provide that x byte. calculateByteProvider finds which node provides
12757// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
12758// and finds an ultimate src and byte position For example: The supported
12759// LoadCombine pattern for vector loads is as follows
12760// t1
12761// or
12762// / \
12763// t2 t3
12764// zext shl
12765// | | \
12766// t4 t5 16
12767// or anyext
12768// / \ |
12769// t6 t7 t8
12770// srl shl or
12771// / | / \ / \
12772// t9 t10 t11 t12 t13 t14
12773// trunc* 8 trunc* 8 and and
12774// | | / | | \
12775// t15 t16 t17 t18 t19 t20
12776// trunc* 255 srl -256
12777// | / \
12778// t15 t15 16
12779//
12780// *In this example, the truncs are from i32->i16
12781//
12782// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12783// respectively. calculateSrcByte would find (given node) -> ultimate src &
12784// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12785// After finding the mapping, we can combine the tree into vperm t15, t16,
12786// 0x05000407
12787
12788// Find the source and byte position from a node.
12789// \p DestByte is the byte position of the dest of the or that the src
12790// ultimately provides. \p SrcIndex is the byte of the src that maps to this
12791// dest of the or byte. \p Depth tracks how many recursive iterations we have
12792// performed.
12793static const std::optional<ByteProvider<SDValue>>
12794calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12795 unsigned Depth = 0) {
12796 // We may need to recursively traverse a series of SRLs
12797 if (Depth >= 6)
12798 return std::nullopt;
12799
12800 if (Op.getValueSizeInBits() < 8)
12801 return std::nullopt;
12802
12803 if (Op.getValueType().isVector())
12804 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12805
12806 switch (Op->getOpcode()) {
12807 case ISD::TRUNCATE: {
12808 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12809 }
12810
12811 case ISD::SIGN_EXTEND:
12812 case ISD::ZERO_EXTEND:
12813 case ISD::SIGN_EXTEND_INREG: {
12814 SDValue NarrowOp = Op->getOperand(0);
12815 auto NarrowVT = NarrowOp.getValueType();
12816 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12817 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12818 NarrowVT = VTSign->getVT();
12819 }
12820 if (!NarrowVT.isByteSized())
12821 return std::nullopt;
12822 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12823
12824 if (SrcIndex >= NarrowByteWidth)
12825 return std::nullopt;
12826 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12827 }
12828
12829 case ISD::SRA:
12830 case ISD::SRL: {
12831 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12832 if (!ShiftOp)
12833 return std::nullopt;
12834
12835 uint64_t BitShift = ShiftOp->getZExtValue();
12836
12837 if (BitShift % 8 != 0)
12838 return std::nullopt;
12839
12840 SrcIndex += BitShift / 8;
12841
12842 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12843 }
12844
12845 default: {
12846 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12847 }
12848 }
12849 llvm_unreachable("fully handled switch");
12850}
12851
12852// For a byte position in the result of an Or, traverse the tree and find the
12853// node (and the byte of the node) which ultimately provides this {Or,
12854// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12855// the byte position of the Op that corresponds with the originally requested
12856 // byte of the Or. \p Depth tracks how many recursive iterations we have
12857// performed. \p StartingIndex is the originally requested byte of the Or
12858static const std::optional<ByteProvider<SDValue>>
12859calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12860 unsigned StartingIndex = 0) {
12861 // Finding Src tree of RHS of or typically requires at least 1 additional
12862 // depth
12863 if (Depth > 6)
12864 return std::nullopt;
12865
12866 unsigned BitWidth = Op.getScalarValueSizeInBits();
12867 if (BitWidth % 8 != 0)
12868 return std::nullopt;
12869 if (Index > BitWidth / 8 - 1)
12870 return std::nullopt;
12871
12872 bool IsVec = Op.getValueType().isVector();
12873 switch (Op.getOpcode()) {
12874 case ISD::OR: {
12875 if (IsVec)
12876 return std::nullopt;
12877
12878 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12879 StartingIndex);
12880 if (!RHS)
12881 return std::nullopt;
12882 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12883 StartingIndex);
12884 if (!LHS)
12885 return std::nullopt;
12886 // A well formed Or will have two ByteProviders for each byte, one of which
12887 // is constant zero
12888 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12889 return std::nullopt;
12890 if (!LHS || LHS->isConstantZero())
12891 return RHS;
12892 if (!RHS || RHS->isConstantZero())
12893 return LHS;
12894 return std::nullopt;
12895 }
12896
12897 case ISD::AND: {
12898 if (IsVec)
12899 return std::nullopt;
12900
12901 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12902 if (!BitMaskOp)
12903 return std::nullopt;
12904
12905 uint32_t BitMask = BitMaskOp->getZExtValue();
12906 // Bits we expect for our StartingIndex
12907 uint32_t IndexMask = 0xFF << (Index * 8);
12908
12909 if ((IndexMask & BitMask) != IndexMask) {
12910 // If the result of the and partially provides the byte, then it
12911 // is not well formatted
12912 if (IndexMask & BitMask)
12913 return std::nullopt;
12914 return ByteProvider<SDValue>::getConstantZero();
12915 }
12916
12917 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12918 }
12919
12920 case ISD::FSHR: {
12921 if (IsVec)
12922 return std::nullopt;
12923
12924 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12925 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12926 if (!ShiftOp || Op.getValueType().isVector())
12927 return std::nullopt;
12928
12929 uint64_t BitsProvided = Op.getValueSizeInBits();
12930 if (BitsProvided % 8 != 0)
12931 return std::nullopt;
12932
12933 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12934 if (BitShift % 8)
12935 return std::nullopt;
12936
12937 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12938 uint64_t ByteShift = BitShift / 8;
12939
12940 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12941 uint64_t BytesProvided = BitsProvided / 8;
12942 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12943 NewIndex %= BytesProvided;
12944 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12945 }
12946
12947 case ISD::SRA:
12948 case ISD::SRL: {
12949 if (IsVec)
12950 return std::nullopt;
12951
12952 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12953 if (!ShiftOp)
12954 return std::nullopt;
12955
12956 uint64_t BitShift = ShiftOp->getZExtValue();
12957 if (BitShift % 8)
12958 return std::nullopt;
12959
12960 auto BitsProvided = Op.getScalarValueSizeInBits();
12961 if (BitsProvided % 8 != 0)
12962 return std::nullopt;
12963
12964 uint64_t BytesProvided = BitsProvided / 8;
12965 uint64_t ByteShift = BitShift / 8;
12966 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12967 // If the byte we are trying to provide (as tracked by index) falls in this
12968 // range, then the SRL provides the byte. The byte of interest of the src of
12969 // the SRL is Index + ByteShift
12970 return BytesProvided - ByteShift > Index
12971 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12972 Index + ByteShift)
12973 : ByteProvider<SDValue>::getConstantZero();
12974 }
12975
12976 case ISD::SHL: {
12977 if (IsVec)
12978 return std::nullopt;
12979
12980 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12981 if (!ShiftOp)
12982 return std::nullopt;
12983
12984 uint64_t BitShift = ShiftOp->getZExtValue();
12985 if (BitShift % 8 != 0)
12986 return std::nullopt;
12987 uint64_t ByteShift = BitShift / 8;
12988
12989 // If we are shifting by an amount greater than (or equal to)
12990 // the index we are trying to provide, then it provides 0s. If not,
12991 // then these bytes are not definitively 0s, and the corresponding byte
12992 // of interest is Index - ByteShift of the src
12993 return Index < ByteShift
12994 ? ByteProvider<SDValue>::getConstantZero()
12995 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12996 Depth + 1, StartingIndex);
12997 }
12998 case ISD::ANY_EXTEND:
12999 case ISD::SIGN_EXTEND:
13000 case ISD::ZERO_EXTEND:
13001 case ISD::SIGN_EXTEND_INREG:
13002 case ISD::AssertZext:
13003 case ISD::AssertSext: {
13004 if (IsVec)
13005 return std::nullopt;
13006
13007 SDValue NarrowOp = Op->getOperand(0);
13008 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13009 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13010 Op->getOpcode() == ISD::AssertZext ||
13011 Op->getOpcode() == ISD::AssertSext) {
13012 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13013 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13014 }
13015 if (NarrowBitWidth % 8 != 0)
13016 return std::nullopt;
13017 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13018
13019 if (Index >= NarrowByteWidth)
13020 return Op.getOpcode() == ISD::ZERO_EXTEND
13021 ? std::optional<ByteProvider<SDValue>>(
13022 ByteProvider<SDValue>::getConstantZero())
13023 : std::nullopt;
13024 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13025 }
13026
13027 case ISD::TRUNCATE: {
13028 if (IsVec)
13029 return std::nullopt;
13030
13031 uint64_t NarrowByteWidth = BitWidth / 8;
13032
13033 if (NarrowByteWidth >= Index) {
13034 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13035 StartingIndex);
13036 }
13037
13038 return std::nullopt;
13039 }
13040
13041 case ISD::CopyFromReg: {
13042 if (BitWidth / 8 > Index)
13043 return calculateSrcByte(Op, StartingIndex, Index);
13044
13045 return std::nullopt;
13046 }
13047
13048 case ISD::LOAD: {
13049 auto *L = cast<LoadSDNode>(Op.getNode());
13050
13051 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13052 if (NarrowBitWidth % 8 != 0)
13053 return std::nullopt;
13054 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13055
13056 // If the width of the load does not reach the byte we are trying to provide for
13057 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13058 // question
13059 if (Index >= NarrowByteWidth) {
13060 return L->getExtensionType() == ISD::ZEXTLOAD
13061 ? std::optional<ByteProvider<SDValue>>(
13062 ByteProvider<SDValue>::getConstantZero())
13063 : std::nullopt;
13064 }
13065
13066 if (NarrowByteWidth > Index) {
13067 return calculateSrcByte(Op, StartingIndex, Index);
13068 }
13069
13070 return std::nullopt;
13071 }
13072
13073 case ISD::BSWAP: {
13074 if (IsVec)
13075 return std::nullopt;
13076
13077 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13078 Depth + 1, StartingIndex);
13079 }
13080
13081 case ISD::EXTRACT_VECTOR_ELT: {
13082 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13083 if (!IdxOp)
13084 return std::nullopt;
13085 auto VecIdx = IdxOp->getZExtValue();
13086 auto ScalarSize = Op.getScalarValueSizeInBits();
13087 if (ScalarSize < 32)
13088 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13089 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13090 StartingIndex, Index);
13091 }
13092
13093 case AMDGPUISD::PERM: {
13094 if (IsVec)
13095 return std::nullopt;
13096
13097 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13098 if (!PermMask)
13099 return std::nullopt;
13100
13101 auto IdxMask =
13102 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13103 if (IdxMask > 0x07 && IdxMask != 0x0c)
13104 return std::nullopt;
13105
13106 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13107 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13108
13109 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13110 : ByteProvider<SDValue>(
13111 ByteProvider<SDValue>::getConstantZero());
13112 }
13113
13114 default: {
13115 return std::nullopt;
13116 }
13117 }
13118
13119 llvm_unreachable("fully handled switch");
13120}
13121
13122// Returns true if the Operand is a scalar and is 16 bits
13123static bool isExtendedFrom16Bits(SDValue &Operand) {
13124
13125 switch (Operand.getOpcode()) {
13126 case ISD::ANY_EXTEND:
13127 case ISD::SIGN_EXTEND:
13128 case ISD::ZERO_EXTEND: {
13129 auto OpVT = Operand.getOperand(0).getValueType();
13130 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13131 }
13132 case ISD::LOAD: {
13133 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13134 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13135 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13136 ExtType == ISD::EXTLOAD) {
13137 auto MemVT = L->getMemoryVT();
13138 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13139 }
13140 return L->getMemoryVT().getSizeInBits() == 16;
13141 }
13142 default:
13143 return false;
13144 }
13145}
13146
13147// Returns true if the mask matches consecutive bytes, and the first byte
13148 // begins at an even (16-bit aligned) byte offset from the 0th byte.
13149static bool addresses16Bits(int Mask) {
13150 int Low8 = Mask & 0xff;
13151 int Hi8 = (Mask & 0xff00) >> 8;
13152
13153 assert(Low8 < 8 && Hi8 < 8);
13154 // Are the bytes contiguous in the order of increasing addresses.
13155 bool IsConsecutive = (Hi8 - Low8 == 1);
13156 // Is the first byte at a location that is aligned for 16 bit instructions.
13157 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13158 // In this case, we still need code to extract the 16 bit operand, so it
13159 // is better to use i8 v_perm
13160 bool Is16Aligned = !(Low8 % 2);
13161
13162 return IsConsecutive && Is16Aligned;
13163}
13164
13165// Do not lower into v_perm if the operands are actually 16 bit
13166// and the selected bits (based on PermMask) correspond with two
13167// easily addressable 16 bit operands.
13168 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13169 SDValue &OtherOp) {
13170 int Low16 = PermMask & 0xffff;
13171 int Hi16 = (PermMask & 0xffff0000) >> 16;
13172
13173 auto TempOp = peekThroughBitcasts(Op);
13174 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13175
13176 auto OpIs16Bit =
13177 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13178 if (!OpIs16Bit)
13179 return true;
13180
13181 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13182 isExtendedFrom16Bits(TempOtherOp);
13183 if (!OtherOpIs16Bit)
13184 return true;
13185
13186 // Do we cleanly address both
13187 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13188}
13189
13190 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13191 unsigned DWordOffset) {
13192 SDValue Ret;
13193
13194 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13195 // ByteProvider must be at least 8 bits
13196 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13197
13198 if (TypeSize <= 32)
13199 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13200
13201 if (Src.getValueType().isVector()) {
13202 auto ScalarTySize = Src.getScalarValueSizeInBits();
13203 auto ScalarTy = Src.getValueType().getScalarType();
13204 if (ScalarTySize == 32) {
13205 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13206 DAG.getConstant(DWordOffset, SL, MVT::i32));
13207 }
13208 if (ScalarTySize > 32) {
13209 Ret = DAG.getNode(
13210 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13211 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13212 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13213 if (ShiftVal)
13214 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13215 DAG.getConstant(ShiftVal, SL, MVT::i32));
13216 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13217 }
13218
13219 assert(ScalarTySize < 32);
13220 auto NumElements = TypeSize / ScalarTySize;
13221 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13222 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13223 auto NumElementsIn32 = 32 / ScalarTySize;
13224 auto NumAvailElements = DWordOffset < Trunc32Elements
13225 ? NumElementsIn32
13226 : NumElements - NormalizedTrunc;
13227
13228    SmallVector<SDValue, 4> VecSrcs;
13229    DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13230 NumAvailElements);
13231
13232 Ret = DAG.getBuildVector(
13233 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13234 VecSrcs);
13235 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13236 }
13237
13238 /// Scalar Type
13239 auto ShiftVal = 32 * DWordOffset;
13240 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13241 DAG.getConstant(ShiftVal, SL, MVT::i32));
13242 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13243}
13244
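// Try to prove that each byte of this i32 OR is produced by selecting a byte
// from at most two 32-bit sources, and if so rewrite it as a single
// AMDGPUISD::PERM with the corresponding byte-select mask.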
13245static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13246  SelectionDAG &DAG = DCI.DAG;
13247 [[maybe_unused]] EVT VT = N->getValueType(0);
13248  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13249
13250 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13251 assert(VT == MVT::i32);
13252 for (int i = 0; i < 4; i++) {
13253 // Find the ByteProvider that provides the ith byte of the result of OR
13254 std::optional<ByteProvider<SDValue>> P =
13255 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13256 // TODO support constantZero
13257 if (!P || P->isConstantZero())
13258 return SDValue();
13259
13260 PermNodes.push_back(*P);
13261 }
13262 if (PermNodes.size() != 4)
13263 return SDValue();
13264
13265 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13266 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13267 uint64_t PermMask = 0x00000000;
13268 for (size_t i = 0; i < PermNodes.size(); i++) {
13269 auto PermOp = PermNodes[i];
13270 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13271 // by sizeof(Src2) = 4
13272 int SrcByteAdjust = 4;
13273
13274    // If the Src uses a byte from a different DWORD, then it corresponds
13275    // with a different source
13276 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13277 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13278 if (SecondSrc)
13279 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13280 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13281 return SDValue();
13282
13283 // Set the index of the second distinct Src node
13284 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13285 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13286 SrcByteAdjust = 0;
13287 }
13288 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13290 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13291 }
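  // For example, if all four bytes come from the first source in their original
  // order, the mask is 0x07060504 and the single-source check below simply
  // returns Op.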
13292 SDLoc DL(N);
13293 SDValue Op = *PermNodes[FirstSrc.first].Src;
13294 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13295 assert(Op.getValueSizeInBits() == 32);
13296
13297 // Check that we are not just extracting the bytes in order from an op
13298 if (!SecondSrc) {
13299 int Low16 = PermMask & 0xffff;
13300 int Hi16 = (PermMask & 0xffff0000) >> 16;
13301
13302 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13303 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13304
13305 // The perm op would really just produce Op. So combine into Op
13306 if (WellFormedLow && WellFormedHi)
13307 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13308 }
13309
13310 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13311
13312 if (SecondSrc) {
13313 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13314 assert(OtherOp.getValueSizeInBits() == 32);
13315 }
13316
13317 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13318
13319 assert(Op.getValueType().isByteSized() &&
13320 OtherOp.getValueType().isByteSized());
13321
13322 // If the ultimate src is less than 32 bits, then we will only be
13323 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13324 // CalculateByteProvider would not have returned Op as source if we
13325 // used a byte that is outside its ValueType. Thus, we are free to
13326 // ANY_EXTEND as the extended bits are dont-cares.
13327 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13328 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13329
13330 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13331 DAG.getConstant(PermMask, DL, MVT::i32));
13332 }
13333 return SDValue();
13334}
13335
13336SDValue SITargetLowering::performOrCombine(SDNode *N,
13337 DAGCombinerInfo &DCI) const {
13338 SelectionDAG &DAG = DCI.DAG;
13339 SDValue LHS = N->getOperand(0);
13340 SDValue RHS = N->getOperand(1);
13341
13342 EVT VT = N->getValueType(0);
13343 if (VT == MVT::i1) {
13344 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13345 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13346 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13347 SDValue Src = LHS.getOperand(0);
13348 if (Src != RHS.getOperand(0))
13349 return SDValue();
13350
13351 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13352 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13353 if (!CLHS || !CRHS)
13354 return SDValue();
13355
13356 // Only 10 bits are used.
13357 static const uint32_t MaxMask = 0x3ff;
13358
13359 uint32_t NewMask =
13360 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13361 SDLoc DL(N);
13362 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13363 DAG.getConstant(NewMask, DL, MVT::i32));
13364 }
13365
13366 return SDValue();
13367 }
13368
13369 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13370 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13371 LHS.getOpcode() == AMDGPUISD::PERM &&
13372 isa<ConstantSDNode>(LHS.getOperand(2))) {
13373 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13374 if (!Sel)
13375 return SDValue();
13376
13377 Sel |= LHS.getConstantOperandVal(2);
13378 SDLoc DL(N);
13379 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13380 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13381 }
13382
13383 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13384  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13385  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13386 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13387
13388 // If all the uses of an or need to extract the individual elements, do not
13389 // attempt to lower into v_perm
13390 auto usesCombinedOperand = [](SDNode *OrUse) {
13391 // If we have any non-vectorized use, then it is a candidate for v_perm
13392 if (OrUse->getOpcode() != ISD::BITCAST ||
13393 !OrUse->getValueType(0).isVector())
13394 return true;
13395
13396 // If we have any non-vectorized use, then it is a candidate for v_perm
13397 for (auto *VUser : OrUse->users()) {
13398 if (!VUser->getValueType(0).isVector())
13399 return true;
13400
13401 // If the use of a vector is a store, then combining via a v_perm
13402 // is beneficial.
13403 // TODO -- whitelist more uses
13404 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13405 if (VUser->getOpcode() == VectorwiseOp)
13406 return true;
13407 }
13408 return false;
13409 };
13410
13411 if (!any_of(N->users(), usesCombinedOperand))
13412 return SDValue();
13413
13414 uint32_t LHSMask = getPermuteMask(LHS);
13415 uint32_t RHSMask = getPermuteMask(RHS);
13416
13417 if (LHSMask != ~0u && RHSMask != ~0u) {
13418 // Canonicalize the expression in an attempt to have fewer unique masks
13419 // and therefore fewer registers used to hold the masks.
13420 if (LHSMask > RHSMask) {
13421 std::swap(LHSMask, RHSMask);
13422 std::swap(LHS, RHS);
13423 }
13424
13425      // Select 0xc for each lane used from the source operand. Zero has the
13426      // 0xc mask set, 0xff has 0xff in the mask, and actual lanes are in the
13427      // 0-3 range.
13427 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13428 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13429
13430      // Check if we need to combine values from two sources within a byte.
13431 if (!(LHSUsedLanes & RHSUsedLanes) &&
13432          // If we select the high and low words, keep it for SDWA.
13433 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13434 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13435 // Kill zero bytes selected by other mask. Zero value is 0xc.
13436 LHSMask &= ~RHSUsedLanes;
13437 RHSMask &= ~LHSUsedLanes;
13438 // Add 4 to each active LHS lane
13439 LHSMask |= LHSUsedLanes & 0x04040404;
13440 // Combine masks
13441 uint32_t Sel = LHSMask | RHSMask;
13442 SDLoc DL(N);
13443
13444 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13445 RHS.getOperand(0),
13446 DAG.getConstant(Sel, DL, MVT::i32));
13447 }
13448 }
13449 if (LHSMask == ~0u || RHSMask == ~0u) {
13450 if (SDValue Perm = matchPERM(N, DCI))
13451 return Perm;
13452 }
13453 }
13454
13455 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13456 return SDValue();
13457
13458 // TODO: This could be a generic combine with a predicate for extracting the
13459 // high half of an integer being free.
13460
13461 // (or i64:x, (zero_extend i32:y)) ->
13462 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13463 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13464 RHS.getOpcode() != ISD::ZERO_EXTEND)
13465 std::swap(LHS, RHS);
13466
13467 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13468 SDValue ExtSrc = RHS.getOperand(0);
13469 EVT SrcVT = ExtSrc.getValueType();
13470 if (SrcVT == MVT::i32) {
13471 SDLoc SL(N);
13472 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
13473 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
13474
13475 DCI.AddToWorklist(LowOr.getNode());
13476 DCI.AddToWorklist(HiBits.getNode());
13477
13478 SDValue Vec =
13479 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
13480 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13481 }
13482 }
13483
13484 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13485 if (CRHS) {
13486 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
13487 N->getOperand(0), CRHS))
13488 return Split;
13489 }
13490
13491 return SDValue();
13492}
13493
13494SDValue SITargetLowering::performXorCombine(SDNode *N,
13495 DAGCombinerInfo &DCI) const {
13496 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13497 return RV;
13498
13499 SDValue LHS = N->getOperand(0);
13500 SDValue RHS = N->getOperand(1);
13501
13502 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13503 SelectionDAG &DAG = DCI.DAG;
13504
13505 EVT VT = N->getValueType(0);
13506 if (CRHS && VT == MVT::i64) {
13507 if (SDValue Split =
13508 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13509 return Split;
13510 }
13511
13512 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13513 // fneg-like xors into 64-bit select.
13514 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13515 // This looks like an fneg, try to fold as a source modifier.
13516 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13517 shouldFoldFNegIntoSrc(N, LHS)) {
13518 // xor (select c, a, b), 0x80000000 ->
13519 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13520 SDLoc DL(N);
13521 SDValue CastLHS =
13522 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13523 SDValue CastRHS =
13524 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13525 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13526 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13527 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13528 LHS->getOperand(0), FNegLHS, FNegRHS);
13529 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13530 }
13531 }
13532
13533 return SDValue();
13534}
13535
13536SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13537 DAGCombinerInfo &DCI) const {
13538 if (!Subtarget->has16BitInsts() ||
13539 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13540 return SDValue();
13541
13542 EVT VT = N->getValueType(0);
13543 if (VT != MVT::i32)
13544 return SDValue();
13545
13546 SDValue Src = N->getOperand(0);
13547 if (Src.getValueType() != MVT::i16)
13548 return SDValue();
13549
13550 return SDValue();
13551}
13552
13553SDValue
13554SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13555 DAGCombinerInfo &DCI) const {
13556 SDValue Src = N->getOperand(0);
13557 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13558
13559 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13560 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13561 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13562 VTSign->getVT() == MVT::i8) ||
13563 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13564 VTSign->getVT() == MVT::i16))) {
13565 assert(Subtarget->hasScalarSubwordLoads() &&
13566 "s_buffer_load_{u8, i8} are supported "
13567 "in GFX12 (or newer) architectures.");
13568 EVT VT = Src.getValueType();
13569 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13570                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
13571                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
13572    SDLoc DL(N);
13573 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13574 SDValue Ops[] = {
13575 Src.getOperand(0), // source register
13576 Src.getOperand(1), // offset
13577 Src.getOperand(2) // cachePolicy
13578 };
13579 auto *M = cast<MemSDNode>(Src);
13580 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13581 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13582 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13583 return LoadVal;
13584 }
13585 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13586 VTSign->getVT() == MVT::i8) ||
13587 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13588 VTSign->getVT() == MVT::i16)) &&
13589 Src.hasOneUse()) {
13590 auto *M = cast<MemSDNode>(Src);
13591 SDValue Ops[] = {Src.getOperand(0), // Chain
13592 Src.getOperand(1), // rsrc
13593 Src.getOperand(2), // vindex
13594 Src.getOperand(3), // voffset
13595 Src.getOperand(4), // soffset
13596 Src.getOperand(5), // offset
13597 Src.getOperand(6), Src.getOperand(7)};
13598 // replace with BUFFER_LOAD_BYTE/SHORT
13599 SDVTList ResList =
13600 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13601 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13602                       ? AMDGPUISD::BUFFER_LOAD_BYTE
13603                       : AMDGPUISD::BUFFER_LOAD_SHORT;
13604    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13605 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13606 return DCI.DAG.getMergeValues(
13607 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13608 }
13609 return SDValue();
13610}
13611
13612SDValue SITargetLowering::performClassCombine(SDNode *N,
13613 DAGCombinerInfo &DCI) const {
13614 SelectionDAG &DAG = DCI.DAG;
13615 SDValue Mask = N->getOperand(1);
13616
13617 // fp_class x, 0 -> false
13618 if (isNullConstant(Mask))
13619 return DAG.getConstant(0, SDLoc(N), MVT::i1);
13620
13621 if (N->getOperand(0).isUndef())
13622 return DAG.getUNDEF(MVT::i1);
13623
13624 return SDValue();
13625}
13626
13627SDValue SITargetLowering::performRcpCombine(SDNode *N,
13628 DAGCombinerInfo &DCI) const {
13629 EVT VT = N->getValueType(0);
13630 SDValue N0 = N->getOperand(0);
13631
13632 if (N0.isUndef()) {
13633 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13634 SDLoc(N), VT);
13635 }
13636
13637 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13638 N0.getOpcode() == ISD::SINT_TO_FP)) {
13639 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13640 N->getFlags());
13641 }
13642
13643 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13644 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13645 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13646 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13647 N->getFlags());
13648 }
13649
13650  return SDValue();
13651}
13652
13653bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13654                                       unsigned MaxDepth) const {
13655 unsigned Opcode = Op.getOpcode();
13656 if (Opcode == ISD::FCANONICALIZE)
13657 return true;
13658
13659 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13660 const auto &F = CFP->getValueAPF();
13661 if (F.isNaN() && F.isSignaling())
13662 return false;
13663 if (!F.isDenormal())
13664 return true;
13665
13666 DenormalMode Mode =
13667 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
13668 return Mode == DenormalMode::getIEEE();
13669 }
13670
13671 // If source is a result of another standard FP operation it is already in
13672 // canonical form.
13673 if (MaxDepth == 0)
13674 return false;
13675
13676 switch (Opcode) {
13677 // These will flush denorms if required.
13678 case ISD::FADD:
13679 case ISD::FSUB:
13680 case ISD::FMUL:
13681 case ISD::FCEIL:
13682 case ISD::FFLOOR:
13683 case ISD::FMA:
13684 case ISD::FMAD:
13685 case ISD::FSQRT:
13686 case ISD::FDIV:
13687 case ISD::FREM:
13688 case ISD::FP_ROUND:
13689 case ISD::FP_EXTEND:
13690 case ISD::FP16_TO_FP:
13691 case ISD::FP_TO_FP16:
13692 case ISD::BF16_TO_FP:
13693 case ISD::FP_TO_BF16:
13694 case ISD::FLDEXP:
13697 case AMDGPUISD::RCP:
13698 case AMDGPUISD::RSQ:
13702 case AMDGPUISD::LOG:
13703 case AMDGPUISD::EXP:
13707 case AMDGPUISD::FRACT:
13714 case AMDGPUISD::SIN_HW:
13715 case AMDGPUISD::COS_HW:
13716 return true;
13717
13718 // It can/will be lowered or combined as a bit operation.
13719 // Need to check their input recursively to handle.
13720 case ISD::FNEG:
13721 case ISD::FABS:
13722 case ISD::FCOPYSIGN:
13723 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13724
13725 case ISD::AND:
13726 if (Op.getValueType() == MVT::i32) {
13727 // Be careful as we only know it is a bitcast floating point type. It
13728 // could be f32, v2f16, we have no way of knowing. Luckily the constant
13729 // value that we optimize for, which comes up in fp32 to bf16 conversions,
13730 // is valid to optimize for all types.
13731 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13732 if (RHS->getZExtValue() == 0xffff0000) {
13733 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13734 }
13735 }
13736 }
13737 break;
13738
13739 case ISD::FSIN:
13740 case ISD::FCOS:
13741 case ISD::FSINCOS:
13742 return Op.getValueType().getScalarType() != MVT::f16;
13743
13744 case ISD::FMINNUM:
13745 case ISD::FMAXNUM:
13746 case ISD::FMINNUM_IEEE:
13747 case ISD::FMAXNUM_IEEE:
13748 case ISD::FMINIMUM:
13749 case ISD::FMAXIMUM:
13750 case ISD::FMINIMUMNUM:
13751 case ISD::FMAXIMUMNUM:
13752 case AMDGPUISD::CLAMP:
13753 case AMDGPUISD::FMED3:
13754 case AMDGPUISD::FMAX3:
13755 case AMDGPUISD::FMIN3:
13756  case AMDGPUISD::FMAXIMUM3:
13757  case AMDGPUISD::FMINIMUM3: {
13758    // FIXME: Shouldn't treat the generic operations differently based on these.
13759    // However, we aren't really required to flush the result from
13760    // minnum/maxnum.
13761
13762 // snans will be quieted, so we only need to worry about denormals.
13763 if (Subtarget->supportsMinMaxDenormModes() ||
13764 // FIXME: denormalsEnabledForType is broken for dynamic
13765 denormalsEnabledForType(DAG, Op.getValueType()))
13766 return true;
13767
13768 // Flushing may be required.
13769    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
13770    // targets we need to check their inputs recursively.
13771
13772 // FIXME: Does this apply with clamp? It's implemented with max.
13773 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13774 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13775 return false;
13776 }
13777
13778 return true;
13779 }
13780 case ISD::SELECT: {
13781 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13782 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13783 }
13784 case ISD::BUILD_VECTOR: {
13785 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13786 SDValue SrcOp = Op.getOperand(i);
13787 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13788 return false;
13789 }
13790
13791 return true;
13792 }
13795 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13796 }
13798 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13799 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13800 }
13801 case ISD::UNDEF:
13802 // Could be anything.
13803 return false;
13804
13805 case ISD::BITCAST:
13806 // TODO: This is incorrect as it loses track of the operand's type. We may
13807 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13808 // same bits that are canonicalized in one type need not be in the other.
13809 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13810 case ISD::TRUNCATE: {
13811 // Hack round the mess we make when legalizing extract_vector_elt
13812 if (Op.getValueType() == MVT::i16) {
13813 SDValue TruncSrc = Op.getOperand(0);
13814 if (TruncSrc.getValueType() == MVT::i32 &&
13815 TruncSrc.getOpcode() == ISD::BITCAST &&
13816 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13817 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13818 }
13819 }
13820 return false;
13821 }
13822  case ISD::INTRINSIC_WO_CHAIN: {
13823    unsigned IntrinsicID = Op.getConstantOperandVal(0);
13824 // TODO: Handle more intrinsics
13825 switch (IntrinsicID) {
13826 case Intrinsic::amdgcn_cvt_pkrtz:
13827 case Intrinsic::amdgcn_cubeid:
13828 case Intrinsic::amdgcn_frexp_mant:
13829 case Intrinsic::amdgcn_fdot2:
13830 case Intrinsic::amdgcn_rcp:
13831 case Intrinsic::amdgcn_rsq:
13832 case Intrinsic::amdgcn_rsq_clamp:
13833 case Intrinsic::amdgcn_rcp_legacy:
13834 case Intrinsic::amdgcn_rsq_legacy:
13835 case Intrinsic::amdgcn_trig_preop:
13836 case Intrinsic::amdgcn_tanh:
13837 case Intrinsic::amdgcn_log:
13838 case Intrinsic::amdgcn_exp2:
13839 case Intrinsic::amdgcn_sqrt:
13840 return true;
13841 default:
13842 break;
13843 }
13844
13845 break;
13846 }
13847 default:
13848 break;
13849 }
13850
13851 // FIXME: denormalsEnabledForType is broken for dynamic
13852 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13853 DAG.isKnownNeverSNaN(Op);
13854}
13855
13856bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13857                                       unsigned MaxDepth) const {
13858 const MachineRegisterInfo &MRI = MF.getRegInfo();
13859 MachineInstr *MI = MRI.getVRegDef(Reg);
13860 unsigned Opcode = MI->getOpcode();
13861
13862 if (Opcode == AMDGPU::G_FCANONICALIZE)
13863 return true;
13864
13865 std::optional<FPValueAndVReg> FCR;
13866 // Constant splat (can be padded with undef) or scalar constant.
13867  if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13868    if (FCR->Value.isSignaling())
13869 return false;
13870 if (!FCR->Value.isDenormal())
13871 return true;
13872
13873 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13874 return Mode == DenormalMode::getIEEE();
13875 }
13876
13877 if (MaxDepth == 0)
13878 return false;
13879
13880 switch (Opcode) {
13881 case AMDGPU::G_FADD:
13882 case AMDGPU::G_FSUB:
13883 case AMDGPU::G_FMUL:
13884 case AMDGPU::G_FCEIL:
13885 case AMDGPU::G_FFLOOR:
13886 case AMDGPU::G_FRINT:
13887 case AMDGPU::G_FNEARBYINT:
13888 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13889 case AMDGPU::G_INTRINSIC_TRUNC:
13890 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13891 case AMDGPU::G_FMA:
13892 case AMDGPU::G_FMAD:
13893 case AMDGPU::G_FSQRT:
13894 case AMDGPU::G_FDIV:
13895 case AMDGPU::G_FREM:
13896 case AMDGPU::G_FPOW:
13897 case AMDGPU::G_FPEXT:
13898 case AMDGPU::G_FLOG:
13899 case AMDGPU::G_FLOG2:
13900 case AMDGPU::G_FLOG10:
13901 case AMDGPU::G_FPTRUNC:
13902 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13903 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13904 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13905 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13906 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13907 return true;
13908 case AMDGPU::G_FNEG:
13909 case AMDGPU::G_FABS:
13910 case AMDGPU::G_FCOPYSIGN:
13911 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13912 case AMDGPU::G_FMINNUM:
13913 case AMDGPU::G_FMAXNUM:
13914 case AMDGPU::G_FMINNUM_IEEE:
13915 case AMDGPU::G_FMAXNUM_IEEE:
13916 case AMDGPU::G_FMINIMUM:
13917 case AMDGPU::G_FMAXIMUM:
13918 case AMDGPU::G_FMINIMUMNUM:
13919 case AMDGPU::G_FMAXIMUMNUM: {
13920 if (Subtarget->supportsMinMaxDenormModes() ||
13921 // FIXME: denormalsEnabledForType is broken for dynamic
13922 denormalsEnabledForType(MRI.getType(Reg), MF))
13923 return true;
13924
13925 [[fallthrough]];
13926 }
13927 case AMDGPU::G_BUILD_VECTOR:
13928 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13929 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13930 return false;
13931 return true;
13932 case AMDGPU::G_INTRINSIC:
13933 case AMDGPU::G_INTRINSIC_CONVERGENT:
13934 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13935 case Intrinsic::amdgcn_fmul_legacy:
13936 case Intrinsic::amdgcn_fmad_ftz:
13937 case Intrinsic::amdgcn_sqrt:
13938 case Intrinsic::amdgcn_fmed3:
13939 case Intrinsic::amdgcn_sin:
13940 case Intrinsic::amdgcn_cos:
13941 case Intrinsic::amdgcn_log:
13942 case Intrinsic::amdgcn_exp2:
13943 case Intrinsic::amdgcn_log_clamp:
13944 case Intrinsic::amdgcn_rcp:
13945 case Intrinsic::amdgcn_rcp_legacy:
13946 case Intrinsic::amdgcn_rsq:
13947 case Intrinsic::amdgcn_rsq_clamp:
13948 case Intrinsic::amdgcn_rsq_legacy:
13949 case Intrinsic::amdgcn_div_scale:
13950 case Intrinsic::amdgcn_div_fmas:
13951 case Intrinsic::amdgcn_div_fixup:
13952 case Intrinsic::amdgcn_fract:
13953 case Intrinsic::amdgcn_cvt_pkrtz:
13954 case Intrinsic::amdgcn_cubeid:
13955 case Intrinsic::amdgcn_cubema:
13956 case Intrinsic::amdgcn_cubesc:
13957 case Intrinsic::amdgcn_cubetc:
13958 case Intrinsic::amdgcn_frexp_mant:
13959 case Intrinsic::amdgcn_fdot2:
13960 case Intrinsic::amdgcn_trig_preop:
13961 case Intrinsic::amdgcn_tanh:
13962 return true;
13963 default:
13964 break;
13965 }
13966
13967 [[fallthrough]];
13968 default:
13969 return false;
13970 }
13971
13972 llvm_unreachable("invalid operation");
13973}
13974
13975// Constant fold canonicalize.
13976SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13977 const SDLoc &SL, EVT VT,
13978 const APFloat &C) const {
13979 // Flush denormals to 0 if not enabled.
13980 if (C.isDenormal()) {
13981 DenormalMode Mode =
13982 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13983 if (Mode == DenormalMode::getPreserveSign()) {
13984 return DAG.getConstantFP(
13985 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13986 }
13987
13988 if (Mode != DenormalMode::getIEEE())
13989 return SDValue();
13990 }
13991
13992 if (C.isNaN()) {
13993 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13994 if (C.isSignaling()) {
13995 // Quiet a signaling NaN.
13996 // FIXME: Is this supposed to preserve payload bits?
13997 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13998 }
13999
14000 // Make sure it is the canonical NaN bitpattern.
14001 //
14002 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14003 // immediate?
14004 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14005 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14006 }
14007
14008 // Already canonical.
14009 return DAG.getConstantFP(C, SL, VT);
14010}
14011
14012static bool vectorEltWillFoldAway(SDValue Op) {
14013  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14014}
14015
14016SDValue
14017SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14018 DAGCombinerInfo &DCI) const {
14019 SelectionDAG &DAG = DCI.DAG;
14020 SDValue N0 = N->getOperand(0);
14021 EVT VT = N->getValueType(0);
14022
14023 // fcanonicalize undef -> qnan
14024 if (N0.isUndef()) {
14025    APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14026    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14027 }
14028
14029 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14030 EVT VT = N->getValueType(0);
14031 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14032 }
14033
14034 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14035 // (fcanonicalize k)
14036 //
14037 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14038
14039 // TODO: This could be better with wider vectors that will be split to v2f16,
14040 // and to consider uses since there aren't that many packed operations.
14041 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14042 isTypeLegal(MVT::v2f16)) {
14043 SDLoc SL(N);
14044 SDValue NewElts[2];
14045 SDValue Lo = N0.getOperand(0);
14046 SDValue Hi = N0.getOperand(1);
14047 EVT EltVT = Lo.getValueType();
14048
14050 for (unsigned I = 0; I != 2; ++I) {
14051 SDValue Op = N0.getOperand(I);
14052 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14053 NewElts[I] =
14054 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14055 } else if (Op.isUndef()) {
14056 // Handled below based on what the other operand is.
14057 NewElts[I] = Op;
14058 } else {
14059 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14060 }
14061 }
14062
14063 // If one half is undef, and one is constant, prefer a splat vector rather
14064 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14065 // cheaper to use and may be free with a packed operation.
14066 if (NewElts[0].isUndef()) {
14067 if (isa<ConstantFPSDNode>(NewElts[1]))
14068 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14069 ? NewElts[1]
14070 : DAG.getConstantFP(0.0f, SL, EltVT);
14071 }
14072
14073 if (NewElts[1].isUndef()) {
14074 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14075 ? NewElts[0]
14076 : DAG.getConstantFP(0.0f, SL, EltVT);
14077 }
14078
14079 return DAG.getBuildVector(VT, SL, NewElts);
14080 }
14081 }
14082
14083 return SDValue();
14084}
14085
14086static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14087 switch (Opc) {
14088 case ISD::FMAXNUM:
14089 case ISD::FMAXNUM_IEEE:
14090 case ISD::FMAXIMUMNUM:
14091 return AMDGPUISD::FMAX3;
14092 case ISD::FMAXIMUM:
14093 return AMDGPUISD::FMAXIMUM3;
14094 case ISD::SMAX:
14095 return AMDGPUISD::SMAX3;
14096 case ISD::UMAX:
14097 return AMDGPUISD::UMAX3;
14098 case ISD::FMINNUM:
14099 case ISD::FMINNUM_IEEE:
14100 case ISD::FMINIMUMNUM:
14101 return AMDGPUISD::FMIN3;
14102 case ISD::FMINIMUM:
14103 return AMDGPUISD::FMINIMUM3;
14104 case ISD::SMIN:
14105 return AMDGPUISD::SMIN3;
14106 case ISD::UMIN:
14107 return AMDGPUISD::UMIN3;
14108 default:
14109 llvm_unreachable("Not a min/max opcode");
14110 }
14111}
14112
14113SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14114 const SDLoc &SL, SDValue Src,
14115 SDValue MinVal,
14116 SDValue MaxVal,
14117 bool Signed) const {
14118
14119 // med3 comes from
14120 // min(max(x, K0), K1), K0 < K1
14121 // max(min(x, K0), K1), K1 < K0
14122 //
14123 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14124 // min/max op.
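  // For example, min(max(x, 2), 7) clamps x to [2, 7] and becomes
  // med3(x, 2, 7).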
14125 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14126 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14127
14128 if (!MinK || !MaxK)
14129 return SDValue();
14130
14131 if (Signed) {
14132 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14133 return SDValue();
14134 } else {
14135 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14136 return SDValue();
14137 }
14138
14139 EVT VT = MinK->getValueType(0);
14140 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14141 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14142 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14143
14144 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14145 // not available, but this is unlikely to be profitable as constants
14146 // will often need to be materialized & extended, especially on
14147 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14148 return SDValue();
14149}
14150
14151static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14152  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14153 return C;
14154
14155 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14156 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14157 return C;
14158 }
14159
14160 return nullptr;
14161}
14162
14163SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14164 const SDLoc &SL, SDValue Op0,
14165 SDValue Op1) const {
14166  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14167  if (!K1)
14168 return SDValue();
14169
14170  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14171  if (!K0)
14172 return SDValue();
14173
14174 // Ordered >= (although NaN inputs should have folded away by now).
14175 if (K0->getValueAPF() > K1->getValueAPF())
14176 return SDValue();
14177
14178 // med3 with a nan input acts like
14179 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14180 //
14181 // So the result depends on whether the IEEE mode bit is enabled or not with a
14182 // signaling nan input.
14183 // ieee=1
14184 // s0 snan: yields s2
14185 // s1 snan: yields s2
14186 // s2 snan: qnan
14187
14188 // s0 qnan: min(s1, s2)
14189 // s1 qnan: min(s0, s2)
14190 // s2 qnan: min(s0, s1)
14191
14192 // ieee=0
14193 // s0 snan: min(s1, s2)
14194 // s1 snan: min(s0, s2)
14195 // s2 snan: qnan
14196
14197 // s0 qnan: min(s1, s2)
14198 // s1 qnan: min(s0, s2)
14199 // s2 qnan: min(s0, s1)
14200 const MachineFunction &MF = DAG.getMachineFunction();
14201  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14202
14203 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14204 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14205  // can only form it if op0 is fmaxnum_ieee when IEEE=1.
14206 EVT VT = Op0.getValueType();
14207 if (Info->getMode().DX10Clamp) {
14208 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14209 // hardware fmed3 behavior converting to a min.
14210 // FIXME: Should this be allowing -0.0?
14211 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14212 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14213 }
14214
14215 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14216 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14217 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14218 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14219 // then give the other result, which is different from med3 with a NaN
14220 // input.
14221 SDValue Var = Op0.getOperand(0);
14222 if (!DAG.isKnownNeverSNaN(Var))
14223 return SDValue();
14224
14225    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14226
14227 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14228 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14229 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14230 SDValue(K0, 0), SDValue(K1, 0));
14231 }
14232 }
14233
14234 return SDValue();
14235}
14236
14237/// \return true if the subtarget supports minimum3 and maximum3 with the given
14238/// base min/max opcode \p Opc for type \p VT.
14239static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14240 EVT VT) {
14241 switch (Opc) {
14242 case ISD::FMINNUM:
14243 case ISD::FMAXNUM:
14244 case ISD::FMINNUM_IEEE:
14245 case ISD::FMAXNUM_IEEE:
14246 case ISD::FMINIMUMNUM:
14247 case ISD::FMAXIMUMNUM:
14250 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14251 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14252 case ISD::FMINIMUM:
14253 case ISD::FMAXIMUM:
14254 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14255 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14256 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14257 case ISD::SMAX:
14258 case ISD::SMIN:
14259 case ISD::UMAX:
14260 case ISD::UMIN:
14261 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14262 default:
14263 return false;
14264 }
14265
14266 llvm_unreachable("not a min/max opcode");
14267}
14268
14269SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14270 DAGCombinerInfo &DCI) const {
14271 SelectionDAG &DAG = DCI.DAG;
14272
14273 EVT VT = N->getValueType(0);
14274 unsigned Opc = N->getOpcode();
14275 SDValue Op0 = N->getOperand(0);
14276 SDValue Op1 = N->getOperand(1);
14277
14278  // Only do this if the inner op has one use since this will just increase
14279 // register pressure for no benefit.
14280
14281 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14282 // max(max(a, b), c) -> max3(a, b, c)
14283 // min(min(a, b), c) -> min3(a, b, c)
14284 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14285 SDLoc DL(N);
14286 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14287 Op0.getOperand(0), Op0.getOperand(1), Op1);
14288 }
14289
14290 // Try commuted.
14291 // max(a, max(b, c)) -> max3(a, b, c)
14292 // min(a, min(b, c)) -> min3(a, b, c)
14293 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14294 SDLoc DL(N);
14295 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14296 Op0, Op1.getOperand(0), Op1.getOperand(1));
14297 }
14298 }
14299
14300 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14301 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14302 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14303 if (SDValue Med3 = performIntMed3ImmCombine(
14304 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14305 return Med3;
14306 }
14307 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14308 if (SDValue Med3 = performIntMed3ImmCombine(
14309 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14310 return Med3;
14311 }
14312
14313 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14314 if (SDValue Med3 = performIntMed3ImmCombine(
14315 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14316 return Med3;
14317 }
14318 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14319 if (SDValue Med3 = performIntMed3ImmCombine(
14320 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14321 return Med3;
14322 }
14323
14324 // if !is_snan(x):
14325 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14326 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14327 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14328 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14329 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14330       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14331       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14332       (Opc == AMDGPUISD::FMIN_LEGACY &&
14333        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14334 (VT == MVT::f32 || VT == MVT::f64 ||
14335 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14336 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14337 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14338 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14339 Op0.hasOneUse()) {
14340 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14341 return Res;
14342 }
14343
14344 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14345 // for some types, but at a higher cost since it's implemented with a 3
14346 // operand form.
14347 const SDNodeFlags Flags = N->getFlags();
14348 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14349 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14350 unsigned NewOpc =
14351        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14352    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14353 }
14354
14355 return SDValue();
14356}
14357
14358static bool isClampZeroToOne(SDValue A, SDValue B) {
14359  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14360 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14361 // FIXME: Should this be allowing -0.0?
14362 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14363 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14364 }
14365 }
14366
14367 return false;
14368}
14369
14370// FIXME: Should only worry about snans for version with chain.
14371SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14372 DAGCombinerInfo &DCI) const {
14373 EVT VT = N->getValueType(0);
14374 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14375 // NaNs. With a NaN input, the order of the operands may change the result.
14376
14377 SelectionDAG &DAG = DCI.DAG;
14378 SDLoc SL(N);
14379
14380 SDValue Src0 = N->getOperand(0);
14381 SDValue Src1 = N->getOperand(1);
14382 SDValue Src2 = N->getOperand(2);
14383
14384 if (isClampZeroToOne(Src0, Src1)) {
14385 // const_a, const_b, x -> clamp is safe in all cases including signaling
14386 // nans.
14387 // FIXME: Should this be allowing -0.0?
14388 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14389 }
14390
14391 const MachineFunction &MF = DAG.getMachineFunction();
14392  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14393
14394 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14395 // handling no dx10-clamp?
14396 if (Info->getMode().DX10Clamp) {
14397 // If NaNs is clamped to 0, we are free to reorder the inputs.
14398
14399 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14400 std::swap(Src0, Src1);
14401
14402 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14403 std::swap(Src1, Src2);
14404
14405 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14406 std::swap(Src0, Src1);
14407
14408 if (isClampZeroToOne(Src1, Src2))
14409 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14410 }
14411
14412 return SDValue();
14413}
14414
14415SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14416 DAGCombinerInfo &DCI) const {
14417 SDValue Src0 = N->getOperand(0);
14418 SDValue Src1 = N->getOperand(1);
14419 if (Src0.isUndef() && Src1.isUndef())
14420 return DCI.DAG.getUNDEF(N->getValueType(0));
14421 return SDValue();
14422}
14423
14424// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14425// expanded into a set of cmp/select instructions.
14426bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14427                                                unsigned NumElem,
14428 bool IsDivergentIdx,
14429 const GCNSubtarget *Subtarget) {
14430  if (UseDivergentRegisterIndexing)
14431    return false;
14432
14433 unsigned VecSize = EltSize * NumElem;
14434
14435  // Sub-dword vectors of 2 dwords or less have a better implementation.
14436 if (VecSize <= 64 && EltSize < 32)
14437 return false;
14438
14439 // Always expand the rest of sub-dword instructions, otherwise it will be
14440 // lowered via memory.
14441 if (EltSize < 32)
14442 return true;
14443
14444 // Always do this if var-idx is divergent, otherwise it will become a loop.
14445 if (IsDivergentIdx)
14446 return true;
14447
14448 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14449 unsigned NumInsts = NumElem /* Number of compares */ +
14450 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
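  // For example, extracting from an 8 x i32 vector takes 8 compares plus
  // 8 cndmasks, i.e. NumInsts == 16.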
14451
14452 // On some architectures (GFX9) movrel is not available and it's better
14453 // to expand.
14454 if (Subtarget->useVGPRIndexMode())
14455 return NumInsts <= 16;
14456
14457 // If movrel is available, use it instead of expanding for vector of 8
14458 // elements.
14459 if (Subtarget->hasMovrel())
14460 return NumInsts <= 15;
14461
14462 return true;
14463}
14464
14465bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14466  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14467 if (isa<ConstantSDNode>(Idx))
14468 return false;
14469
14470 SDValue Vec = N->getOperand(0);
14471 EVT VecVT = Vec.getValueType();
14472 EVT EltVT = VecVT.getVectorElementType();
14473 unsigned EltSize = EltVT.getSizeInBits();
14474 unsigned NumElem = VecVT.getVectorNumElements();
14475
14476  return SITargetLowering::shouldExpandVectorDynExt(
14477      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14478}
14479
14480SDValue
14481SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14482 DAGCombinerInfo &DCI) const {
14483 SDValue Vec = N->getOperand(0);
14484 SelectionDAG &DAG = DCI.DAG;
14485
14486 EVT VecVT = Vec.getValueType();
14487 EVT VecEltVT = VecVT.getVectorElementType();
14488 EVT ResVT = N->getValueType(0);
14489
14490 unsigned VecSize = VecVT.getSizeInBits();
14491 unsigned VecEltSize = VecEltVT.getSizeInBits();
14492
14493 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14495 SDLoc SL(N);
14496 SDValue Idx = N->getOperand(1);
14497 SDValue Elt =
14498 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14499 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14500 }
14501
14502 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14503 // =>
14504 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14505 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14506 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14507 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14508 SDLoc SL(N);
14509 SDValue Idx = N->getOperand(1);
14510 unsigned Opc = Vec.getOpcode();
14511
14512 switch (Opc) {
14513 default:
14514 break;
14515 // TODO: Support other binary operations.
14516 case ISD::FADD:
14517 case ISD::FSUB:
14518 case ISD::FMUL:
14519 case ISD::ADD:
14520 case ISD::UMIN:
14521 case ISD::UMAX:
14522 case ISD::SMIN:
14523 case ISD::SMAX:
14524 case ISD::FMAXNUM:
14525 case ISD::FMINNUM:
14526 case ISD::FMAXNUM_IEEE:
14527 case ISD::FMINNUM_IEEE:
14528 case ISD::FMAXIMUM:
14529 case ISD::FMINIMUM: {
14530 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14531 Vec.getOperand(0), Idx);
14532 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14533 Vec.getOperand(1), Idx);
14534
14535 DCI.AddToWorklist(Elt0.getNode());
14536 DCI.AddToWorklist(Elt1.getNode());
14537 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14538 }
14539 }
14540 }
14541
14542 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14543  if (shouldExpandVectorDynExt(N)) {
14544    SDLoc SL(N);
14545 SDValue Idx = N->getOperand(1);
14546 SDValue V;
14547 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14548 SDValue IC = DAG.getVectorIdxConstant(I, SL);
14549 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14550 if (I == 0)
14551 V = Elt;
14552 else
14553 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14554 }
14555 return V;
14556 }
14557
14558 if (!DCI.isBeforeLegalize())
14559 return SDValue();
14560
14561 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14562 // elements. This exposes more load reduction opportunities by replacing
14563 // multiple small extract_vector_elements with a single 32-bit extract.
14564 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14565 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14566 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14567 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14568
14569 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14570 unsigned EltIdx = BitIndex / 32;
14571 unsigned LeftoverBitIdx = BitIndex % 32;
14572 SDLoc SL(N);
14573
14574 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14575 DCI.AddToWorklist(Cast.getNode());
14576
14577 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14578 DAG.getConstant(EltIdx, SL, MVT::i32));
14579 DCI.AddToWorklist(Elt.getNode());
14580 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14581 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14582 DCI.AddToWorklist(Srl.getNode());
14583
14584 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14585 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14586 DCI.AddToWorklist(Trunc.getNode());
14587
14588 if (VecEltVT == ResVT) {
14589 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14590 }
14591
14592 assert(ResVT.isScalarInteger());
14593 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14594 }
14595
14596 return SDValue();
14597}
14598
14599SDValue
14600SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14601 DAGCombinerInfo &DCI) const {
14602 SDValue Vec = N->getOperand(0);
14603 SDValue Idx = N->getOperand(2);
14604 EVT VecVT = Vec.getValueType();
14605 EVT EltVT = VecVT.getVectorElementType();
14606
14607 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14608 // => BUILD_VECTOR n x select (e, const-idx)
14609  if (!shouldExpandVectorDynExt(N))
14610    return SDValue();
14611
14612 SelectionDAG &DAG = DCI.DAG;
14613 SDLoc SL(N);
14614 SDValue Ins = N->getOperand(1);
14615 EVT IdxVT = Idx.getValueType();
14616
14617  SmallVector<SDValue, 16> Ops;
14618  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14619 SDValue IC = DAG.getConstant(I, SL, IdxVT);
14620 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14621 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14622 Ops.push_back(V);
14623 }
14624
14625 return DAG.getBuildVector(VecVT, SL, Ops);
14626}
14627
14628/// Return the source of an fp_extend from f16 to f32, or a converted FP
14629/// constant.
14630static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14631  if (Src.getOpcode() == ISD::FP_EXTEND &&
14632 Src.getOperand(0).getValueType() == MVT::f16) {
14633 return Src.getOperand(0);
14634 }
14635
14636 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14637 APFloat Val = CFP->getValueAPF();
14638 bool LosesInfo = true;
14639    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14640    if (!LosesInfo)
14641 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14642 }
14643
14644 return SDValue();
14645}
14646
14647SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14648 DAGCombinerInfo &DCI) const {
14649 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14650 "combine only useful on gfx8");
14651
14652 SDValue TruncSrc = N->getOperand(0);
14653 EVT VT = N->getValueType(0);
14654 if (VT != MVT::f16)
14655 return SDValue();
14656
14657 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14658 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14659 return SDValue();
14660
14661 SelectionDAG &DAG = DCI.DAG;
14662 SDLoc SL(N);
14663
14664 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14665 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14666 // casting back.
14667
14668 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14669 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14670 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
14671 if (!A)
14672 return SDValue();
14673
14674 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
14675 if (!B)
14676 return SDValue();
14677
14678 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
14679 if (!C)
14680 return SDValue();
14681
14682 // This changes signaling nan behavior. If an input is a signaling nan, it
14683 // would have been quieted by the fpext originally. We don't care because
14684 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14685 // we would be worse off than just doing the promotion.
14686 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
14687 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
14688 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
14689 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14690}
14691
14692unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14693 const SDNode *N0,
14694 const SDNode *N1) const {
14695 EVT VT = N0->getValueType(0);
14696
14697 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14698 // support denormals ever.
14699 if (((VT == MVT::f32 &&
14700        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
14701       (VT == MVT::f16 && Subtarget->hasMadF16() &&
14702        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
14703      isOperationLegal(ISD::FMAD, VT))
14704    return ISD::FMAD;
14705
14706 const TargetOptions &Options = DAG.getTarget().Options;
14707 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
14708 (N0->getFlags().hasAllowContract() &&
14709 N1->getFlags().hasAllowContract())) &&
14710      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
14711    return ISD::FMA;
14712 }
14713
14714 return 0;
14715}
14716
14717// For a reassociatable opcode perform:
14718// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
14719SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14720 SelectionDAG &DAG) const {
14721 EVT VT = N->getValueType(0);
14722 if (VT != MVT::i32 && VT != MVT::i64)
14723 return SDValue();
14724
14725 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
14726 return SDValue();
14727
14728 unsigned Opc = N->getOpcode();
14729 SDValue Op0 = N->getOperand(0);
14730 SDValue Op1 = N->getOperand(1);
14731
14732 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14733 return SDValue();
14734
14735 if (Op0->isDivergent())
14736 std::swap(Op0, Op1);
14737
14738 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14739 return SDValue();
14740
14741 SDValue Op2 = Op1.getOperand(1);
14742 Op1 = Op1.getOperand(0);
14743 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14744 return SDValue();
14745
14746 if (Op1->isDivergent())
14747 std::swap(Op1, Op2);
14748
14749 SDLoc SL(N);
14750 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
14751 return DAG.getNode(Opc, SL, VT, Add1, Op2);
14752}
14753
14754static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14755 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14756  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14757  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
14758 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
14759 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
14760}
14761
14762// Fold
14763// y = lshr i64 x, 32
14764// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
14765// with Const.hi == -1
14766// To
14767// res = mad_u64_u32 y.lo, Const.lo, x.lo
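// This holds because Const.hi == -1 makes (y * Const) == (y * Const.lo) - (y << 32),
// and x - (y << 32) is exactly the low 32 bits of x.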
14768static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14769                                 SDValue MulLHS, SDValue MulRHS,
14770 SDValue AddRHS) {
14771 if (MulRHS.getOpcode() == ISD::SRL)
14772 std::swap(MulLHS, MulRHS);
14773
14774 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14775 return SDValue();
14776
14777 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
14778 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14779 MulLHS.getOperand(0) != AddRHS)
14780 return SDValue();
14781
14782 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
14783 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
14784 return SDValue();
14785
14786 SDValue ConstMul =
14787 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
14788 return getMad64_32(DAG, SL, MVT::i64,
14789 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
14790 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
14791}
14792
14793// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14794// multiplies, if any.
14795//
14796// Full 64-bit multiplies that feed into an addition are lowered here instead
14797// of using the generic expansion. The generic expansion ends up with
14798// a tree of ADD nodes that prevents us from using the "add" part of the
14799// MAD instruction. The expansion produced here results in a chain of ADDs
14800// instead of a tree.
14801SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14802 DAGCombinerInfo &DCI) const {
14803 assert(N->isAnyAdd());
14804
14805 SelectionDAG &DAG = DCI.DAG;
14806 EVT VT = N->getValueType(0);
14807 SDLoc SL(N);
14808 SDValue LHS = N->getOperand(0);
14809 SDValue RHS = N->getOperand(1);
14810
14811 if (VT.isVector())
14812 return SDValue();
14813
14814 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14815 // result in scalar registers for uniform values.
14816 if (!N->isDivergent() && Subtarget->hasSMulHi())
14817 return SDValue();
14818
14819 unsigned NumBits = VT.getScalarSizeInBits();
14820 if (NumBits <= 32 || NumBits > 64)
14821 return SDValue();
14822
14823 if (LHS.getOpcode() != ISD::MUL) {
14824 assert(RHS.getOpcode() == ISD::MUL);
14825 std::swap(LHS, RHS);
14826 }
14827
14828 // Avoid the fold if it would unduly increase the number of multiplies due to
14829 // multiple uses, except on hardware with full-rate multiply-add (which is
14830 // part of full-rate 64-bit ops).
14831 if (!Subtarget->hasFullRate64Ops()) {
14832 unsigned NumUsers = 0;
14833 for (SDNode *User : LHS->users()) {
14834 // There is a use that does not feed into addition, so the multiply can't
14835 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14836 if (!User->isAnyAdd())
14837 return SDValue();
14838
14839 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14840 // MUL + 3xADD + 3xADDC over 3xMAD.
14841 ++NumUsers;
14842 if (NumUsers >= 3)
14843 return SDValue();
14844 }
14845 }
14846
14847 SDValue MulLHS = LHS.getOperand(0);
14848 SDValue MulRHS = LHS.getOperand(1);
14849 SDValue AddRHS = RHS;
14850
14851 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14852 return FoldedMAD;
14853
14854 // Always check whether operands are small unsigned values, since that
14855 // knowledge is useful in more cases. Check for small signed values only if
14856 // doing so can unlock a shorter code sequence.
14857 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14858 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14859
14860 bool MulSignedLo = false;
14861 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14862 MulSignedLo =
14863 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14864 }
14865
14866 // The operands and final result all have the same number of bits. If
14867 // operands need to be extended, they can be extended with garbage. The
14868 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14869 // truncated away in the end.
14870 if (VT != MVT::i64) {
14871 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14872 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14873 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14874 }
14875
14876 // The basic code generated is conceptually straightforward. Pseudo code:
14877 //
14878 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14879 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14880 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14881 //
14882 // The second and third lines are optional, depending on whether the factors
14883 // are {sign,zero}-extended or not.
14884 //
14885 // The actual DAG is noisier than the pseudo code, but only due to
14886 // instructions that disassemble values into low and high parts, and
14887 // assemble the final result.
14888 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14889
14890 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14891 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14892 SDValue Accum =
14893 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14894
14895 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14896 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14897
14898 if (!MulLHSUnsigned32) {
14899 auto MulLHSHi =
14900 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14901 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14902 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14903 }
14904
14905 if (!MulRHSUnsigned32) {
14906 auto MulRHSHi =
14907 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14908 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14909 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14910 }
14911
14912 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14913 Accum = DAG.getBitcast(MVT::i64, Accum);
14914 }
14915
14916 if (VT != MVT::i64)
14917 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14918 return Accum;
14919}
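// Illustrative sketch (not part of the lowering): the pseudo code above,
// checked on plain integers. madU64U32 / mulAdd64ViaMad are hypothetical
// helper names; madU64U32 models the v_mad_u64_u32 primitive and the two
// 32-bit products model the optional high-half fixups.
//
//   #include <cassert>
//   #include <cstdint>
//   static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
//     return (uint64_t)A * B + C;            // 32x32->64 multiply + 64-bit add
//   }
//   static uint64_t mulAdd64ViaMad(uint64_t L, uint64_t R, uint64_t Acc) {
//     uint32_t LLo = (uint32_t)L, LHi = (uint32_t)(L >> 32);
//     uint32_t RLo = (uint32_t)R, RHi = (uint32_t)(R >> 32);
//     uint64_t A = madU64U32(LLo, RLo, Acc); // accum = mad_64_32 lhs.lo, rhs.lo, accum
//     uint32_t AHi = (uint32_t)(A >> 32);
//     AHi += LHi * RLo;                      // accum.hi += mul lhs.hi, rhs.lo
//     AHi += LLo * RHi;                      // accum.hi += mul lhs.lo, rhs.hi
//     return ((uint64_t)AHi << 32) | (uint32_t)A;
//   }
//   int main() {
//     uint64_t X = 0x123456789abcdef0ULL, Y = 0xfedcba9876543210ULL;
//     assert(mulAdd64ViaMad(X, Y, 42) == X * Y + 42); // both wrap mod 2^64
//   }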
14920
14921SDValue
14922SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14923 DAGCombinerInfo &DCI) const {
14924 SDValue RHS = N->getOperand(1);
14925 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14926 if (!CRHS)
14927 return SDValue();
14928
14929 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14930 // common.
14931 uint64_t Val = CRHS->getZExtValue();
14932 if (countr_zero(Val) >= 32) {
14933 SelectionDAG &DAG = DCI.DAG;
14934 SDLoc SL(N);
14935 SDValue LHS = N->getOperand(0);
14936
14937 // Avoid carry machinery if we know the low half of the add does not
14938 // contribute to the final result.
14939 //
14940 // add i64:x, K if computeTrailingZeros(K) >= 32
14941 // => build_pair (add x.hi, K.hi), x.lo
14942
14943 // Breaking the 64-bit add here with this strange constant is unlikely
14944 // to interfere with addressing mode patterns.
14945
14946 SDValue Hi = getHiHalf64(LHS, DAG);
14947 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14948 unsigned Opcode = N->getOpcode();
14949 if (Opcode == ISD::PTRADD)
14950 Opcode = ISD::ADD;
14951 SDValue AddHi =
14952 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14953
14954 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14955 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14956 }
14957
14958 return SDValue();
14959}
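// Illustrative sketch (assumed sample values, not the lowering itself): with
// the low 32 bits of the constant all zero, the low half of the 64-bit add
// passes through untouched and only the high halves are added, which is
// exactly the build_pair produced above.
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     uint64_t X = 0x00000001ffffffffULL;
//     uint64_t K = 0x0000000500000000ULL;   // countr_zero(K) >= 32
//     uint32_t Lo = (uint32_t)X;            // x.lo is unchanged
//     uint32_t Hi = (uint32_t)(X >> 32) + (uint32_t)(K >> 32);
//     assert((((uint64_t)Hi << 32) | Lo) == X + K);
//   }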
14960
14961// Collect the ultimate src of each of the mul node's operands, and confirm
14962 // each operand is at most 8 bits.
14963static std::optional<ByteProvider<SDValue>>
14964handleMulOperand(const SDValue &MulOperand) {
14965 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14966 if (!Byte0 || Byte0->isConstantZero()) {
14967 return std::nullopt;
14968 }
14969 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14970 if (Byte1 && !Byte1->isConstantZero()) {
14971 return std::nullopt;
14972 }
14973 return Byte0;
14974}
14975
14976static unsigned addPermMasks(unsigned First, unsigned Second) {
14977 unsigned FirstCs = First & 0x0c0c0c0c;
14978 unsigned SecondCs = Second & 0x0c0c0c0c;
14979 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14980 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14981
14982 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14983 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14984 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14985 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14986
14987 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14988}
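// Illustrative sketch of the mask merge (addPermMasksRef is a hypothetical
// standalone copy): each byte selector is either a source byte index or 0x0c,
// which v_perm treats as constant zero. The asserts above require that every
// byte position is 0x0c in at least one of the two masks, so the merge keeps
// the selecting byte and keeps 0x0c only where both sides have it.
//
//   #include <cassert>
//   #include <cstdint>
//   static uint32_t addPermMasksRef(uint32_t First, uint32_t Second) {
//     uint32_t Cs = (First & 0x0c0c0c0c) & (Second & 0x0c0c0c0c);
//     return (First & ~0x0c0c0c0c) | (Second & ~0x0c0c0c0c) | Cs;
//   }
//   int main() {
//     // First fills the two high byte lanes, Second the two low ones.
//     assert(addPermMasksRef(0x01000c0c, 0x0c0c0302) == 0x01000302);
//   }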
14989
14990struct DotSrc {
14991 SDValue SrcOp;
14992 int64_t PermMask;
14993 int64_t DWordOffset;
14994 };
14995
14996 static void placeSources(ByteProvider<SDValue> &Src0,
14997 ByteProvider<SDValue> &Src1,
14998 SmallVectorImpl<DotSrc> &Src0s,
14999 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15000
15001 assert(Src0.Src.has_value() && Src1.Src.has_value());
15002 // Src0s and Src1s are empty, just place arbitrarily.
15003 if (Step == 0) {
15004 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15005 Src0.SrcOffset / 4});
15006 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15007 Src1.SrcOffset / 4});
15008 return;
15009 }
15010
15011 for (int BPI = 0; BPI < 2; BPI++) {
15012 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15013 if (BPI == 1) {
15014 BPP = {Src1, Src0};
15015 }
15016 unsigned ZeroMask = 0x0c0c0c0c;
15017 unsigned FMask = 0xFF << (8 * (3 - Step));
15018
15019 unsigned FirstMask =
15020 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15021 unsigned SecondMask =
15022 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15023 // Attempt to find a Src vector which contains our SDValue; if so, add our
15024 // perm mask to the existing one. If we are unable to find a match for the
15025 // first SDValue, attempt to find a match for the second.
15026 int FirstGroup = -1;
15027 for (int I = 0; I < 2; I++) {
15028 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15029 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15030 return IterElt.SrcOp == *BPP.first.Src &&
15031 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15032 };
15033
15034 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15035 if (Match != Srcs.end()) {
15036 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15037 FirstGroup = I;
15038 break;
15039 }
15040 }
15041 if (FirstGroup != -1) {
15042 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15043 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15044 return IterElt.SrcOp == *BPP.second.Src &&
15045 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15046 };
15047 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15048 if (Match != Srcs.end()) {
15049 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15050 } else
15051 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15052 return;
15053 }
15054 }
15055
15056 // If we have made it here, then we could not find a match in Src0s or Src1s
15057 // for either Src0 or Src1, so just place them arbitrarily.
15058
15059 unsigned ZeroMask = 0x0c0c0c0c;
15060 unsigned FMask = 0xFF << (8 * (3 - Step));
15061
15062 Src0s.push_back(
15063 {*Src0.Src,
15064 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15065 Src0.SrcOffset / 4});
15066 Src1s.push_back(
15067 {*Src1.Src,
15068 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15069 Src1.SrcOffset / 4});
15070}
15071
15072 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15073 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15074 bool IsAny) {
15075
15076 // If we have only one source, just permute it accordingly.
15077 if (Srcs.size() == 1) {
15078 auto *Elt = Srcs.begin();
15079 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15080
15081 // v_perm will produce the original value
15082 if (Elt->PermMask == 0x3020100)
15083 return EltOp;
15084
15085 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15086 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15087 }
15088
15089 auto *FirstElt = Srcs.begin();
15090 auto *SecondElt = std::next(FirstElt);
15091
15092 SmallVector<SDValue, 2> Perms;
15093
15094 // If we have multiple sources in the chain, combine them via perms (using
15095 // calculated perm mask) and Ors.
15096 while (true) {
15097 auto FirstMask = FirstElt->PermMask;
15098 auto SecondMask = SecondElt->PermMask;
15099
15100 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15101 unsigned FirstPlusFour = FirstMask | 0x04040404;
15102 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
15103 // original 0x0C.
15104 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15105
15106 auto PermMask = addPermMasks(FirstMask, SecondMask);
15107 auto FirstVal =
15108 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15109 auto SecondVal =
15110 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15111
15112 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15113 SecondVal,
15114 DAG.getConstant(PermMask, SL, MVT::i32)));
15115
15116 FirstElt = std::next(SecondElt);
15117 if (FirstElt == Srcs.end())
15118 break;
15119
15120 SecondElt = std::next(FirstElt);
15121 // If we only have a FirstElt, then just combine that into the cumulative
15122 // source node.
15123 if (SecondElt == Srcs.end()) {
15124 auto EltOp =
15125 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15126
15127 Perms.push_back(
15128 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15129 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15130 break;
15131 }
15132 }
15133
15134 assert(Perms.size() == 1 || Perms.size() == 2);
15135 return Perms.size() == 2
15136 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15137 : Perms[0];
15138}
15139
15140static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15141 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15142 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15143 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15144 EntryMask += ZeroMask;
15145 }
15146}
15147
15148static bool isMul(const SDValue Op) {
15149 auto Opcode = Op.getOpcode();
15150
15151 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15152 Opcode == AMDGPUISD::MUL_I24);
15153}
15154
15155static std::optional<bool>
15156 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15157 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15158 const SDValue &S1Op, const SelectionDAG &DAG) {
15159 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15160 // of the dot4 is irrelevant.
15161 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15162 return false;
15163
15164 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15165 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15166 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15167 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15168 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15169 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15170
15171 assert(!(S0IsUnsigned && S0IsSigned));
15172 assert(!(S1IsUnsigned && S1IsSigned));
15173
15174 // There are 9 possible permutations of
15175 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15176
15177 // In two permutations, the sign bits are known to be the same for both Ops,
15178 // so simply return Signed / Unsigned corresponding to the MSB
15179
15180 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15181 return S0IsSigned;
15182
15183 // In another two permutations, the sign bits are known to be opposite. In
15184 // this case return std::nullopt to indicate a bad match.
15185
15186 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15187 return std::nullopt;
15188
15189 // In the remaining five permutations, we don't know the value of the sign
15190 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15191 // the upper bits must be extension bits. Thus, the only way for the sign
15192 // bit to be unknown is if it was sign extended from an unknown value, or if
15193 // it was any extended. In either case, it is correct to use the signed
15194 // version of the signedness semantics of dot4.
15195
15196 // In two such permutations, we know the sign bit is set for
15197 // one op, and the other is unknown. It is okay to use the signed version of
15198 // dot4.
15199 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15200 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15201 return true;
15202
15203 // In one such permutation, we don't know either of the sign bits. It is okay
15204 // to use the signed version of dot4.
15205 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15206 return true;
15207
15208 // In two such permutations, we know the sign bit is unset for
15209 // one op, and the other is unknown. Return std::nullopt to indicate a
15210 // bad match.
15211 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15212 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15213 return std::nullopt;
15214
15215 llvm_unreachable("Fully covered condition");
15216}
15217
15218SDValue SITargetLowering::performAddCombine(SDNode *N,
15219 DAGCombinerInfo &DCI) const {
15220 SelectionDAG &DAG = DCI.DAG;
15221 EVT VT = N->getValueType(0);
15222 SDLoc SL(N);
15223 SDValue LHS = N->getOperand(0);
15224 SDValue RHS = N->getOperand(1);
15225
15226 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15227 if (Subtarget->hasMad64_32()) {
15228 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15229 return Folded;
15230 }
15231 }
15232
15233 if (SDValue V = reassociateScalarOps(N, DAG)) {
15234 return V;
15235 }
15236
15237 if (VT == MVT::i64) {
15238 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15239 return Folded;
15240 }
15241
15242 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15243 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15244 SDValue TempNode(N, 0);
15245 std::optional<bool> IsSigned;
15246 SmallVector<DotSrc, 4> Src0s;
15247 SmallVector<DotSrc, 4> Src1s;
15248 SmallVector<SDValue, 4> Src2s;
15249
15250 // Match the v_dot4 tree, while collecting src nodes.
15251 int ChainLength = 0;
15252 for (int I = 0; I < 4; I++) {
15253 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15254 if (MulIdx == -1)
15255 break;
15256 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15257 if (!Src0)
15258 break;
15259 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15260 if (!Src1)
15261 break;
15262
15263 auto IterIsSigned = checkDot4MulSignedness(
15264 TempNode->getOperand(MulIdx), *Src0, *Src1,
15265 TempNode->getOperand(MulIdx)->getOperand(0),
15266 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15267 if (!IterIsSigned)
15268 break;
15269 if (!IsSigned)
15270 IsSigned = *IterIsSigned;
15271 if (*IterIsSigned != *IsSigned)
15272 break;
15273 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15274 auto AddIdx = 1 - MulIdx;
15275 // Allow the special case where add (add (mul24, 0), mul24) became
15276 // add (mul24, mul24).
15277 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15278 Src2s.push_back(TempNode->getOperand(AddIdx));
15279 auto Src0 =
15280 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15281 if (!Src0)
15282 break;
15283 auto Src1 =
15284 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15285 if (!Src1)
15286 break;
15287 auto IterIsSigned = checkDot4MulSignedness(
15288 TempNode->getOperand(AddIdx), *Src0, *Src1,
15289 TempNode->getOperand(AddIdx)->getOperand(0),
15290 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15291 if (!IterIsSigned)
15292 break;
15293 assert(IsSigned);
15294 if (*IterIsSigned != *IsSigned)
15295 break;
15296 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15297 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15298 ChainLength = I + 2;
15299 break;
15300 }
15301
15302 TempNode = TempNode->getOperand(AddIdx);
15303 Src2s.push_back(TempNode);
15304 ChainLength = I + 1;
15305 if (TempNode->getNumOperands() < 2)
15306 break;
15307 LHS = TempNode->getOperand(0);
15308 RHS = TempNode->getOperand(1);
15309 }
15310
15311 if (ChainLength < 2)
15312 return SDValue();
15313
15314 // Masks were constructed with the assumption that we would find a chain of
15315 // length 4. If not, then we need to zero out the unused high bytes (via a
15316 // perm mask of 0x0c) so they do not affect the dot calculation.
15317 if (ChainLength < 4) {
15318 fixMasks(Src0s, ChainLength);
15319 fixMasks(Src1s, ChainLength);
15320 }
15321
15322 SDValue Src0, Src1;
15323
15324 // If we are just using a single source for both, and have permuted the
15325 // bytes consistently, we can just use the sources without permuting
15326 // (commutation).
15327 bool UseOriginalSrc = false;
15328 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15329 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15330 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15331 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15332 SmallVector<unsigned, 4> SrcBytes;
15333 auto Src0Mask = Src0s.begin()->PermMask;
15334 SrcBytes.push_back(Src0Mask & 0xFF000000);
15335 bool UniqueEntries = true;
15336 for (auto I = 1; I < 4; I++) {
15337 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15338
15339 if (is_contained(SrcBytes, NextByte)) {
15340 UniqueEntries = false;
15341 break;
15342 }
15343 SrcBytes.push_back(NextByte);
15344 }
15345
15346 if (UniqueEntries) {
15347 UseOriginalSrc = true;
15348
15349 auto *FirstElt = Src0s.begin();
15350 auto FirstEltOp =
15351 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15352
15353 auto *SecondElt = Src1s.begin();
15354 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15355 SecondElt->DWordOffset);
15356
15357 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15358 MVT::getIntegerVT(32));
15359 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15360 MVT::getIntegerVT(32));
15361 }
15362 }
15363
15364 if (!UseOriginalSrc) {
15365 Src0 = resolveSources(DAG, SL, Src0s, false, true);
15366 Src1 = resolveSources(DAG, SL, Src1s, false, true);
15367 }
15368
15369 assert(IsSigned);
15370 SDValue Src2 =
15371 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15372
15373 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15374 : Intrinsic::amdgcn_udot4,
15375 SL, MVT::i64);
15376
15377 assert(!VT.isVector());
15378 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15379 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15380
15381 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15382 }
15383
15384 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15385 return SDValue();
15386
15387 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15388 // add x, sext (setcc) => usubo_carry x, 0, setcc
15389 unsigned Opc = LHS.getOpcode();
15390 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15391 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15392 std::swap(RHS, LHS);
15393
15394 Opc = RHS.getOpcode();
15395 switch (Opc) {
15396 default:
15397 break;
15398 case ISD::ZERO_EXTEND:
15399 case ISD::SIGN_EXTEND:
15400 case ISD::ANY_EXTEND: {
15401 auto Cond = RHS.getOperand(0);
15402 // If this won't be a real VOPC output, we would still need to insert an
15403 // extra instruction anyway.
15404 if (!isBoolSGPR(Cond))
15405 break;
15406 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15407 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15408 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15409 return DAG.getNode(Opc, SL, VTList, Args);
15410 }
15411 case ISD::UADDO_CARRY: {
15412 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15413 if (!isNullConstant(RHS.getOperand(1)))
15414 break;
15415 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15416 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15417 }
15418 }
15419 return SDValue();
15420}
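// Illustrative sketch (plain-integer model, not the DAG nodes) of the two
// rewrites above: a zero-extended i1 adds a carry-in of one, and a
// sign-extended i1 adds -1, i.e. subtracts a borrow of one.
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     uint32_t X = 41;
//     bool CC = true;
//     // add x, zext(cc)  ==  uaddo_carry x, 0, cc
//     assert(X + (CC ? 1u : 0u) == X + 0u + (CC ? 1u : 0u));
//     // add x, sext(cc)  ==  usubo_carry x, 0, cc
//     uint32_t SextCC = CC ? 0xFFFFFFFFu : 0u;
//     assert(X + SextCC == X - 0u - (CC ? 1u : 0u));
//   }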
15421
15422SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15423 DAGCombinerInfo &DCI) const {
15424 SelectionDAG &DAG = DCI.DAG;
15425 SDLoc DL(N);
15426 EVT VT = N->getValueType(0);
15427 SDValue N0 = N->getOperand(0);
15428 SDValue N1 = N->getOperand(1);
15429
15430 // The following folds transform PTRADDs into regular arithmetic in cases
15431 // where the PTRADD wouldn't be folded as an immediate offset into memory
15432 // instructions anyway. They are target-specific in that other targets might
15433 // prefer to not lose information about the pointer arithmetic.
15434
15435 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
15436 // Adapted from DAGCombiner::visitADDLikeCommutative.
15437 SDValue V, K;
15438 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
15439 SDNodeFlags ShlFlags = N1->getFlags();
15440 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
15441 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
15442 // preserved.
15443 SDNodeFlags NewShlFlags =
15444 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
15445 ? SDNodeFlags::NoSignedWrap
15446 : SDNodeFlags();
15447 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
15448 DCI.AddToWorklist(Inner.getNode());
15449 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
15450 }
15451
15452 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
15453 // performAddCombine.
15454 if (N1.getOpcode() == ISD::MUL) {
15455 if (Subtarget->hasMad64_32()) {
15456 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15457 return Folded;
15458 }
15459 }
15460
15461 // If the 32 low bits of the constant are all zero, there is nothing to fold
15462 // into an immediate offset, so it's better to eliminate the unnecessary
15463 // addition for the lower 32 bits than to preserve the PTRADD.
15464 // Analogous to a fold in performAddCombine.
15465 if (VT == MVT::i64) {
15466 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15467 return Folded;
15468 }
15469
15470 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15471 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
15472 // global address GA and constant c, such that c can be folded into GA.
15473 SDValue GAValue = N0.getOperand(0);
15474 if (const GlobalAddressSDNode *GA =
15475 dyn_cast<GlobalAddressSDNode>(GAValue)) {
15476 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15477 // If both additions in the original were NUW, reassociation preserves
15478 // that.
15479 SDNodeFlags Flags =
15480 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15481 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15482 DCI.AddToWorklist(Inner.getNode());
15483 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15484 }
15485 }
15486 }
15487
15488 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15489 return SDValue();
15490
15491 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15492 // y is not, and (add y, z) is used only once.
15493 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15494 // z is not, and (add y, z) is used only once.
15495 // The goal is to move constant offsets to the outermost ptradd, to create
15496 // more opportunities to fold offsets into memory instructions.
15497 // Together with the generic combines in DAGCombiner.cpp, this also
15498 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15499 //
15500 // This transform is here instead of in the general DAGCombiner as it can
15501 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15502 // AArch64's CPA.
15503 SDValue X = N0;
15504 SDValue Y = N1.getOperand(0);
15505 SDValue Z = N1.getOperand(1);
15506 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15507 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15508
15509 // If both additions in the original were NUW, reassociation preserves that.
15510 SDNodeFlags ReassocFlags =
15511 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15512
15513 if (ZIsConstant != YIsConstant) {
15514 if (YIsConstant)
15515 std::swap(Y, Z);
15516 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15517 DCI.AddToWorklist(Inner.getNode());
15518 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15519 }
15520
15521 // If one of Y and Z is constant, they have been handled above. If both were
15522 // constant, the addition would have been folded in SelectionDAG::getNode
15523 // already. This ensures that the generic DAG combines won't undo the
15524 // following reassociation.
15525 assert(!YIsConstant && !ZIsConstant);
15526
15527 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15528 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15529 // y are uniform and z isn't.
15530 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15531 // z are uniform and y isn't.
15532 // The goal is to push uniform operands up in the computation, so that they
15533 // can be handled with scalar operations. We can't use reassociateScalarOps
15534 // for this since it requires two identical commutative operations to
15535 // reassociate.
15536 if (Y->isDivergent())
15537 std::swap(Y, Z);
15538 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15539 DCI.AddToWorklist(UniformInner.getNode());
15540 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15541 }
15542
15543 return SDValue();
15544}
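// Illustrative sketch of the first fold above, as a plain modular-arithmetic
// identity (assumed sample values): adding a shifted negation is the same as
// subtracting the shifted value.
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     uint64_t X = 0x1000, V = 7, K = 4;
//     assert(X + ((0 - V) << K) == X - (V << K)); // both wrap mod 2^64
//   }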
15545
15546SDValue SITargetLowering::performSubCombine(SDNode *N,
15547 DAGCombinerInfo &DCI) const {
15548 SelectionDAG &DAG = DCI.DAG;
15549 EVT VT = N->getValueType(0);
15550
15551 if (VT == MVT::i64) {
15552 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15553 return Folded;
15554 }
15555
15556 if (VT != MVT::i32)
15557 return SDValue();
15558
15559 SDLoc SL(N);
15560 SDValue LHS = N->getOperand(0);
15561 SDValue RHS = N->getOperand(1);
15562
15563 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15564 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15565 unsigned Opc = RHS.getOpcode();
15566 switch (Opc) {
15567 default:
15568 break;
15569 case ISD::ZERO_EXTEND:
15570 case ISD::SIGN_EXTEND:
15571 case ISD::ANY_EXTEND: {
15572 auto Cond = RHS.getOperand(0);
15573 // If this won't be a real VOPC output, we would still need to insert an
15574 // extra instruction anyway.
15575 if (!isBoolSGPR(Cond))
15576 break;
15577 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15578 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15579 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15580 return DAG.getNode(Opc, SL, VTList, Args);
15581 }
15582 }
15583
15584 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15585 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15586 if (!isNullConstant(LHS.getOperand(1)))
15587 return SDValue();
15588 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15589 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15590 }
15591 return SDValue();
15592}
15593
15594SDValue
15595SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15596 DAGCombinerInfo &DCI) const {
15597
15598 if (N->getValueType(0) != MVT::i32)
15599 return SDValue();
15600
15601 if (!isNullConstant(N->getOperand(1)))
15602 return SDValue();
15603
15604 SelectionDAG &DAG = DCI.DAG;
15605 SDValue LHS = N->getOperand(0);
15606
15607 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15608 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15609 unsigned LHSOpc = LHS.getOpcode();
15610 unsigned Opc = N->getOpcode();
15611 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15612 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15613 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15614 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15615 }
15616 return SDValue();
15617}
15618
15619SDValue SITargetLowering::performFAddCombine(SDNode *N,
15620 DAGCombinerInfo &DCI) const {
15621 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15622 return SDValue();
15623
15624 SelectionDAG &DAG = DCI.DAG;
15625 EVT VT = N->getValueType(0);
15626
15627 SDLoc SL(N);
15628 SDValue LHS = N->getOperand(0);
15629 SDValue RHS = N->getOperand(1);
15630
15631 // These should really be instruction patterns, but writing patterns with
15632 // source modifiers is a pain.
15633
15634 // fadd (fadd (a, a), b) -> mad 2.0, a, b
15635 if (LHS.getOpcode() == ISD::FADD) {
15636 SDValue A = LHS.getOperand(0);
15637 if (A == LHS.getOperand(1)) {
15638 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15639 if (FusedOp != 0) {
15640 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15641 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15642 }
15643 }
15644 }
15645
15646 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15647 if (RHS.getOpcode() == ISD::FADD) {
15648 SDValue A = RHS.getOperand(0);
15649 if (A == RHS.getOperand(1)) {
15650 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15651 if (FusedOp != 0) {
15652 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15653 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15654 }
15655 }
15656 }
15657
15658 return SDValue();
15659}
15660
15661SDValue SITargetLowering::performFSubCombine(SDNode *N,
15662 DAGCombinerInfo &DCI) const {
15663 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15664 return SDValue();
15665
15666 SelectionDAG &DAG = DCI.DAG;
15667 SDLoc SL(N);
15668 EVT VT = N->getValueType(0);
15669 assert(!VT.isVector());
15670
15671 // Try to get the fneg to fold into the source modifier. This undoes generic
15672 // DAG combines and folds them into the mad.
15673 //
15674 // Only do this if we are not trying to support denormals. v_mad_f32 does
15675 // not support denormals ever.
15676 SDValue LHS = N->getOperand(0);
15677 SDValue RHS = N->getOperand(1);
15678 if (LHS.getOpcode() == ISD::FADD) {
15679 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15680 SDValue A = LHS.getOperand(0);
15681 if (A == LHS.getOperand(1)) {
15682 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15683 if (FusedOp != 0) {
15684 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15685 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
15686
15687 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15688 }
15689 }
15690 }
15691
15692 if (RHS.getOpcode() == ISD::FADD) {
15693 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15694
15695 SDValue A = RHS.getOperand(0);
15696 if (A == RHS.getOperand(1)) {
15697 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15698 if (FusedOp != 0) {
15699 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
15700 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15701 }
15702 }
15703 }
15704
15705 return SDValue();
15706}
15707
15708SDValue SITargetLowering::performFDivCombine(SDNode *N,
15709 DAGCombinerInfo &DCI) const {
15710 SelectionDAG &DAG = DCI.DAG;
15711 SDLoc SL(N);
15712 EVT VT = N->getValueType(0);
15713 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
15714 return SDValue();
15715
15716 SDValue LHS = N->getOperand(0);
15717 SDValue RHS = N->getOperand(1);
15718
15719 SDNodeFlags Flags = N->getFlags();
15720 SDNodeFlags RHSFlags = RHS->getFlags();
15721 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15722 !RHS->hasOneUse())
15723 return SDValue();
15724
15725 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
15726 bool IsNegative = false;
15727 if (CLHS->isExactlyValue(1.0) ||
15728 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15729 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15730 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15731 if (RHS.getOpcode() == ISD::FSQRT) {
15732 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15733 SDValue Rsq =
15734 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
15735 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
15736 }
15737 }
15738 }
15739
15740 return SDValue();
15741}
15742
15743SDValue SITargetLowering::performFMulCombine(SDNode *N,
15744 DAGCombinerInfo &DCI) const {
15745 SelectionDAG &DAG = DCI.DAG;
15746 EVT VT = N->getValueType(0);
15747 EVT ScalarVT = VT.getScalarType();
15748 EVT IntVT = VT.changeElementType(MVT::i32);
15749
15750 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15751 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15752 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15753 return SDValue();
15754 }
15755
15756 SDValue LHS = N->getOperand(0);
15757 SDValue RHS = N->getOperand(1);
15758
15759 // It is cheaper to realize i32 inline constants than to materialize f16 or
15760 // f64 (or even non-inline f32) values; this is possible via ldexp, as shown
15761 // below:
15762 //
15763 // Given : A = 2^a & B = 2^b ; where a and b are integers.
15764 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
15765 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
15766 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15767 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15768 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
15769 if (!TrueNode)
15770 return SDValue();
15771 const ConstantFPSDNode *FalseNode =
15772 isConstOrConstSplatFP(RHS.getOperand(2));
15773 if (!FalseNode)
15774 return SDValue();
15775
15776 if (TrueNode->isNegative() != FalseNode->isNegative())
15777 return SDValue();
15778
15779 // For f32, only non-inline constants should be transformed.
15780 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15781 if (ScalarVT == MVT::f32 &&
15782 TII->isInlineConstant(TrueNode->getValueAPF()) &&
15783 TII->isInlineConstant(FalseNode->getValueAPF()))
15784 return SDValue();
15785
15786 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15787 if (TrueNodeExpVal == INT_MIN)
15788 return SDValue();
15789 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15790 if (FalseNodeExpVal == INT_MIN)
15791 return SDValue();
15792
15793 SDLoc SL(N);
15794 SDValue SelectNode =
15795 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
15796 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
15797 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
15798
15799 LHS = TrueNode->isNegative()
15800 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
15801 : LHS;
15802
15803 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
15804 }
15805
15806 return SDValue();
15807}
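// Illustrative sketch of the ldexp rewrite above with concrete powers of two
// (assumed values): selecting between 2^3 and 2^-2 and multiplying is the
// same as ldexp with the selected integer exponent.
//
//   #include <cassert>
//   #include <cmath>
//   int main() {
//     float X = 3.5f;
//     bool Y = true;
//     float Mul = X * (Y ? 8.0f : 0.25f);   // fmul x, (select y, A, B)
//     float Ld = std::ldexp(X, Y ? 3 : -2); // ldexp(x, (select i32 y, a, b))
//     assert(Mul == Ld);
//   }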
15808
15809SDValue SITargetLowering::performFMACombine(SDNode *N,
15810 DAGCombinerInfo &DCI) const {
15811 SelectionDAG &DAG = DCI.DAG;
15812 EVT VT = N->getValueType(0);
15813 SDLoc SL(N);
15814
15815 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15816 return SDValue();
15817
15818 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15819 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
15820 SDValue Op1 = N->getOperand(0);
15821 SDValue Op2 = N->getOperand(1);
15822 SDValue FMA = N->getOperand(2);
15823
15824 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15825 Op2.getOpcode() != ISD::FP_EXTEND)
15826 return SDValue();
15827
15828 // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
15829 // zero, regardless of the denorm mode setting. Therefore, fp-contract is
15830 // sufficient to allow generating fdot2.
15831 const TargetOptions &Options = DAG.getTarget().Options;
15832 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15833 (N->getFlags().hasAllowContract() &&
15834 FMA->getFlags().hasAllowContract())) {
15835 Op1 = Op1.getOperand(0);
15836 Op2 = Op2.getOperand(0);
15837 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15838 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15839 return SDValue();
15840
15841 SDValue Vec1 = Op1.getOperand(0);
15842 SDValue Idx1 = Op1.getOperand(1);
15843 SDValue Vec2 = Op2.getOperand(0);
15844
15845 SDValue FMAOp1 = FMA.getOperand(0);
15846 SDValue FMAOp2 = FMA.getOperand(1);
15847 SDValue FMAAcc = FMA.getOperand(2);
15848
15849 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15850 FMAOp2.getOpcode() != ISD::FP_EXTEND)
15851 return SDValue();
15852
15853 FMAOp1 = FMAOp1.getOperand(0);
15854 FMAOp2 = FMAOp2.getOperand(0);
15855 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15856 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15857 return SDValue();
15858
15859 SDValue Vec3 = FMAOp1.getOperand(0);
15860 SDValue Vec4 = FMAOp2.getOperand(0);
15861 SDValue Idx2 = FMAOp1.getOperand(1);
15862
15863 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
15864 // Idx1 and Idx2 cannot be the same.
15865 Idx1 == Idx2)
15866 return SDValue();
15867
15868 if (Vec1 == Vec2 || Vec3 == Vec4)
15869 return SDValue();
15870
15871 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15872 return SDValue();
15873
15874 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15875 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
15876 DAG.getTargetConstant(0, SL, MVT::i1));
15877 }
15878 }
15879 return SDValue();
15880}
15881
15882SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15883 DAGCombinerInfo &DCI) const {
15884 SelectionDAG &DAG = DCI.DAG;
15885 SDLoc SL(N);
15886
15887 SDValue LHS = N->getOperand(0);
15888 SDValue RHS = N->getOperand(1);
15889 EVT VT = LHS.getValueType();
15890 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15891
15892 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15893 if (!CRHS) {
15894 CRHS = dyn_cast<ConstantSDNode>(LHS);
15895 if (CRHS) {
15896 std::swap(LHS, RHS);
15897 CC = getSetCCSwappedOperands(CC);
15898 }
15899 }
15900
15901 if (CRHS) {
15902 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15903 isBoolSGPR(LHS.getOperand(0))) {
15904 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15905 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15906 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
15907 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
15908 if ((CRHS->isAllOnes() &&
15909 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
15910 (CRHS->isZero() &&
15911 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
15912 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15913 DAG.getAllOnesConstant(SL, MVT::i1));
15914 if ((CRHS->isAllOnes() &&
15915 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
15916 (CRHS->isZero() &&
15917 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
15918 return LHS.getOperand(0);
15919 }
15920
15921 const APInt &CRHSVal = CRHS->getAPIntValue();
15922 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15923 LHS.getOpcode() == ISD::SELECT &&
15924 isa<ConstantSDNode>(LHS.getOperand(1)) &&
15925 isa<ConstantSDNode>(LHS.getOperand(2)) &&
15926 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15927 isBoolSGPR(LHS.getOperand(0))) {
15928 // Given CT != FT:
15929 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15930 // setcc (select cc, CT, CF), CF, ne => cc
15931 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15932 // setcc (select cc, CT, CF), CT, eq => cc
15933 const APInt &CT = LHS.getConstantOperandAPInt(1);
15934 const APInt &CF = LHS.getConstantOperandAPInt(2);
15935
15936 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15937 (CT == CRHSVal && CC == ISD::SETNE))
15938 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15939 DAG.getAllOnesConstant(SL, MVT::i1));
15940 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15941 (CT == CRHSVal && CC == ISD::SETEQ))
15942 return LHS.getOperand(0);
15943 }
15944 }
15945
15946 if (VT != MVT::f32 && VT != MVT::f64 &&
15947 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15948 return SDValue();
15949
15950 // Match isinf/isfinite pattern
15951 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15952 // (fcmp one (fabs x), inf) -> (fp_class x,
15953 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15954 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15955 LHS.getOpcode() == ISD::FABS) {
15956 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15957 if (!CRHS)
15958 return SDValue();
15959
15960 const APFloat &APF = CRHS->getValueAPF();
15961 if (APF.isInfinity() && !APF.isNegative()) {
15962 const unsigned IsInfMask =
15963 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15964 const unsigned IsFiniteMask =
15965 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15966 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15967 SIInstrFlags::P_SUBNORMAL;
15968 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15969 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15970 DAG.getConstant(Mask, SL, MVT::i32));
15971 }
15972 }
15973
15974 return SDValue();
15975}
15976
15977SDValue
15978SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15979 DAGCombinerInfo &DCI) const {
15980 SelectionDAG &DAG = DCI.DAG;
15981 SDLoc SL(N);
15982 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15983
15984 SDValue Src = N->getOperand(0);
15985 SDValue Shift = N->getOperand(0);
15986
15987 // TODO: Extend type shouldn't matter (assuming legal types).
15988 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15989 Shift = Shift.getOperand(0);
15990
15991 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15992 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15993 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15994 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15995 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15996 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
15997 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15998 SDValue Shifted = DAG.getZExtOrTrunc(
15999 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16000
16001 unsigned ShiftOffset = 8 * Offset;
16002 if (Shift.getOpcode() == ISD::SHL)
16003 ShiftOffset -= C->getZExtValue();
16004 else
16005 ShiftOffset += C->getZExtValue();
16006
16007 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16008 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16009 MVT::f32, Shifted);
16010 }
16011 }
16012 }
16013
16014 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16015 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16016 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16017 // We simplified Src. If this node is not dead, visit it again so it is
16018 // folded properly.
16019 if (N->getOpcode() != ISD::DELETED_NODE)
16020 DCI.AddToWorklist(N);
16021 return SDValue(N, 0);
16022 }
16023
16024 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16025 if (SDValue DemandedSrc =
16026 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16027 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16028
16029 return SDValue();
16030}
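// Illustrative sketch of the byte-offset bookkeeping above (byteN is a
// hypothetical helper): selecting byte N of (x srl 8k) is byte N+k of x, and
// a shl moves the selection the other way, which is the ShiftOffset
// adjustment computed in the code.
//
//   #include <cassert>
//   #include <cstdint>
//   static uint8_t byteN(uint32_t X, unsigned N) { return (X >> (8 * N)) & 0xff; }
//   int main() {
//     uint32_t X = 0xAABBCCDDu;
//     assert(byteN(X >> 16, 0) == byteN(X, 2)); // cvt_f32_ubyte0 (srl x, 16) -> ubyte2 x
//     assert(byteN(X << 8, 1) == byteN(X, 0));  // cvt_f32_ubyte1 (shl x, 8)  -> ubyte0 x
//   }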
16031
16032SDValue SITargetLowering::performClampCombine(SDNode *N,
16033 DAGCombinerInfo &DCI) const {
16034 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16035 if (!CSrc)
16036 return SDValue();
16037
16038 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16039 const APFloat &F = CSrc->getValueAPF();
16040 APFloat Zero = APFloat::getZero(F.getSemantics());
16041 if (F < Zero ||
16042 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16043 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16044 }
16045
16046 APFloat One(F.getSemantics(), "1.0");
16047 if (F > One)
16048 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16049
16050 return SDValue(CSrc, 0);
16051}
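// Illustrative sketch of the constant folding above (clampConst is a
// hypothetical standalone model): negative constants (and NaN under DX10
// clamp) fold to 0.0, constants above one fold to 1.0, and anything else
// folds to itself.
//
//   #include <cassert>
//   #include <cmath>
//   static float clampConst(float F, bool DX10Clamp) {
//     if (F < 0.0f || (std::isnan(F) && DX10Clamp))
//       return 0.0f;
//     return F > 1.0f ? 1.0f : F;
//   }
//   int main() {
//     assert(clampConst(-2.0f, true) == 0.0f);
//     assert(clampConst(3.0f, true) == 1.0f);
//     assert(clampConst(0.5f, true) == 0.5f);
//     assert(clampConst(std::nanf(""), true) == 0.0f);
//   }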
16052
16053SDValue SITargetLowering::performSelectCombine(SDNode *N,
16054 DAGCombinerInfo &DCI) const {
16055
16056 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16057 // integer).
16058 // Detect when CMP and SELECT use the same constant and fold them to avoid
16059 // loading the constant twice. Specifically handles patterns like:
16060 // %cmp = icmp eq i32 %val, 4242
16061 // %sel = select i1 %cmp, i32 4242, i32 %other
16062 // It can be optimized to reuse %val instead of 4242 in select.
16063 SDValue Cond = N->getOperand(0);
16064 SDValue TrueVal = N->getOperand(1);
16065 SDValue FalseVal = N->getOperand(2);
16066
16067 // Check if condition is a comparison.
16068 if (Cond.getOpcode() != ISD::SETCC)
16069 return SDValue();
16070
16071 SDValue LHS = Cond.getOperand(0);
16072 SDValue RHS = Cond.getOperand(1);
16073 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16074
16075 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16076 bool isInteger = LHS.getValueType().isInteger();
16077
16078 // Handle simple floating-point and integer types only.
16079 if (!isFloatingPoint && !isInteger)
16080 return SDValue();
16081
16082 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16083 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16084 if (!isEquality && !isNonEquality)
16085 return SDValue();
16086
16087 SDValue ArgVal, ConstVal;
16088 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16089 (isInteger && isa<ConstantSDNode>(RHS))) {
16090 ConstVal = RHS;
16091 ArgVal = LHS;
16092 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16093 (isInteger && isa<ConstantSDNode>(LHS))) {
16094 ConstVal = LHS;
16095 ArgVal = RHS;
16096 } else {
16097 return SDValue();
16098 }
16099
16100 // Skip optimization for inlinable immediates.
16101 if (isFloatingPoint) {
16102 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16103 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16104 return SDValue();
16105 } else {
16106 if (AMDGPU::isInlinableIntLiteral(
16107 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16108 return SDValue();
16109 }
16110
16111 // For equality and non-equality comparisons, patterns:
16112 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16113 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16114 if (!(isEquality && TrueVal == ConstVal) &&
16115 !(isNonEquality && FalseVal == ConstVal))
16116 return SDValue();
16117
16118 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16119 SDValue SelectRHS =
16120 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16121 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16122 SelectLHS, SelectRHS);
16123}
16124
16125 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16126 DAGCombinerInfo &DCI) const {
16127 switch (N->getOpcode()) {
16128 case ISD::ADD:
16129 case ISD::SUB:
16130 case ISD::SHL:
16131 case ISD::SRL:
16132 case ISD::SRA:
16133 case ISD::AND:
16134 case ISD::OR:
16135 case ISD::XOR:
16136 case ISD::MUL:
16137 case ISD::SETCC:
16138 case ISD::SELECT:
16139 case ISD::SMIN:
16140 case ISD::SMAX:
16141 case ISD::UMIN:
16142 case ISD::UMAX:
16143 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16144 return Res;
16145 break;
16146 default:
16147 break;
16148 }
16149
16150 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16151 return SDValue();
16152
16153 switch (N->getOpcode()) {
16154 case ISD::ADD:
16155 return performAddCombine(N, DCI);
16156 case ISD::PTRADD:
16157 return performPtrAddCombine(N, DCI);
16158 case ISD::SUB:
16159 return performSubCombine(N, DCI);
16160 case ISD::UADDO_CARRY:
16161 case ISD::USUBO_CARRY:
16162 return performAddCarrySubCarryCombine(N, DCI);
16163 case ISD::FADD:
16164 return performFAddCombine(N, DCI);
16165 case ISD::FSUB:
16166 return performFSubCombine(N, DCI);
16167 case ISD::FDIV:
16168 return performFDivCombine(N, DCI);
16169 case ISD::FMUL:
16170 return performFMulCombine(N, DCI);
16171 case ISD::SETCC:
16172 return performSetCCCombine(N, DCI);
16173 case ISD::SELECT:
16174 if (auto Res = performSelectCombine(N, DCI))
16175 return Res;
16176 break;
16177 case ISD::FMAXNUM:
16178 case ISD::FMINNUM:
16179 case ISD::FMAXNUM_IEEE:
16180 case ISD::FMINNUM_IEEE:
16181 case ISD::FMAXIMUM:
16182 case ISD::FMINIMUM:
16183 case ISD::FMAXIMUMNUM:
16184 case ISD::FMINIMUMNUM:
16185 case ISD::SMAX:
16186 case ISD::SMIN:
16187 case ISD::UMAX:
16188 case ISD::UMIN:
16189 case AMDGPUISD::FMIN_LEGACY:
16190 case AMDGPUISD::FMAX_LEGACY:
16191 return performMinMaxCombine(N, DCI);
16192 case ISD::FMA:
16193 return performFMACombine(N, DCI);
16194 case ISD::AND:
16195 return performAndCombine(N, DCI);
16196 case ISD::OR:
16197 return performOrCombine(N, DCI);
16198 case ISD::FSHR: {
16199 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16200 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16201 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16202 return matchPERM(N, DCI);
16203 }
16204 break;
16205 }
16206 case ISD::XOR:
16207 return performXorCombine(N, DCI);
16208 case ISD::ZERO_EXTEND:
16209 return performZeroExtendCombine(N, DCI);
16210 case ISD::SIGN_EXTEND_INREG:
16211 return performSignExtendInRegCombine(N, DCI);
16212 case AMDGPUISD::FP_CLASS:
16213 return performClassCombine(N, DCI);
16214 case ISD::FCANONICALIZE:
16215 return performFCanonicalizeCombine(N, DCI);
16216 case AMDGPUISD::RCP:
16217 return performRcpCombine(N, DCI);
16218 case ISD::FLDEXP:
16219 case AMDGPUISD::FRACT:
16220 case AMDGPUISD::RSQ:
16223 case AMDGPUISD::RSQ_CLAMP: {
16224 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16225 SDValue Src = N->getOperand(0);
16226 if (Src.isUndef())
16227 return Src;
16228 break;
16229 }
16230 case ISD::SINT_TO_FP:
16231 case ISD::UINT_TO_FP:
16232 return performUCharToFloatCombine(N, DCI);
16233 case ISD::FCOPYSIGN:
16234 return performFCopySignCombine(N, DCI);
16235 case AMDGPUISD::CVT_F32_UBYTE0:
16236 case AMDGPUISD::CVT_F32_UBYTE1:
16237 case AMDGPUISD::CVT_F32_UBYTE2:
16238 case AMDGPUISD::CVT_F32_UBYTE3:
16239 return performCvtF32UByteNCombine(N, DCI);
16240 case AMDGPUISD::FMED3:
16241 return performFMed3Combine(N, DCI);
16242 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16243 return performCvtPkRTZCombine(N, DCI);
16244 case AMDGPUISD::CLAMP:
16245 return performClampCombine(N, DCI);
16246 case ISD::SCALAR_TO_VECTOR: {
16247 SelectionDAG &DAG = DCI.DAG;
16248 EVT VT = N->getValueType(0);
16249
16250 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16251 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16252 SDLoc SL(N);
16253 SDValue Src = N->getOperand(0);
16254 EVT EltVT = Src.getValueType();
16255 if (EltVT != MVT::i16)
16256 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16257
16258 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16259 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16260 }
16261
16262 break;
16263 }
16264 case ISD::EXTRACT_VECTOR_ELT:
16265 return performExtractVectorEltCombine(N, DCI);
16266 case ISD::INSERT_VECTOR_ELT:
16267 return performInsertVectorEltCombine(N, DCI);
16268 case ISD::FP_ROUND:
16269 return performFPRoundCombine(N, DCI);
16270 case ISD::LOAD: {
16271 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16272 return Widened;
16273 [[fallthrough]];
16274 }
16275 default: {
16276 if (!DCI.isBeforeLegalize()) {
16277 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16278 return performMemSDNodeCombine(MemNode, DCI);
16279 }
16280
16281 break;
16282 }
16283 }
16284
16285 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16286 }
16287
16288/// Helper function for adjustWritemask
16289static unsigned SubIdx2Lane(unsigned Idx) {
16290 switch (Idx) {
16291 default:
16292 return ~0u;
16293 case AMDGPU::sub0:
16294 return 0;
16295 case AMDGPU::sub1:
16296 return 1;
16297 case AMDGPU::sub2:
16298 return 2;
16299 case AMDGPU::sub3:
16300 return 3;
16301 case AMDGPU::sub4:
16302 return 4; // Possible with TFE/LWE
16303 }
16304}
16305
16306/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16307SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16308 SelectionDAG &DAG) const {
16309 unsigned Opcode = Node->getMachineOpcode();
16310
16311 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16312 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16313 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16314 return Node; // not implemented for D16
16315
16316 SDNode *Users[5] = {nullptr};
16317 unsigned Lane = 0;
16318 unsigned DmaskIdx =
16319 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16320 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16321 unsigned NewDmask = 0;
16322 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16323 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16324 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16325 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16326 unsigned TFCLane = 0;
16327 bool HasChain = Node->getNumValues() > 1;
16328
16329 if (OldDmask == 0) {
16330 // These are folded out, but on the off chance it happens, don't assert.
16331 return Node;
16332 }
16333
16334 unsigned OldBitsSet = llvm::popcount(OldDmask);
16335 // Work out which is the TFE/LWE lane if that is enabled.
16336 if (UsesTFC) {
16337 TFCLane = OldBitsSet;
16338 }
16339
16340 // Try to figure out the used register components
16341 for (SDUse &Use : Node->uses()) {
16342
16343 // Don't look at users of the chain.
16344 if (Use.getResNo() != 0)
16345 continue;
16346
16347 SDNode *User = Use.getUser();
16348
16349 // Abort if we can't understand the usage
16350 if (!User->isMachineOpcode() ||
16351 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16352 return Node;
16353
16354 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16355 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16356 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16357 // set, etc.
16358 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16359 if (Lane == ~0u)
16360 return Node;
16361
16362 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16363 if (UsesTFC && Lane == TFCLane) {
16364 Users[Lane] = User;
16365 } else {
16366 // Set which texture component corresponds to the lane.
16367 unsigned Comp;
16368 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16369 Comp = llvm::countr_zero(Dmask);
16370 Dmask &= ~(1 << Comp);
16371 }
16372
16373 // Abort if we have more than one user per component.
16374 if (Users[Lane])
16375 return Node;
16376
16377 Users[Lane] = User;
16378 NewDmask |= 1 << Comp;
16379 }
16380 }
16381
16382 // Don't allow 0 dmask, as hardware assumes one channel enabled.
16383 bool NoChannels = !NewDmask;
16384 if (NoChannels) {
16385 if (!UsesTFC) {
16386 // No uses of the result and not using TFC. Then do nothing.
16387 return Node;
16388 }
16389 // If the original dmask has one channel, then there is nothing to do.
16390 if (OldBitsSet == 1)
16391 return Node;
16392 // Use an arbitrary dmask; one is required for the instruction to work.
16393 NewDmask = 1;
16394 }
16395 // Abort if there's no change
16396 if (NewDmask == OldDmask)
16397 return Node;
16398
16399 unsigned BitsSet = llvm::popcount(NewDmask);
16400
16401 // Check for TFE or LWE: increase the number of channels by one to account
16402 // for the extra return value.
16403 // This will need adjustment for D16 if this is also included in
16404 // adjustWriteMask (this function), but at present D16 is excluded.
16405 unsigned NewChannels = BitsSet + UsesTFC;
16406
16407 int NewOpcode =
16408 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
16409 assert(NewOpcode != -1 &&
16410 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16411 "failed to find equivalent MIMG op");
16412
16413 // Adjust the writemask in the node
16414 SmallVector<SDValue, 12> Ops;
16415 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
16416 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
16417 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
16418
16419 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16420
16421 MVT ResultVT = NewChannels == 1
16422 ? SVT
16423 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
16424 : NewChannels == 5 ? 8
16425 : NewChannels);
16426 SDVTList NewVTList =
16427 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
16428
16429 MachineSDNode *NewNode =
16430 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
16431
16432 if (HasChain) {
16433 // Update chain.
16434 DAG.setNodeMemRefs(NewNode, Node->memoperands());
16435 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
16436 }
16437
16438 if (NewChannels == 1) {
16439 assert(Node->hasNUsesOfValue(1, 0));
16440 SDNode *Copy =
16441 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
16442 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
16443 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
16444 return nullptr;
16445 }
16446
16447 // Update the users of the node with the new indices
16448 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16449 SDNode *User = Users[i];
16450 if (!User) {
16451 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
16452 // Users[0] is still nullptr because channel 0 doesn't really have a use.
16453 if (i || !NoChannels)
16454 continue;
16455 } else {
16456 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
16457 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
16458 if (NewUser != User) {
16459 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
16460 DAG.RemoveDeadNode(User);
16461 }
16462 }
16463
16464 switch (Idx) {
16465 default:
16466 break;
16467 case AMDGPU::sub0:
16468 Idx = AMDGPU::sub1;
16469 break;
16470 case AMDGPU::sub1:
16471 Idx = AMDGPU::sub2;
16472 break;
16473 case AMDGPU::sub2:
16474 Idx = AMDGPU::sub3;
16475 break;
16476 case AMDGPU::sub3:
16477 Idx = AMDGPU::sub4;
16478 break;
16479 }
16480 }
16481
16482 DAG.RemoveDeadNode(Node);
16483 return nullptr;
16484}
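// Illustrative sketch, not from this file: the dmask compaction performed by
// adjustWritemask above, restated as a standalone function. The helper name,
// parameters, and the `UsedLanes` input are hypothetical; the real code walks
// the EXTRACT_VECTOR_ELT users of the image node instead.
#include <bit>
#include <vector>

static unsigned compactDmask(unsigned OldDmask,
                             const std::vector<unsigned> &UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    // Map the result lane back to the hardware component it reads by skipping
    // `Lane` enabled bits of the old mask, exactly as the loop above does.
    unsigned Dmask = OldDmask, Comp = 0;
    for (unsigned i = 0; i <= Lane && Dmask != 0; ++i) {
      Comp = std::countr_zero(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  return NewDmask ? NewDmask : 1u; // hardware assumes at least one channel
}

// Example: OldDmask = 0b1011 with only lanes 0 and 2 still used keeps
// components 0 and 3, so the rewritten instruction uses dmask 0b1001 and a
// two-channel result type.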
16485
16486static bool isFrameIndexOp(SDValue Op) {
16487 if (Op.getOpcode() == ISD::AssertZext)
16488 Op = Op.getOperand(0);
16489
16490 return isa<FrameIndexSDNode>(Op);
16491}
16492
16493/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16494/// with frame index operands.
16495/// LLVM assumes that inputs to these instructions are registers.
16496SDNode *
16497SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16498 SelectionDAG &DAG) const {
16499 if (Node->getOpcode() == ISD::CopyToReg) {
16500 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16501 SDValue SrcVal = Node->getOperand(2);
16502
16503 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16504 // to try understanding copies to physical registers.
16505 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16506 SDLoc SL(Node);
16507 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16508 SDValue VReg = DAG.getRegister(
16509 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16510
16511 SDNode *Glued = Node->getGluedNode();
16512 SDValue ToVReg = DAG.getCopyToReg(
16513 Node->getOperand(0), SL, VReg, SrcVal,
16514 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16515 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16516 VReg, ToVReg.getValue(1));
16517 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16518 DAG.RemoveDeadNode(Node);
16519 return ToResultReg.getNode();
16520 }
16521 }
16522
16523 SmallVector<SDValue, 8> Ops;
16524 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16525 if (!isFrameIndexOp(Node->getOperand(i))) {
16526 Ops.push_back(Node->getOperand(i));
16527 continue;
16528 }
16529
16530 SDLoc DL(Node);
16531 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16532 Node->getOperand(i).getValueType(),
16533 Node->getOperand(i)),
16534 0));
16535 }
16536
16537 return DAG.UpdateNodeOperands(Node, Ops);
16538}
16539
16540/// Fold the instructions after selecting them.
16541/// Returns null if users were already updated.
16542SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16543 SelectionDAG &DAG) const {
16544 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16545 unsigned Opcode = Node->getMachineOpcode();
16546
16547 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16548 !TII->isGather4(Opcode) &&
16549 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16550 return adjustWritemask(Node, DAG);
16551 }
16552
16553 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16554 legalizeTargetIndependentNode(Node, DAG);
16555 return Node;
16556 }
16557
16558 switch (Opcode) {
16559 case AMDGPU::V_DIV_SCALE_F32_e64:
16560 case AMDGPU::V_DIV_SCALE_F64_e64: {
16561 // Satisfy the operand register constraint when one of the inputs is
16562 // undefined. Ordinarily each undef value will have its own implicit_def of
16563 // a vreg, so force these to use a single register.
16564 SDValue Src0 = Node->getOperand(1);
16565 SDValue Src1 = Node->getOperand(3);
16566 SDValue Src2 = Node->getOperand(5);
16567
16568 if ((Src0.isMachineOpcode() &&
16569 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16570 (Src0 == Src1 || Src0 == Src2))
16571 break;
16572
16573 MVT VT = Src0.getValueType().getSimpleVT();
16574 const TargetRegisterClass *RC =
16575 getRegClassFor(VT, Src0.getNode()->isDivergent());
16576
16577 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16578 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16579
16580 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16581 Src0, SDValue());
16582
16583 // src0 must be the same register as src1 or src2, even if the value is
16584 // undefined, so make sure we don't violate this constraint.
16585 if (Src0.isMachineOpcode() &&
16586 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16587 if (Src1.isMachineOpcode() &&
16588 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16589 Src0 = Src1;
16590 else if (Src2.isMachineOpcode() &&
16591 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16592 Src0 = Src2;
16593 else {
16594 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16595 Src0 = UndefReg;
16596 Src1 = UndefReg;
16597 }
16598 } else
16599 break;
16600
16601 SmallVector<SDValue, 9> Ops(Node->ops());
16602 Ops[1] = Src0;
16603 Ops[3] = Src1;
16604 Ops[5] = Src2;
16605 Ops.push_back(ImpDef.getValue(1));
16606 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16607 }
16608 default:
16609 break;
16610 }
16611
16612 return Node;
16613}
16614
16615// Any MIMG instructions that use tfe or lwe require an initialization of the
16616// result register that will be written in the case of a memory access failure.
16617// The required code is also added to tie this init code to the result of the
16618// img instruction.
16619void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16621 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16622 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16623 MachineBasicBlock &MBB = *MI.getParent();
16624
16625 int DstIdx =
16626 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16627 unsigned InitIdx = 0;
16628
16629 if (TII->isImage(MI)) {
16630 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16631 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16632 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16633
16634 if (!TFE && !LWE) // intersect_ray
16635 return;
16636
16637 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16638 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16639 unsigned D16Val = D16 ? D16->getImm() : 0;
16640
16641 if (!TFEVal && !LWEVal)
16642 return;
16643
16644 // At least one of TFE or LWE is non-zero.
16645 // We have to insert a suitable initialization of the result value and
16646 // tie this to the dest of the image instruction.
16647
16648 // Calculate which dword we have to initialize to 0.
16649 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16650
16651 // check that dmask operand is found.
16652 assert(MO_Dmask && "Expected dmask operand in instruction");
16653
16654 unsigned dmask = MO_Dmask->getImm();
16655 // Determine the number of active lanes taking into account the
16656 // Gather4 special case
16657 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
16658
16659 bool Packed = !Subtarget->hasUnpackedD16VMem();
16660
16661 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16662
16663 // Abandon attempt if the dst size isn't large enough
16664 // - this is in fact an error but this is picked up elsewhere and
16665 // reported correctly.
16666 uint32_t DstSize =
16667 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16668 if (DstSize < InitIdx)
16669 return;
16670 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
16671 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16672 } else {
16673 return;
16674 }
16675
16676 const DebugLoc &DL = MI.getDebugLoc();
16677
16678 // Create a register for the initialization value.
16679 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16680 unsigned NewDst = 0; // Final initialized value will be in here
16681
16682 // If PRTStrictNull feature is enabled (the default) then initialize
16683 // all the result registers to 0, otherwise just the error indication
16684 // register (VGPRn+1)
16685 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16686 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16687
16688 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
16689 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16690 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16691 // Initialize dword
16692 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
16693 // clang-format off
16694 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
16695 .addImm(0);
16696 // clang-format on
16697 // Insert into the super-reg
16698 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
16699 .addReg(PrevDst)
16700 .addReg(SubReg)
16701 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
16702
16703 PrevDst = NewDst;
16704 }
16705
16706 // Add as an implicit operand
16707 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
16708
16709 // Tie the just added implicit operand to the dst
16710 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
16711}
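// Illustrative sketch, not from this file: how many result dwords the code
// above zero-initializes for a TFE/LWE access. The function and parameter
// names are hypothetical; the real code reads the dmask/d16 operands and the
// PRTStrictNull subtarget feature.
#include <bit>

static unsigned numInitDwords(unsigned Dmask, bool IsGather4, bool D16,
                              bool PackedD16, bool PRTStrictNull) {
  // Gather4 always returns four channels regardless of dmask.
  unsigned ActiveLanes = IsGather4 ? 4 : std::popcount(Dmask);
  // Packed D16 halves the data dwords (rounded up); the +1 is the extra
  // TFE/LWE status dword.
  unsigned InitIdx =
      (D16 && PackedD16) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
  // PRTStrictNull (the default) clears every dword, otherwise only the
  // trailing error-indication dword is cleared.
  return PRTStrictNull ? InitIdx : 1;
}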
16712
16713/// Assign the register class depending on the number of
16714/// bits set in the writemask
16715void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
16716 SDNode *Node) const {
16717 const SIInstrInfo *TII = Subtarget->getInstrInfo();
16718
16719 MachineFunction *MF = MI.getParent()->getParent();
16720 MachineRegisterInfo &MRI = MF->getRegInfo();
16721 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
16722
16723 if (TII->isVOP3(MI.getOpcode())) {
16724 // Make sure constant bus requirements are respected.
16725 TII->legalizeOperandsVOP3(MRI, MI);
16726
16727 // Prefer VGPRs over AGPRs in mAI instructions where possible.
16728 // This saves a chain-copy of registers and better balance register
16729 // use between vgpr and agpr as agpr tuples tend to be big.
16730 if (!MI.getDesc().operands().empty()) {
16731 unsigned Opc = MI.getOpcode();
16732 bool HasAGPRs = Info->mayNeedAGPRs();
16733 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16734 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
16735 for (auto I :
16736 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
16737 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
16738 if (I == -1)
16739 break;
16740 if ((I == Src2Idx) && (HasAGPRs))
16741 break;
16742 MachineOperand &Op = MI.getOperand(I);
16743 if (!Op.isReg() || !Op.getReg().isVirtual())
16744 continue;
16745 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
16746 if (!TRI->hasAGPRs(RC))
16747 continue;
16748 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
16749 if (!Src || !Src->isCopy() ||
16750 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
16751 continue;
16752 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
16753 // All uses of agpr64 and agpr32 can also accept vgpr except for
16754 // v_accvgpr_read, but we do not produce agpr reads during selection,
16755 // so no use checks are needed.
16756 MRI.setRegClass(Op.getReg(), NewRC);
16757 }
16758
16759 if (TII->isMAI(MI)) {
16760 // The ordinary src0, src1, src2 were legalized above.
16761 //
16762 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
16763 // as a separate instruction.
16764 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16765 AMDGPU::OpName::scale_src0);
16766 if (Src0Idx != -1) {
16767 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16768 AMDGPU::OpName::scale_src1);
16769 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
16770 TII->usesConstantBus(MRI, MI, Src1Idx))
16771 TII->legalizeOpWithMove(MI, Src1Idx);
16772 }
16773 }
16774
16775 if (!HasAGPRs)
16776 return;
16777
16778 // Resolve the rest of AV operands to AGPRs.
16779 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
16780 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16781 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
16782 if (TRI->isVectorSuperClass(RC)) {
16783 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
16784 MRI.setRegClass(Src2->getReg(), NewRC);
16785 if (Src2->isTied())
16786 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
16787 }
16788 }
16789 }
16790 }
16791
16792 return;
16793 }
16794
16795 if (TII->isImage(MI))
16796 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
16797}
16798
16799static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
16800 uint64_t Val) {
16801 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
16802 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
16803}
16804
16805MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
16806 const SDLoc &DL,
16807 SDValue Ptr) const {
16808 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16809
16810 // Build the half of the subregister with the constants before building the
16811 // full 128-bit register. If we are building multiple resource descriptors,
16812 // this will allow CSEing of the 2-component register.
16813 const SDValue Ops0[] = {
16814 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
16815 buildSMovImm32(DAG, DL, 0),
16816 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16817 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
16818 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
16819
16820 SDValue SubRegHi = SDValue(
16821 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
16822
16823 // Combine the constants and the pointer.
16824 const SDValue Ops1[] = {
16825 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
16826 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
16827 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
16828
16829 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
16830}
16831
16832/// Return a resource descriptor with the 'Add TID' bit enabled
16833/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16834/// of the resource descriptor) to create an offset, which is added to
16835/// the resource pointer.
16836MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
16837 SDValue Ptr, uint32_t RsrcDword1,
16838 uint64_t RsrcDword2And3) const {
16839 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
16840 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
16841 if (RsrcDword1) {
16842 PtrHi =
16843 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
16844 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
16845 0);
16846 }
16847
16848 SDValue DataLo =
16849 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16850 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
16851
16852 const SDValue Ops[] = {
16853 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
16854 PtrLo,
16855 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16856 PtrHi,
16857 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
16858 DataLo,
16859 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
16860 DataHi,
16861 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
16862
16863 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
16864}
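// Illustrative sketch, not from this file: how the four dwords of the
// resource descriptor assembled by buildRSRC fit together. The packing
// function below is hypothetical; field meanings follow the comment above
// (the stride sits in bits [61:48] of the descriptor).
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> packRsrc(uint64_t Ptr, uint32_t RsrcDword1,
                                        uint64_t RsrcDword2And3) {
  std::array<uint32_t, 4> Rsrc;
  Rsrc[0] = static_cast<uint32_t>(Ptr);                    // sub0: pointer lo
  Rsrc[1] = static_cast<uint32_t>(Ptr >> 32) | RsrcDword1; // sub1: pointer hi | stride/flags
  Rsrc[2] = static_cast<uint32_t>(RsrcDword2And3);         // sub2: dword 2
  Rsrc[3] = static_cast<uint32_t>(RsrcDword2And3 >> 32);   // sub3: dword 3
  return Rsrc;
}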
16865
16866//===----------------------------------------------------------------------===//
16867// SI Inline Assembly Support
16868//===----------------------------------------------------------------------===//
16869
16870std::pair<unsigned, const TargetRegisterClass *>
16871SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
16872 StringRef Constraint,
16873 MVT VT) const {
16874 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16875
16876 const TargetRegisterClass *RC = nullptr;
16877 if (Constraint.size() == 1) {
16878 // Check if we cannot determine the bit size of the given value type. This
16879 // can happen, for example, in this situation where we have an empty struct
16880 // (size 0): `call void asm "", "v"({} poison)`-
16881 if (VT == MVT::Other)
16882 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16883 const unsigned BitWidth = VT.getSizeInBits();
16884 switch (Constraint[0]) {
16885 default:
16886 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16887 case 's':
16888 case 'r':
16889 switch (BitWidth) {
16890 case 16:
16891 RC = &AMDGPU::SReg_32RegClass;
16892 break;
16893 case 64:
16894 RC = &AMDGPU::SGPR_64RegClass;
16895 break;
16896 default:
16897 RC = TRI->getSGPRClassForBitWidth(BitWidth);
16898 if (!RC)
16899 return std::pair(0U, nullptr);
16900 break;
16901 }
16902 break;
16903 case 'v':
16904 switch (BitWidth) {
16905 case 16:
16906 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16907 : &AMDGPU::VGPR_32RegClass;
16908 break;
16909 default:
16910 RC = TRI->getVGPRClassForBitWidth(BitWidth);
16911 if (!RC)
16912 return std::pair(0U, nullptr);
16913 break;
16914 }
16915 break;
16916 case 'a':
16917 if (!Subtarget->hasMAIInsts())
16918 break;
16919 switch (BitWidth) {
16920 case 16:
16921 RC = &AMDGPU::AGPR_32RegClass;
16922 break;
16923 default:
16924 RC = TRI->getAGPRClassForBitWidth(BitWidth);
16925 if (!RC)
16926 return std::pair(0U, nullptr);
16927 break;
16928 }
16929 break;
16930 }
16931 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
16932 const unsigned BitWidth = VT.getSizeInBits();
16933 switch (BitWidth) {
16934 case 16:
16935 RC = &AMDGPU::AV_32RegClass;
16936 break;
16937 default:
16938 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
16939 if (!RC)
16940 return std::pair(0U, nullptr);
16941 break;
16942 }
16943 }
16944
16945 // We actually support i128, i16 and f16 as inline parameters
16946 // even if they are not reported as legal
16947 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
16948 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
16949 return std::pair(0U, RC);
16950
16951 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
16952 if (Kind != '\0') {
16953 if (Kind == 'v') {
16954 RC = &AMDGPU::VGPR_32RegClass;
16955 } else if (Kind == 's') {
16956 RC = &AMDGPU::SGPR_32RegClass;
16957 } else if (Kind == 'a') {
16958 RC = &AMDGPU::AGPR_32RegClass;
16959 }
16960
16961 if (RC) {
16962 if (NumRegs > 1) {
16963 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
16964 return std::pair(0U, nullptr);
16965
16966 uint32_t Width = NumRegs * 32;
16967 // Prohibit constraints for register ranges with a width that does not
16968 // match the required type.
16969 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
16970 return std::pair(0U, nullptr);
16971
16972 MCRegister Reg = RC->getRegister(Idx);
16973 if (SIRegisterInfo::isVGPRClass(RC))
16974 RC = TRI->getVGPRClassForBitWidth(Width);
16975 else if (SIRegisterInfo::isSGPRClass(RC))
16976 RC = TRI->getSGPRClassForBitWidth(Width);
16977 else if (SIRegisterInfo::isAGPRClass(RC))
16978 RC = TRI->getAGPRClassForBitWidth(Width);
16979 if (RC) {
16980 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
16981 if (!Reg) {
16982 // The register class does not contain the requested register,
16983 // e.g., because it is an SGPR pair that would violate alignment
16984 // requirements.
16985 return std::pair(0U, nullptr);
16986 }
16987 return std::pair(Reg, RC);
16988 }
16989 }
16990
16991 // Check for lossy scalar/vector conversions.
16992 if (VT.isVector() && VT.getSizeInBits() != 32)
16993 return std::pair(0U, nullptr);
16994 if (Idx < RC->getNumRegs())
16995 return std::pair(RC->getRegister(Idx), RC);
16996 }
16997 }
16998
16999 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17000 if (Ret.first)
17001 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17002
17003 return Ret;
17004}
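// Illustrative sketch, not from this file: the first step of the constraint
// handling above, mapping a single-letter constraint to a register file
// before the bit width picks the concrete class. The enum and function are
// hypothetical simplifications.
enum class RegFile { SGPR, VGPR, AGPR, Unknown };

static RegFile regFileForConstraint(char C) {
  switch (C) {
  case 's':
  case 'r':
    return RegFile::SGPR; // scalar registers
  case 'v':
    return RegFile::VGPR; // vector registers
  case 'a':
    return RegFile::AGPR; // MAI accumulator registers
  default:
    return RegFile::Unknown;
  }
}

// The value's bit width then selects a class within that file, e.g. a 64-bit
// 's' operand lands in SGPR_64 while a 128-bit 'v' operand needs a 4-VGPR
// tuple, as the switches above show.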
17005
17006static bool isImmConstraint(StringRef Constraint) {
17007 if (Constraint.size() == 1) {
17008 switch (Constraint[0]) {
17009 default:
17010 break;
17011 case 'I':
17012 case 'J':
17013 case 'A':
17014 case 'B':
17015 case 'C':
17016 return true;
17017 }
17018 } else if (Constraint == "DA" || Constraint == "DB") {
17019 return true;
17020 }
17021 return false;
17022}
17023
17024SITargetLowering::ConstraintType
17025SITargetLowering::getConstraintType(StringRef Constraint) const {
17026 if (Constraint.size() == 1) {
17027 switch (Constraint[0]) {
17028 default:
17029 break;
17030 case 's':
17031 case 'v':
17032 case 'a':
17033 return C_RegisterClass;
17034 }
17035 } else if (Constraint.size() == 2) {
17036 if (Constraint == "VA")
17037 return C_RegisterClass;
17038 }
17039 if (isImmConstraint(Constraint)) {
17040 return C_Other;
17041 }
17042 return TargetLowering::getConstraintType(Constraint);
17043}
17044
17045static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17047 Val = Val & maskTrailingOnes<uint64_t>(Size);
17048 }
17049 return Val;
17050}
17051
17053 StringRef Constraint,
17054 std::vector<SDValue> &Ops,
17055 SelectionDAG &DAG) const {
17056 if (isImmConstraint(Constraint)) {
17057 uint64_t Val;
17058 if (getAsmOperandConstVal(Op, Val) &&
17059 checkAsmConstraintVal(Op, Constraint, Val)) {
17060 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17061 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17062 }
17063 } else {
17064 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17065 }
17066}
17067
17068bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17069 unsigned Size = Op.getScalarValueSizeInBits();
17070 if (Size > 64)
17071 return false;
17072
17073 if (Size == 16 && !Subtarget->has16BitInsts())
17074 return false;
17075
17076 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17077 Val = C->getSExtValue();
17078 return true;
17079 }
17080 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17081 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17082 return true;
17083 }
17084 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17085 if (Size != 16 || Op.getNumOperands() != 2)
17086 return false;
17087 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17088 return false;
17089 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17090 Val = C->getSExtValue();
17091 return true;
17092 }
17093 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17094 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17095 return true;
17096 }
17097 }
17098
17099 return false;
17100}
17101
17102bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17103 uint64_t Val) const {
17104 if (Constraint.size() == 1) {
17105 switch (Constraint[0]) {
17106 case 'I':
17107 return AMDGPU::isInlinableIntLiteral(Val);
17108 case 'J':
17109 return isInt<16>(Val);
17110 case 'A':
17111 return checkAsmConstraintValA(Op, Val);
17112 case 'B':
17113 return isInt<32>(Val);
17114 case 'C':
17115 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17116 AMDGPU::isInlinableIntLiteral(Val);
17117 default:
17118 break;
17119 }
17120 } else if (Constraint.size() == 2) {
17121 if (Constraint == "DA") {
17122 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17123 int64_t LoBits = static_cast<int32_t>(Val);
17124 return checkAsmConstraintValA(Op, HiBits, 32) &&
17125 checkAsmConstraintValA(Op, LoBits, 32);
17126 }
17127 if (Constraint == "DB") {
17128 return true;
17129 }
17130 }
17131 llvm_unreachable("Invalid asm constraint");
17132}
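// Illustrative sketch, not from this file: how the "DA" constraint above
// validates a 64-bit immediate by checking each 32-bit half on its own.
// The callback stands in for the 32-bit inline-literal test used by
// checkAsmConstraintValA and is hypothetical here.
#include <cstdint>

static bool checkDAConstraint(uint64_t Val,
                              bool (*isInlinable32)(int32_t)) {
  int32_t HiBits = static_cast<int32_t>(Val >> 32);
  int32_t LoBits = static_cast<int32_t>(Val);
  return isInlinable32(HiBits) && isInlinable32(LoBits);
}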
17133
17134bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17135 unsigned MaxSize) const {
17136 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17137 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17138 if (Size == 16) {
17139 MVT VT = Op.getSimpleValueType();
17140 switch (VT.SimpleTy) {
17141 default:
17142 return false;
17143 case MVT::i16:
17144 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17145 case MVT::f16:
17146 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17147 case MVT::bf16:
17148 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17149 case MVT::v2i16:
17150 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17151 case MVT::v2f16:
17152 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17153 case MVT::v2bf16:
17154 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17155 }
17156 }
17157 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17158 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17159 return true;
17160 return false;
17161}
17162
17163static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17164 switch (UnalignedClassID) {
17165 case AMDGPU::VReg_64RegClassID:
17166 return AMDGPU::VReg_64_Align2RegClassID;
17167 case AMDGPU::VReg_96RegClassID:
17168 return AMDGPU::VReg_96_Align2RegClassID;
17169 case AMDGPU::VReg_128RegClassID:
17170 return AMDGPU::VReg_128_Align2RegClassID;
17171 case AMDGPU::VReg_160RegClassID:
17172 return AMDGPU::VReg_160_Align2RegClassID;
17173 case AMDGPU::VReg_192RegClassID:
17174 return AMDGPU::VReg_192_Align2RegClassID;
17175 case AMDGPU::VReg_224RegClassID:
17176 return AMDGPU::VReg_224_Align2RegClassID;
17177 case AMDGPU::VReg_256RegClassID:
17178 return AMDGPU::VReg_256_Align2RegClassID;
17179 case AMDGPU::VReg_288RegClassID:
17180 return AMDGPU::VReg_288_Align2RegClassID;
17181 case AMDGPU::VReg_320RegClassID:
17182 return AMDGPU::VReg_320_Align2RegClassID;
17183 case AMDGPU::VReg_352RegClassID:
17184 return AMDGPU::VReg_352_Align2RegClassID;
17185 case AMDGPU::VReg_384RegClassID:
17186 return AMDGPU::VReg_384_Align2RegClassID;
17187 case AMDGPU::VReg_512RegClassID:
17188 return AMDGPU::VReg_512_Align2RegClassID;
17189 case AMDGPU::VReg_1024RegClassID:
17190 return AMDGPU::VReg_1024_Align2RegClassID;
17191 case AMDGPU::AReg_64RegClassID:
17192 return AMDGPU::AReg_64_Align2RegClassID;
17193 case AMDGPU::AReg_96RegClassID:
17194 return AMDGPU::AReg_96_Align2RegClassID;
17195 case AMDGPU::AReg_128RegClassID:
17196 return AMDGPU::AReg_128_Align2RegClassID;
17197 case AMDGPU::AReg_160RegClassID:
17198 return AMDGPU::AReg_160_Align2RegClassID;
17199 case AMDGPU::AReg_192RegClassID:
17200 return AMDGPU::AReg_192_Align2RegClassID;
17201 case AMDGPU::AReg_256RegClassID:
17202 return AMDGPU::AReg_256_Align2RegClassID;
17203 case AMDGPU::AReg_512RegClassID:
17204 return AMDGPU::AReg_512_Align2RegClassID;
17205 case AMDGPU::AReg_1024RegClassID:
17206 return AMDGPU::AReg_1024_Align2RegClassID;
17207 default:
17208 return -1;
17209 }
17210}
17211
17212// Figure out which registers should be reserved for stack access. Only after
17213// the function is legalized do we know all of the non-spill stack objects or if
17214// calls are present.
17215void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17216 MachineRegisterInfo &MRI = MF.getRegInfo();
17217 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17218 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17219 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17220 const SIInstrInfo *TII = ST.getInstrInfo();
17221
17222 if (Info->isEntryFunction()) {
17223 // Callable functions have fixed registers used for stack access.
17224 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17225 }
17226
17227 // TODO: Move this logic to getReservedRegs()
17228 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17229 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17230 Register SReg = ST.isWave32()
17231 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17232 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17233 &AMDGPU::SGPR_64RegClass);
17234 Info->setSGPRForEXECCopy(SReg);
17235
17236 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17237 Info->getStackPtrOffsetReg()));
17238 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17239 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17240
17241 // We need to worry about replacing the default register with itself in case
17242 // of MIR testcases missing the MFI.
17243 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17244 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17245
17246 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17247 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17248
17249 Info->limitOccupancy(MF);
17250
17251 if (ST.isWave32() && !MF.empty()) {
17252 for (auto &MBB : MF) {
17253 for (auto &MI : MBB) {
17254 TII->fixImplicitOperands(MI);
17255 }
17256 }
17257 }
17258
17259 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17260 // classes if required. Ideally the register class constraints would differ
17261 // per-subtarget, but there's no easy way to achieve that right now. This is
17262 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17263 // from using them as the register class for legal types.
17264 if (ST.needsAlignedVGPRs()) {
17265 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17266 const Register Reg = Register::index2VirtReg(I);
17267 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17268 if (!RC)
17269 continue;
17270 int NewClassID = getAlignedAGPRClassID(RC->getID());
17271 if (NewClassID != -1)
17272 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17273 }
17274 }
17275
17276 TargetLoweringBase::finalizeLowering(MF);
17277}
17278
17279void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17280 KnownBits &Known,
17281 const APInt &DemandedElts,
17282 const SelectionDAG &DAG,
17283 unsigned Depth) const {
17284 Known.resetAll();
17285 unsigned Opc = Op.getOpcode();
17286 switch (Opc) {
17287 case ISD::INTRINSIC_WO_CHAIN: {
17288 unsigned IID = Op.getConstantOperandVal(0);
17289 switch (IID) {
17290 case Intrinsic::amdgcn_mbcnt_lo:
17291 case Intrinsic::amdgcn_mbcnt_hi: {
17292 const GCNSubtarget &ST =
17293 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17294 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17295 // most 31 + src1.
17296 Known.Zero.setBitsFrom(
17297 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17298 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17299 Known = KnownBits::add(Known, Known2);
17300 return;
17301 }
17302 }
17303 break;
17304 }
17305 }
17306 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17307 Op, Known, DemandedElts, DAG, Depth);
17308}
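// Illustrative sketch, not from this file: the known-bits bound used above
// for mbcnt. Before src1 is added, the lane count fits in 5 bits (at most
// 31) except for wave64 mbcnt_lo, which can reach 32 and needs 6 bits. The
// helper name is hypothetical.
#include <cstdint>

static uint32_t knownZeroMaskForMbcnt(bool IsWave64, bool IsMbcntLo) {
  unsigned ValueBits = (IsMbcntLo && IsWave64) ? 6 : 5;
  // Bits at and above ValueBits are known zero in the count itself; the
  // final known bits also fold in src1 via KnownBits::add, as above.
  return ~((1u << ValueBits) - 1);
}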
17309
17310void SITargetLowering::computeKnownBitsForFrameIndex(
17311 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17312 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17313
17314 // Set the high bits to zero based on the maximum allowed scratch size per
17315 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17316 // calculation won't overflow, so assume the sign bit is never set.
17317 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17318}
17319
17320static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17321 GISelValueTracking &VT, KnownBits &Known,
17322 unsigned Dim) {
17323 unsigned MaxValue =
17324 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17325 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17326}
17327
17328static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17329 KnownBits &Known, const APInt &DemandedElts,
17330 unsigned BFEWidth, bool SExt, unsigned Depth) {
17332 const MachineOperand &Src1 = MI.getOperand(2);
17333
17334 unsigned Src1Cst = 0;
17335 if (Src1.isImm()) {
17336 Src1Cst = Src1.getImm();
17337 } else if (Src1.isReg()) {
17338 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17339 if (!Cst)
17340 return;
17341 Src1Cst = Cst->Value.getZExtValue();
17342 } else {
17343 return;
17344 }
17345
17346 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17347 // Width is always [22:16].
17348 const unsigned Offset =
17349 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17350 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17351
17352 if (Width >= BFEWidth) // Ill-formed.
17353 return;
17354
17355 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17356 Depth + 1);
17357
17358 Known = Known.extractBits(Width, Offset);
17359
17360 if (SExt)
17361 Known = Known.sext(BFEWidth);
17362 else
17363 Known = Known.zext(BFEWidth);
17364}
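// Illustrative sketch, not from this file: decoding the packed S_BFE src1
// constant exactly as knownBitsForSBFE does above — the offset occupies the
// low 5 bits (32-bit) or 6 bits (64-bit), the width bits [22:16].
#include <cstdint>
#include <utility>

static std::pair<unsigned, unsigned> decodeBFESrc1(uint32_t Src1Cst,
                                                   unsigned BFEWidth) {
  unsigned OffsetBits = (BFEWidth == 32) ? 5 : 6;
  unsigned Offset = Src1Cst & ((1u << OffsetBits) - 1);
  unsigned Width = (Src1Cst >> 16) & 0x3f;
  return {Offset, Width};
}

// Example: for S_BFE_U32 with src1 = 0x00080004 the extracted field starts
// at bit 4 and is 8 bits wide, so the known bits of the source are sliced
// with extractBits(8, 4) and then zero-extended back to 32 bits.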
17365
17366void SITargetLowering::computeKnownBitsForTargetInstr(
17367 GISelValueTracking &VT, Register R, KnownBits &Known,
17368 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17369 unsigned Depth) const {
17370 Known.resetAll();
17371 const MachineInstr *MI = MRI.getVRegDef(R);
17372 switch (MI->getOpcode()) {
17373 case AMDGPU::S_BFE_I32:
17374 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17375 /*SExt=*/true, Depth);
17376 case AMDGPU::S_BFE_U32:
17377 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17378 /*SExt=*/false, Depth);
17379 case AMDGPU::S_BFE_I64:
17380 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17381 /*SExt=*/true, Depth);
17382 case AMDGPU::S_BFE_U64:
17383 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17384 /*SExt=*/false, Depth);
17385 case AMDGPU::G_INTRINSIC:
17386 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17387 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
17388 switch (IID) {
17389 case Intrinsic::amdgcn_workitem_id_x:
17390 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
17391 break;
17392 case Intrinsic::amdgcn_workitem_id_y:
17393 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
17394 break;
17395 case Intrinsic::amdgcn_workitem_id_z:
17396 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
17397 break;
17398 case Intrinsic::amdgcn_mbcnt_lo:
17399 case Intrinsic::amdgcn_mbcnt_hi: {
17400 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17401 // most 31 + src1.
17402 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
17403 ? getSubtarget()->getWavefrontSizeLog2()
17404 : 5);
17405 KnownBits Known2;
17406 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
17407 Depth + 1);
17408 Known = KnownBits::add(Known, Known2);
17409 break;
17410 }
17411 case Intrinsic::amdgcn_groupstaticsize: {
17412 // We can report everything over the maximum size as 0. We can't report
17413 // based on the actual size because we don't know if it's accurate or not
17414 // at any given point.
17415 Known.Zero.setHighBits(
17416 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
17417 break;
17418 }
17419 }
17420 break;
17421 }
17422 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17423 Known.Zero.setHighBits(24);
17424 break;
17425 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17426 Known.Zero.setHighBits(16);
17427 break;
17428 case AMDGPU::G_AMDGPU_SMED3:
17429 case AMDGPU::G_AMDGPU_UMED3: {
17430 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17431
17432 KnownBits Known2;
17433 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
17434 if (Known2.isUnknown())
17435 break;
17436
17437 KnownBits Known1;
17438 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
17439 if (Known1.isUnknown())
17440 break;
17441
17442 KnownBits Known0;
17443 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
17444 if (Known0.isUnknown())
17445 break;
17446
17447 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
17448 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
17449 Known.One = Known0.One & Known1.One & Known2.One;
17450 break;
17451 }
17452 }
17453}
17454
17455Align SITargetLowering::computeKnownAlignForTargetInstr(
17456 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
17457 unsigned Depth) const {
17458 const MachineInstr *MI = MRI.getVRegDef(R);
17459 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
17460 // FIXME: Can this move to generic code? What about the case where the call
17461 // site specifies a lower alignment?
17462 Intrinsic::ID IID = GI->getIntrinsicID();
17464 AttributeList Attrs =
17465 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
17466 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17467 return *RetAlign;
17468 }
17469 return Align(1);
17470}
17471
17474 const Align CacheLineAlign = Align(64);
17475
17476 // Pre-GFX10 target did not benefit from loop alignment
17477 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17478 getSubtarget()->hasInstFwdPrefetchBug())
17479 return PrefAlign;
17480
17481 // On GFX10 I$ is 4 x 64 bytes cache lines.
17482 // By default prefetcher keeps one cache line behind and reads two ahead.
17483 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
17484 // behind and one ahead.
17485 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
17486 // If loop fits 64 bytes it always spans no more than two cache lines and
17487 // does not need an alignment.
17488 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch.
17489 // Else, if the loop is at most 192 bytes, we need two lines behind.
17490
17492 const MachineBasicBlock *Header = ML->getHeader();
17493 if (Header->getAlignment() != PrefAlign)
17494 return Header->getAlignment(); // Already processed.
17495
17496 unsigned LoopSize = 0;
17497 for (const MachineBasicBlock *MBB : ML->blocks()) {
17498 // If an inner loop block is aligned, assume on average half of the alignment
17499 // size is added as nops.
17500 if (MBB != Header)
17501 LoopSize += MBB->getAlignment().value() / 2;
17502
17503 for (const MachineInstr &MI : *MBB) {
17504 LoopSize += TII->getInstSizeInBytes(MI);
17505 if (LoopSize > 192)
17506 return PrefAlign;
17507 }
17508 }
17509
17510 if (LoopSize <= 64)
17511 return PrefAlign;
17512
17513 if (LoopSize <= 128)
17514 return CacheLineAlign;
17515
17516 // If any of parent loops is surrounded by prefetch instructions do not
17517 // insert new for inner loop, which would reset parent's settings.
17518 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
17519 if (MachineBasicBlock *Exit = P->getExitBlock()) {
17520 auto I = Exit->getFirstNonDebugInstr();
17521 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17522 return CacheLineAlign;
17523 }
17524 }
17525
17526 MachineBasicBlock *Pre = ML->getLoopPreheader();
17527 MachineBasicBlock *Exit = ML->getExitBlock();
17528
17529 if (Pre && Exit) {
17530 auto PreTerm = Pre->getFirstTerminator();
17531 if (PreTerm == Pre->begin() ||
17532 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17533 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17534 .addImm(1); // prefetch 2 lines behind PC
17535
17536 auto ExitHead = Exit->getFirstNonDebugInstr();
17537 if (ExitHead == Exit->end() ||
17538 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17539 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17540 .addImm(2); // prefetch 1 line behind PC
17541 }
17542
17543 return CacheLineAlign;
17544}
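// Illustrative sketch, not from this file: the loop-size thresholds applied
// above when picking a prefetch-friendly alignment on GFX10. The enum and
// function are hypothetical restatements of the control flow.
enum class LoopAlignChoice {
  Default,              // <= 64 bytes or > 192 bytes: keep the preferred align
  CacheLine,            // <= 128 bytes: align the header to a 64-byte line
  CacheLineWithPrefetch // <= 192 bytes: also emit S_INST_PREFETCH mode changes
};

static LoopAlignChoice chooseLoopAlign(unsigned LoopSizeInBytes) {
  if (LoopSizeInBytes <= 64 || LoopSizeInBytes > 192)
    return LoopAlignChoice::Default;
  if (LoopSizeInBytes <= 128)
    return LoopAlignChoice::CacheLine;
  return LoopAlignChoice::CacheLineWithPrefetch;
}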
17545
17547static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17548 assert(N->getOpcode() == ISD::CopyFromReg);
17549 do {
17550 // Follow the chain until we find an INLINEASM node.
17551 N = N->getOperand(0).getNode();
17552 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17553 return true;
17554 } while (N->getOpcode() == ISD::CopyFromReg);
17555 return false;
17556}
17557
17558bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17559 FunctionLoweringInfo *FLI,
17560 UniformityInfo *UA) const {
17561 switch (N->getOpcode()) {
17562 case ISD::CopyFromReg: {
17563 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17564 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17565 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17566 Register Reg = R->getReg();
17567
17568 // FIXME: Why does this need to consider isLiveIn?
17569 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17570 return !TRI->isSGPRReg(MRI, Reg);
17571
17572 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17573 return UA->isDivergent(V);
17574
17576 return !TRI->isSGPRReg(MRI, Reg);
17577 }
17578 case ISD::LOAD: {
17579 const LoadSDNode *L = cast<LoadSDNode>(N);
17580 unsigned AS = L->getAddressSpace();
17581 // A flat load may access private memory.
17583 }
17584 case ISD::CALLSEQ_END:
17585 return true;
17586 case ISD::INTRINSIC_WO_CHAIN:
17587 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17588 case ISD::INTRINSIC_W_CHAIN:
17589 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17608 // Target-specific read-modify-write atomics are sources of divergence.
17609 return true;
17610 default:
17611 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17612 // Generic read-modify-write atomics are sources of divergence.
17613 return A->readMem() && A->writeMem();
17614 }
17615 return false;
17616 }
17617}
17618
17619bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17620 EVT VT) const {
17621 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17622 case MVT::f32:
17623 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
17624 case MVT::f64:
17625 case MVT::f16:
17626 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
17627 default:
17628 return false;
17629 }
17630}
17631
17632bool SITargetLowering::denormalsEnabledForType(
17633 LLT Ty, const MachineFunction &MF) const {
17634 switch (Ty.getScalarSizeInBits()) {
17635 case 32:
17636 return !denormalModeIsFlushAllF32(MF);
17637 case 64:
17638 case 16:
17639 return !denormalModeIsFlushAllF64F16(MF);
17640 default:
17641 return false;
17642 }
17643}
17644
17645bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17646 const APInt &DemandedElts,
17647 const SelectionDAG &DAG,
17648 bool SNaN,
17649 unsigned Depth) const {
17650 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17651 const MachineFunction &MF = DAG.getMachineFunction();
17652 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17653
17654 if (Info->getMode().DX10Clamp)
17655 return true; // Clamped to 0.
17656 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
17657 }
17658
17659 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17660 DAG, SNaN, Depth);
17661}
17662
17663// On older subtargets, global FP atomic instructions have a hardcoded FP mode
17664// and do not support FP32 denormals, and only support v2f16/f64 denormals.
17666 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17667 return true;
17668
17670 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
17671 if (DenormMode == DenormalMode::getPreserveSign())
17672 return true;
17673
17674 // TODO: Remove this.
17675 return RMW->getFunction()
17676 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
17677 .getValueAsBool();
17678}
17679
17680static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17681 LLVMContext &Ctx = RMW->getContext();
17682 StringRef MemScope =
17683 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
17684
17685 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17686 << "Hardware instruction generated for atomic "
17687 << RMW->getOperationName(RMW->getOperation())
17688 << " operation at memory scope " << MemScope;
17689}
17690
17691static bool isV2F16OrV2BF16(Type *Ty) {
17692 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17693 Type *EltTy = VT->getElementType();
17694 return VT->getNumElements() == 2 &&
17695 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17696 }
17697
17698 return false;
17699}
17700
17701static bool isV2F16(Type *Ty) {
17702 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17703 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17704}
17705
17706static bool isV2BF16(Type *Ty) {
17707 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17708 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17709}
17710
17711/// \return true if atomicrmw integer ops work for the type.
17712static bool isAtomicRMWLegalIntTy(Type *Ty) {
17713 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
17714 unsigned BW = IT->getBitWidth();
17715 return BW == 32 || BW == 64;
17716 }
17717
17718 return false;
17719}
17720
17721/// \return true if this atomicrmw xchg type can be selected.
17722static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17723 Type *Ty = RMW->getType();
17724 if (isAtomicRMWLegalIntTy(Ty))
17725 return true;
17726
17727 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
17728 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17729 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
17730 return BW == 32 || BW == 64;
17731 }
17732
17733 if (Ty->isFloatTy() || Ty->isDoubleTy())
17734 return true;
17735
17736 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
17737 return VT->getNumElements() == 2 &&
17738 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17739 }
17740
17741 return false;
17742}
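// Illustrative sketch, not from this file: the shape test performed by
// isAtomicRMWLegalXChgTy above, restated over a plain type description.
// The struct is hypothetical; the real code inspects llvm::Type.
struct XChgTypeDesc {
  bool IsInteger, IsPointer, IsFloat, IsDouble, IsFixedVector;
  unsigned BitWidth; // integer or pointer width in bits
  unsigned NumElts;  // vector element count
  unsigned EltBits;  // vector element width in bits
};

static bool isLegalXChgType(const XChgTypeDesc &T) {
  if ((T.IsInteger || T.IsPointer) && (T.BitWidth == 32 || T.BitWidth == 64))
    return true;
  if (T.IsFloat || T.IsDouble)
    return true;
  // 2 x 16-bit vectors (v2f16/v2bf16/v2i16) can also be selected.
  return T.IsFixedVector && T.NumElts == 2 && T.EltBits == 16;
}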
17743
17744/// \returns true if it's valid to emit a native instruction for \p RMW, based
17745/// on the properties of the target memory.
17746static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17747 const AtomicRMWInst *RMW,
17748 bool HasSystemScope) {
17749 // The remote/fine-grained access logic is different from the integer
17750 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17751 // fine-grained access does not work, even for a device local allocation.
17752 //
17753 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17754 // allocations work.
17755 if (HasSystemScope) {
17757 RMW->hasMetadata("amdgpu.no.remote.memory"))
17758 return true;
17759 if (Subtarget.hasEmulatedSystemScopeAtomics())
17760 return true;
17762 return true;
17763
17764 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
17765}
17766
17767/// \return Action to perform on AtomicRMWInsts for integer operations.
17768static TargetLowering::AtomicExpansionKind
17769atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
17770 return isAtomicRMWLegalIntTy(RMW->getType())
17771 ? TargetLowering::AtomicExpansionKind::None
17772 : TargetLowering::AtomicExpansionKind::CmpXChg;
17773}
17774
17775/// Return if a flat address space atomicrmw can access private memory.
17777 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
17778 return !MD ||
17780}
17781
17784 // For GAS, lower to flat atomic.
17788}
17789
17792 unsigned AS = RMW->getPointerAddressSpace();
17793 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
17795
17796 // 64-bit flat atomics that dynamically reside in private memory will silently
17797 // be dropped.
17798 //
17799 // Note that we will emit a new copy of the original atomic in the expansion,
17800 // which will be incrementally relegalized.
17801 const DataLayout &DL = RMW->getFunction()->getDataLayout();
17802 if (AS == AMDGPUAS::FLAT_ADDRESS &&
17803 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
17806
17807 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
17809 ORE.emit([=]() {
17810 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
17811 });
17812 return Kind;
17813 };
17814
17815 auto SSID = RMW->getSyncScopeID();
17816 bool HasSystemScope =
17817 SSID == SyncScope::System ||
17818 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
17819
17820 auto Op = RMW->getOperation();
17821 switch (Op) {
17823 // PCIe supports add and xchg for system atomics.
17824 return isAtomicRMWLegalXChgTy(RMW)
17827 case AtomicRMWInst::Add:
17828 // PCIe supports add and xchg for system atomics.
17830 case AtomicRMWInst::Sub:
17831 case AtomicRMWInst::And:
17832 case AtomicRMWInst::Or:
17833 case AtomicRMWInst::Xor:
17834 case AtomicRMWInst::Max:
17835 case AtomicRMWInst::Min:
17842 if (Subtarget->hasEmulatedSystemScopeAtomics())
17844
17845 // On most subtargets, for atomicrmw operations other than add/xchg,
17846 // whether or not the instructions will behave correctly depends on where
17847 // the address physically resides and what interconnect is used in the
17848 // system configuration. On some targets the instruction will nop,
17849 // and in others synchronization will only occur at degraded device scope.
17850 //
17851 // If the allocation is known local to the device, the instructions should
17852 // work correctly.
17853 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
17855
17856 // If fine-grained remote memory works at device scope, we don't need to
17857 // do anything.
17858 if (!HasSystemScope &&
17861
17862 // If we are targeting a remote allocated address, it depends what kind of
17863 // allocation the address belongs to.
17864 //
17865 // If the allocation is fine-grained (in host memory, or in PCIe peer
17866 // device memory), the operation will fail depending on the target.
17867 //
17868 // Note fine-grained host memory access does work on APUs or if XGMI is
17869 // used, but we do not know if we are targeting an APU or the system
17870 // configuration from the ISA version/target-cpu.
17871 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17873
17876 // Atomic sub/or/xor do not work over PCI express, but atomic add
17877 // does. InstCombine transforms these with 0 to or, so undo that.
17878 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17879 ConstVal && ConstVal->isNullValue())
17881 }
17882
17883 // If the allocation could be in remote, fine-grained memory, the rmw
17884 // instructions may fail. cmpxchg should work, so emit that. On some
17885 // system configurations, PCIe atomics aren't supported so cmpxchg won't
17886 // even work, so you're out of luck anyway.
17887
17888 // In summary:
17889 //
17890 // Cases that may fail:
17891 // - fine-grained pinned host memory
17892 // - fine-grained migratable host memory
17893 // - fine-grained PCIe peer device
17894 //
17895 // Cases that should work, but may be treated overly conservatively.
17896 // - fine-grained host memory on an APU
17897 // - fine-grained XGMI peer device
17899 }
17900
17902 }
17903 case AtomicRMWInst::FAdd: {
17904 Type *Ty = RMW->getType();
17905
17906 // TODO: Handle REGION_ADDRESS
17907 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17908 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
17909 // is fixed to round-to-nearest-even.
17910 //
17911 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
17912 // round-to-nearest-even.
17913 //
17914 // We ignore the rounding mode problem, even in strictfp. The C++ standard
17915 // suggests it is OK if the floating-point mode may not match the calling
17916 // thread.
17917 if (Ty->isFloatTy()) {
17920 }
17921
17922 if (Ty->isDoubleTy()) {
17923 // Ignores denormal mode, but we don't consider flushing mandatory.
17926 }
17927
17928 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17930
17932 }
17933
17934 // LDS atomics respect the denormal mode from the mode register.
17935 //
17936 // Traditionally f32 global/buffer memory atomics would unconditionally
17937 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
17938 // flush.
17939 //
17940 // On targets with flat atomic fadd, denormals would flush depending on
17941 // whether the target address resides in LDS or global memory. We consider
17942 // this flat-maybe-flush as will-flush.
17943 if (Ty->isFloatTy() &&
17947
17948 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
17949 // safe. The message phrasing also should be better.
17950 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
17951 if (AS == AMDGPUAS::FLAT_ADDRESS) {
17952 // gfx942, gfx12
17953 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17954 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17955 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
17956 // gfx90a, gfx942, gfx12
17957 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17958 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17959
17960 // gfx942, gfx12
17961 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
17962 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17963 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17964 // gfx90a, gfx942, gfx12
17965 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17966 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17967
17968 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
17969 // buffer. gfx12 does have the buffer version.
17970 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
17971 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17972 }
17973
17974 // global and flat atomic fadd f64: gfx90a, gfx942.
17975 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
17976 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17977
17978 if (AS != AMDGPUAS::FLAT_ADDRESS) {
17979 if (Ty->isFloatTy()) {
17980 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
17981 // gfx11+.
17982 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17983 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17984 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
17985 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17986 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17987 } else {
17988 // gfx908
17989 if (RMW->use_empty() &&
17991 isV2F16(Ty))
17992 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17993 }
17994 }
17995
17996 // flat atomic fadd f32: gfx942, gfx11+.
17997 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
17998 if (Subtarget->hasFlatAtomicFaddF32Inst())
17999 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18000
18001 // If it is in flat address space, and the type is float, we will try to
18002 // expand it, if the target supports global and lds atomic fadd. The
18003 // reason we need that is, in the expansion, we emit the check of
18004 // address space. If it is in global address space, we emit the global
18005 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18006 // fadd.
18007 if (Subtarget->hasLDSFPAtomicAddF32()) {
18008 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18010 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18012 }
18013 }
18014 }
18015
18017 }
18019 case AtomicRMWInst::FMax: {
18020 Type *Ty = RMW->getType();
18021
18022 // LDS float and double fmin/fmax were always supported.
18023 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18024 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18026 }
18027
18028 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18029 // For flat and global cases:
18030 // float, double in gfx7. Manual claims denormal support.
18031 // Removed in gfx8.
18032 // float, double restored in gfx10.
18033 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18034 //
18035 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18036 // no f32.
18037 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18038 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18039 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18040 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18041 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18042 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18044 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18045 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18046 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18047 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18048 }
18049 }
18050
18052 }
18055 default:
18057 }
18058
18059 llvm_unreachable("covered atomicrmw op switch");
18060}
18061
18067}
18068
18069TargetLowering::AtomicExpansionKind
18070SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
18071 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
18072 ? AtomicExpansionKind::NotAtomic
18073 : AtomicExpansionKind::None;
18074}
18075
18076TargetLowering::AtomicExpansionKind
18077SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18078 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18079 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18080 return AtomicExpansionKind::NotAtomic;
18081
18082 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18083 return AtomicExpansionKind::None;
18084
18085 const DataLayout &DL = CmpX->getDataLayout();
18086
18087 Type *ValTy = CmpX->getNewValOperand()->getType();
18088
18089 // If a 64-bit flat atomic may alias private, we need to avoid using the
18090 // atomic in the private case.
18091 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18092 : AtomicExpansionKind::None;
18093}
18094
18095const TargetRegisterClass *
18096SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18097 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18098 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18099 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18100 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18101 : &AMDGPU::SReg_32RegClass;
18102 if (!TRI->isSGPRClass(RC) && !isDivergent)
18103 return TRI->getEquivalentSGPRClass(RC);
18104 if (TRI->isSGPRClass(RC) && isDivergent)
18105 return TRI->getEquivalentVGPRClass(RC);
18106
18107 return RC;
18108}
18109
18110// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18111// uniform values (as produced by the mask results of control flow intrinsics)
18112// used outside of divergent blocks. The phi users need to also be treated as
18113// always uniform.
18114//
18115// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18116static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18117 unsigned WaveSize) {
18118 // FIXME: We assume we never cast the mask results of a control flow
18119 // intrinsic.
18120 // Early exit if the type won't be consistent as a compile time hack.
18121 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18122 if (!IT || IT->getBitWidth() != WaveSize)
18123 return false;
18124
18125 if (!isa<Instruction>(V))
18126 return false;
18127 if (!Visited.insert(V).second)
18128 return false;
18129 bool Result = false;
18130 for (const auto *U : V->users()) {
18131 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18132 if (V == U->getOperand(1)) {
18133 switch (Intrinsic->getIntrinsicID()) {
18134 default:
18135 Result = false;
18136 break;
18137 case Intrinsic::amdgcn_if_break:
18138 case Intrinsic::amdgcn_if:
18139 case Intrinsic::amdgcn_else:
18140 Result = true;
18141 break;
18142 }
18143 }
18144 if (V == U->getOperand(0)) {
18145 switch (Intrinsic->getIntrinsicID()) {
18146 default:
18147 Result = false;
18148 break;
18149 case Intrinsic::amdgcn_end_cf:
18150 case Intrinsic::amdgcn_loop:
18151 Result = true;
18152 break;
18153 }
18154 }
18155 } else {
18156 Result = hasCFUser(U, Visited, WaveSize);
18157 }
18158 if (Result)
18159 break;
18160 }
18161 return Result;
18162}
18163
18164bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18165 const Value *V) const {
18166 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18167 if (CI->isInlineAsm()) {
18168 // FIXME: This cannot give a correct answer. This should only trigger in
18169 // the case where inline asm returns mixed SGPR and VGPR results, used
18170 // outside the defining block. We don't have a specific result to
18171 // consider, so this assumes if any value is SGPR, the overall register
18172 // also needs to be SGPR.
18173 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18175 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18176 for (auto &TC : TargetConstraints) {
18177 if (TC.Type == InlineAsm::isOutput) {
18179 const TargetRegisterClass *RC =
18180 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18181 TC.ConstraintVT)
18182 .second;
18183 if (RC && SIRI->isSGPRClass(RC))
18184 return true;
18185 }
18186 }
18187 }
18188 }
18189 SmallPtrSet<const Value *, 16> Visited;
18190 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18191}
18192
18193bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18194 for (SDUse &Use : N->uses()) {
18195 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18196 if (getBasePtrIndex(M) == Use.getOperandNo())
18197 return true;
18198 }
18199 }
18200 return false;
18201}
18202
18203bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18204 SDValue N1) const {
18205 if (!N0.hasOneUse())
18206 return false;
18207 // Take care of the opportunity to keep N0 uniform
18208 if (N0->isDivergent() || !N1->isDivergent())
18209 return true;
18210 // Check if we have a good chance to form the memory access pattern with the
18211 // base and offset
18212 return (DAG.isBaseWithConstantOffset(N0) &&
18214}
18215
18216bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18217 Register N0, Register N1) const {
18218 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18219}
18220
18223 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18225 if (I.getMetadata("amdgpu.noclobber"))
18226 Flags |= MONoClobber;
18227 if (I.getMetadata("amdgpu.last.use"))
18228 Flags |= MOLastUse;
18229 return Flags;
18230}
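// Illustrative sketch (not part of the original source): a load carrying the
// metadata that this hook translates into MMO flags. The metadata kind names
// are the ones queried above; the surrounding IR is hypothetical:
//
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0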
18231
18232bool SITargetLowering::checkForPhysRegDependency(
18233 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18234 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18235 if (User->getOpcode() != ISD::CopyToReg)
18236 return false;
18237 if (!Def->isMachineOpcode())
18238 return false;
18239 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18240 if (!MDef)
18241 return false;
18242
18243 unsigned ResNo = User->getOperand(Op).getResNo();
18244 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18245 return false;
18246 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18247 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18248 PhysReg = AMDGPU::SCC;
18249 const TargetRegisterClass *RC =
18250 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18251 Cost = RC->getCopyCost();
18252 return true;
18253 }
18254 return false;
18255}
18256
18257void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18258 Instruction *AI) const {
18259 // Given: atomicrmw fadd ptr %addr, float %val ordering
18260 //
18261 // With this expansion we produce the following code:
18262 // [...]
18263 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18264 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18265 //
18266 // atomicrmw.shared:
18267 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18268 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18269 // float %val ordering
18270 // br label %atomicrmw.phi
18271 //
18272 // atomicrmw.check.private:
18273 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18274 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18275 //
18276 // atomicrmw.private:
18277 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18278 // %loaded.private = load float, ptr addrspace(5) %cast.private
18279 // %val.new = fadd float %loaded.private, %val
18280 // store float %val.new, ptr addrspace(5) %cast.private
18281 // br label %atomicrmw.phi
18282 //
18283 // atomicrmw.global:
18284 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18285 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18286 // float %val ordering
18287 // br label %atomicrmw.phi
18288 //
18289 // atomicrmw.phi:
18290 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18291 // [ %loaded.private, %atomicrmw.private ],
18292 // [ %loaded.global, %atomicrmw.global ]
18293 // br label %atomicrmw.end
18294 //
18295 // atomicrmw.end:
18296 // [...]
18297 //
18298 //
18299 // For 64-bit atomics which may reside in private memory, we perform a simpler
18300 // version that only inserts the private check, and uses the flat operation.
18301
18302 IRBuilder<> Builder(AI);
18303 LLVMContext &Ctx = Builder.getContext();
18304
18305 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18306 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18307 : AtomicCmpXchgInst::getPointerOperandIndex();
18308 Value *Addr = AI->getOperand(PtrOpIdx);
18309
18310 /// TODO: Only need to check private, then emit flat-known-not private (no
18311 /// need for shared block, or cast to global).
18312 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18313
18314 Align Alignment;
18315 if (RMW)
18316 Alignment = RMW->getAlign();
18317 else if (CX)
18318 Alignment = CX->getAlign();
18319 else
18320 llvm_unreachable("unhandled atomic operation");
18321
18322 // FullFlatEmulation is true if we need to issue the private, shared, and
18323 // global cases.
18324 //
18325 // If this is false, we are only dealing with the flat-targeting-private case,
18326 // where we only insert a check for private and still use the flat instruction
18327 // for global and shared.
18328
18329 bool FullFlatEmulation =
18330 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18331 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18332 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18333 RMW->getType()->isDoubleTy()));
18334
18335 // If the return value isn't used, do not introduce a false use in the phi.
18336 bool ReturnValueIsUsed = !AI->use_empty();
18337
18338 BasicBlock *BB = Builder.GetInsertBlock();
18339 Function *F = BB->getParent();
18340 BasicBlock *ExitBB =
18341 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18342 BasicBlock *SharedBB = nullptr;
18343
18344 BasicBlock *CheckPrivateBB = BB;
18345 if (FullFlatEmulation) {
18346 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18347 CheckPrivateBB =
18348 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18349 }
18350
18351 BasicBlock *PrivateBB =
18352 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18353 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18354 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18355
18356 std::prev(BB->end())->eraseFromParent();
18357 Builder.SetInsertPoint(BB);
18358
18359 Value *LoadedShared = nullptr;
18360 if (FullFlatEmulation) {
18361 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18362 {Addr}, nullptr, "is.shared");
18363 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18364 Builder.SetInsertPoint(SharedBB);
18365 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18366 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
18367
18368 Instruction *Clone = AI->clone();
18369 Clone->insertInto(SharedBB, SharedBB->end());
18370 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18371 LoadedShared = Clone;
18372
18373 Builder.CreateBr(PhiBB);
18374 Builder.SetInsertPoint(CheckPrivateBB);
18375 }
18376
18377 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18378 {Addr}, nullptr, "is.private");
18379 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18380
18381 Builder.SetInsertPoint(PrivateBB);
18382
18383 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18384 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
18385
18386 Value *LoadedPrivate;
18387 if (RMW) {
18388 LoadedPrivate = Builder.CreateAlignedLoad(
18389 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18390
18391 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
18392 LoadedPrivate, RMW->getValOperand());
18393
18394 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18395 } else {
18396 auto [ResultLoad, Equal] =
18397 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
18398 CX->getNewValOperand(), CX->getAlign());
18399
18400 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
18401 ResultLoad, 0);
18402 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18403 }
18404
18405 Builder.CreateBr(PhiBB);
18406
18407 Builder.SetInsertPoint(GlobalBB);
18408
18409 // Continue using a flat instruction if we only emitted the check for private.
18410 Instruction *LoadedGlobal = AI;
18411 if (FullFlatEmulation) {
18412 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18413 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
18414 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
18415 }
18416
18417 AI->removeFromParent();
18418 AI->insertInto(GlobalBB, GlobalBB->end());
18419
18420 // The new atomicrmw may go through another round of legalization later.
18421 if (!FullFlatEmulation) {
18422 // We inserted the runtime check already, make sure we do not try to
18423 // re-expand this.
18424 // TODO: Should union with any existing metadata.
18425 MDBuilder MDB(F->getContext());
18426 MDNode *RangeNotPrivate =
18427 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
18428 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
18429 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18430 RangeNotPrivate);
18431 }
18432
18433 Builder.CreateBr(PhiBB);
18434
18435 Builder.SetInsertPoint(PhiBB);
18436
18437 if (ReturnValueIsUsed) {
18438 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
18439 AI->replaceAllUsesWith(Loaded);
18440 if (FullFlatEmulation)
18441 Loaded->addIncoming(LoadedShared, SharedBB);
18442 Loaded->addIncoming(LoadedPrivate, PrivateBB);
18443 Loaded->addIncoming(LoadedGlobal, GlobalBB);
18444 Loaded->takeName(AI);
18445 }
18446
18447 Builder.CreateBr(ExitBB);
18448}
18449
18450static void convertScratchAtomicToFlatAtomic(Instruction *I,
18451 unsigned PtrOpIdx) {
18452 Value *PtrOp = I->getOperand(PtrOpIdx);
18453 assert(PtrOp->getType()->getPointerAddressSpace() ==
18454 AMDGPUAS::PRIVATE_ADDRESS);
18455
18456 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
18457 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
18458 I->getIterator());
18459 I->setOperand(PtrOpIdx, ASCast);
18460}
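// Illustrative sketch (not part of the original source): the effect of this
// conversion on a scratch atomic load, assuming the usual AMDGPU address-space
// numbering (5 = private/scratch, 0 = flat):
//
//   before: %v = load atomic i32, ptr addrspace(5) %p seq_cst, align 4
//   after:  %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
//           %v = load atomic i32, ptr %scratch.ascast seq_cst, align 4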
18461
18462void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
18463 AtomicRMWInst::BinOp Op = AI->getOperation();
18464
18465 if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18466 return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
18467
18468 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18469 Op == AtomicRMWInst::Xor) {
18470 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
18471 ConstVal && ConstVal->isNullValue()) {
18472 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
18473 AI->setOperation(AtomicRMWInst::Add);
18474
18475 // We may still need the private-alias-flat handling below.
18476
18477 // TODO: Skip this for cases where we cannot access remote memory.
18478 }
18479 }
18480
18481 // The non-flat expansions should only perform the de-canonicalization of
18482 // identity values.
18483 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
18484 return;
18485
18486 emitExpandAtomicAddrSpacePredicate(AI);
18487}
18488
18489void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
18490 if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18491 return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
18492
18493 emitExpandAtomicAddrSpacePredicate(CI);
18494}
18495
18496void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
18497 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18498 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
18499
18500 llvm_unreachable(
18501 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18502}
18503
18504void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
18505 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18506 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
18507
18508 llvm_unreachable(
18509 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
18510}
18511
18512LoadInst *
18513SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
18514 IRBuilder<> Builder(AI);
18515 auto Order = AI->getOrdering();
18516
18517 // The optimization removes the store aspect of the atomicrmw, so the cache
18518 // must be flushed if the atomic ordering had release semantics. That flush is
18519 // not necessarily a fence; a release fence just happens to perform it. So
18520 // avoid replacing an atomicrmw that has release semantics.
18521 if (isReleaseOrStronger(Order))
18522 return nullptr;
18523
18524 LoadInst *LI = Builder.CreateAlignedLoad(
18525 AI->getType(), AI->getPointerOperand(), AI->getAlign());
18526 LI->setAtomic(Order, AI->getSyncScopeID());
18527 LI->copyMetadata(*AI);
18528 LI->takeName(AI);
18529 AI->replaceAllUsesWith(LI);
18530 AI->eraseFromParent();
18531 return LI;
18532}
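// Illustrative sketch (not part of the original source): an idempotent RMW
// that this hook can turn into a plain atomic load, given an ordering weaker
// than release:
//
//   before: %old = atomicrmw or ptr %p, i32 0 acquire, align 4
//   after:  %old = load atomic i32, ptr %p acquire, align 4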
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Addr
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1244
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1241
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static uint32_t getIdentityValueForWaveReduction(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasBF16PackedInsts() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasBF16TransInsts() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1497
bool isNegative() const
Definition: APFloat.h:1449
bool isNormal() const
Definition: APFloat.h:1453
APInt bitcastToAPInt() const
Definition: APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1079
bool isInfinity() const
Definition: APFloat.h:1446
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition: APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:339
const Function * getParent() const
Definition: Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:645
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:549
static unsigned getPointerOperandIndex()
Definition: Instructions.h:636
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:843
static unsigned getPointerOperandIndex()
Definition: Instructions.h:888
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:721
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ FSub
*p = old - v
Definition: Instructions.h:749
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
Value * getPointerOperand()
Definition: Instructions.h:886
void setOperation(BinOp Operation)
Definition: Instructions.h:837
BinOp getOperation() const
Definition: Instructions.h:819
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:877
Value * getValOperand()
Definition: Instructions.h:890
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:863
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:894
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:386
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator end()
Definition: BasicBlock.h:472
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:555
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:32
static ByteProvider getConstantZero()
Definition: ByteProvider.h:67
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:60
std::optional< ISelOp > Src
Definition: ByteProvider.h:51
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1458
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
unsigned arg_size() const
Definition: InstrTypes.h:1290
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
bool isSigned() const
Definition: InstrTypes.h:932
bool isFPPredicate() const
Definition: InstrTypes.h:784
bool isIntPredicate() const
Definition: InstrTypes.h:785
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:842
bool isBigEndian() const
Definition: DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
iterator_range< arg_iterator > args()
Definition: Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:803
Argument * getArg(unsigned i) const
Definition: Function.h:884
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:949
bool hasD16Images() const
Definition: GCNSubtarget.h:752
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:911
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:523
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:514
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:907
bool hasDot7Insts() const
Definition: GCNSubtarget.h:851
bool hasApertureRegs() const
Definition: GCNSubtarget.h:649
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:679
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:903
bool hasIEEEMinimumMaximumInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasRelaxedBufferOOBMode() const
Definition: GCNSubtarget.h:647
bool hasDLInsts() const
Definition: GCNSubtarget.h:821
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:455
bool hasMAIInsts() const
Definition: GCNSubtarget.h:879
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:956
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:732
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:573
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:631
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:309
bool hasDot1Insts() const
Definition: GCNSubtarget.h:827
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:919
bool hasSafeSmemPrefetch() const
bool hasPkMovB32() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:501
bool enableFlatScratch() const
Definition: GCNSubtarget.h:704
bool hasMadF16() const
bool hasMin3Max3PKF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:675
bool hasFmaMixBF16Insts() const
Definition: GCNSubtarget.h:479
bool hasVMemToLDSLoad() const
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:507
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:939
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:321
bool hasMad64_32() const
Definition: GCNSubtarget.h:797
bool useDS128() const
Definition: GCNSubtarget.h:583
bool hasBVHDualAndBVH8Insts() const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
bool hasGloballyAddressableScratch() const
bool has64BitLiterals() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:503
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:313
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:895
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:471
bool hasIntClamp() const
Definition: GCNSubtarget.h:401
bool hasGFX10_AEncoding() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool hasEmulatedSystemScopeAtomics() const
Definition: GCNSubtarget.h:962
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:421
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:653
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:683
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:786
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:376
bool hasIntMinMax64() const
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:992
bool hasFFBL() const
Definition: GCNSubtarget.h:459
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:605
bool hasVmemPrefInsts() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:899
bool hasMed3_16() const
Definition: GCNSubtarget.h:467
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:639
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:913
bool hasBFI() const
Definition: GCNSubtarget.h:447
bool isWave32() const
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:623
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:384
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:863
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:568
bool hasFFBH() const
Definition: GCNSubtarget.h:463
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:915
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:923
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:935
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:921
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:943
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:593
bool hasDot8Insts() const
Definition: GCNSubtarget.h:855
bool hasVectorMulU64() const
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:588
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:577
Generation getGeneration() const
Definition: GCNSubtarget.h:357
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:927
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:784
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:788
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:931
bool hasAddr64() const
Definition: GCNSubtarget.h:425
bool isWave64() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:475
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:780
bool hasFractBug() const
Definition: GCNSubtarget.h:439
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:443
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:767
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:513
unsigned getAddressSpace() const
Definition: GlobalValue.h:207
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
Type * getValueType() const
Definition: GlobalValue.h:298
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1197
LLVMContext & getContext() const
Definition: IRBuilder.h:203
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1191
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2209
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:90
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:406
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:82
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1718
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:265
constexpr bool isScalar() const
Definition: LowLevelType.h:147
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:43
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:58
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:191
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:219
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:180
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:265
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
static unsigned getPointerOperandIndex()
Definition: Instructions.h:261
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:96
Metadata node.
Definition: Metadata.h:1077
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
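The MVT queries above in one illustrative sketch (the types are arbitrary examples; the comments state what each call returns for them):
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;
// Build a vector MVT and inspect it.
static void mvtSketch() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  (void)V4I32.isVector();              // true
  (void)V4I32.getVectorNumElements();  // 4
  (void)V4I32.getScalarType();         // MVT::i32
  (void)V4I32.getSizeInBits();         // 128 bits
  (void)V4I32.getStoreSize();          // 16 bytes
  (void)MVT::getIntegerVT(16);         // MVT::i16
}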
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
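A sketch of the block-splitting members above, roughly what splitAt performs in a single call (live-in and LiveIntervals updates omitted); MI is assumed to be an instruction inside MBB:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>
using namespace llvm;
// Split MBB after MI into a new block and rewire the CFG (illustrative).
static MachineBasicBlock *splitAfterSketch(MachineBasicBlock &MBB,
                                           MachineInstr &MI) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  MF->insert(std::next(MachineFunction::iterator(&MBB)), NewBB);
  NewBB->splice(NewBB->begin(), &MBB,
                std::next(MachineBasicBlock::iterator(MI)), MBB.end());
  NewBB->transferSuccessorsAndUpdatePHIs(&MBB);
  MBB.addSuccessor(NewBB);
  return NewBB;
}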
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
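A sketch combining the MachineFrameInfo and MachineFunction members above; the size, offset, and register class are placeholders, not values this file uses:
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;
// Reserve a fixed stack slot and record an incoming physical register.
static void frameSketch(MachineFunction &MF, MCRegister PhysReg,
                        const TargetRegisterClass *RC) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/0, /*IsImmutable=*/true);
  (void)FI;
  Register VReg = MF.addLiveIn(PhysReg, RC); // live-in physreg -> fresh vreg
  (void)VReg;
}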
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
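The builder methods above are normally chained off BuildMI, as in this sketch; the opcode descriptor, registers, and template instruction are assumed inputs, not this file's:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Emit DstReg = <MCID> SrcReg, 0 before I, copying memory operands from Tmpl.
static void buildSketch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const MCInstrDesc &MCID,
                        Register DstReg, Register SrcReg,
                        const MachineInstr &Tmpl) {
  BuildMI(MBB, I, DL, MCID, DstReg)
      .addReg(SrcReg)
      .addImm(0)
      .cloneMemRefs(Tmpl);
}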
Representation of each machine instruction.
Definition: MachineInstr.h:72
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
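A sketch of allocating a MachineMemOperand with the flag values above; the flags, memory type, and alignment are illustrative choices:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
// A 32-bit, 4-byte-aligned, invariant and dereferenceable load from an unknown location.
static MachineMemOperand *mmoSketch(MachineFunction &MF) {
  return MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));
}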
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:218
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1885
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
bool isInlineConstant(const APInt &Imm) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to strict FP calls.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:578
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
const Pass * getPass() const
Definition: SelectionDAG.h:494
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:719
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:885
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:587
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
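A small sketch composing a few of the SelectionDAG builders above; the values, types, and the use of MVT::i1 as the setcc result type are simplifying assumptions, not this file's lowering:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Return max(X + 42, 0) for an i32 value X (purely illustrative).
static SDValue dagSketch(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  SDValue K = DAG.getConstant(42, DL, MVT::i32);
  SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, X, K);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue IsNeg = DAG.getSetCC(DL, MVT::i1, Sum, Zero, ISD::SETLT);
  return DAG.getSelect(DL, MVT::i32, IsNeg, Zero, Sum);
}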
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
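A sketch of the StringSwitch pattern above; the strings and return values are made up, not ones this file matches:
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
// Map a name to a small code, falling back to -1.
static int classifySketch(StringRef Name) {
  return StringSwitch<int>(Name)
      .Case("sgpr", 0)
      .Case("vgpr", 1)
      .Default(-1);
}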
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target and, if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
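The configuration hooks above are normally driven from a target's TargetLowering constructor, as in this hypothetical sketch; the chosen operations, types, and actions are examples, not this target's settings:
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;
namespace {
// Hypothetical subclass; never instantiated, shown only for the call shapes.
struct SketchTLI : TargetLowering {
  explicit SketchTLI(const TargetMachine &TM) : TargetLowering(TM) {
    setBooleanContents(ZeroOrOneBooleanContent);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    setTargetDAGCombine({ISD::FADD, ISD::FSUB});
    setSchedulingPreference(Sched::RegPressure);
  }
};
} // namespace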
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:418
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:311
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:258
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
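An illustrative use of the Type queries above (the helper and its particular check are assumptions made for the example):
#include "llvm/IR/Type.h"
using namespace llvm;
// True if T is a sized type whose scalar element is i32.
static bool isI32Like(LLVMContext &Ctx, Type *T) {
  Type *I32 = Type::getInt32Ty(Ctx);
  return T->isSized() && T->getScalarType() == I32;
}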
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM_ABI void set(Value *Val)
Definition: Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition: User.h:245
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1101
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:396
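A sketch of the common replace-and-rename idiom built from the Value API above; Old and New are assumed to be existing values of compatible type:
#include "llvm/IR/Value.h"
using namespace llvm;
// Rewrite all uses of Old to New and move the name across.
static void replaceSketch(Value &Old, Value &New) {
  New.takeName(&Old);
  Old.replaceAllUsesWith(&New);
  // Old.use_empty() is now true; callers typically erase Old afterwards.
}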
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr bool isZero() const
Definition: TypeSize.h:157
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:82
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:291
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:256
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1386
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1108
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1379
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1381
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1382
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1364
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1574
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1384
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1541
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1298
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1157
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1380
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1387
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1126
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1103
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1383
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1207
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1391
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1204
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1390
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1724
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1691
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1671
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
Definition: Intrinsics.cpp:743
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
Definition: Intrinsics.cpp:762
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
Definition: Intrinsics.cpp:596
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1716
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:58
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double inv_pi
Definition: MathExtras.h:54
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:870
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:551
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:203
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:207
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:241
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
@ DS_Warning
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:241
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals