// LLVM 21.0.0git
// SIISelLowering.cpp
// (doxygen page navigation text converted to comments)
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, not operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
423 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
424 Custom);
425
426 if (Subtarget->hasPkMovB32()) {
427 // TODO: 16-bit element vectors should be legal with even aligned elements.
428 // TODO: Can be legal with wider source types than the result with
429 // subregister extracts.
430 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
431 }
432
433 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
434 Custom);
435
436 // Avoid stack access for these.
437 // TODO: Generalize to more vector types.
439 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
440 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
441 Custom);
442
443 // Deal with vec3 vector operations when widened to vec4.
445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
446
447 // Deal with vec5/6/7 vector operations when widened to vec8.
449 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
450 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
451 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
452 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
453 Custom);
454
455 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
456 // and output demarshalling
457 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
458
459 // We can't return success/failure, only the old value,
460 // let LLVM add the comparison
462 Expand);
463
464 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
465
466 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
467
468 // FIXME: This should be narrowed to i32, but that only happens if i64 is
469 // illegal.
470 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
471 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
472
473 // On SI this is s_memtime and s_memrealtime on VI.
475
476 if (Subtarget->hasSMemRealTime() ||
480
481 if (Subtarget->has16BitInsts()) {
484 } else {
486 }
487
488 if (Subtarget->hasMadMacF32Insts())
490
491 if (!Subtarget->hasBFI())
492 // fcopysign can be done in a single instruction with BFI.
493 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
494
495 if (!Subtarget->hasBCNT(32))
497
498 if (!Subtarget->hasBCNT(64))
500
501 if (Subtarget->hasFFBH())
503
504 if (Subtarget->hasFFBL())
506
507 // We only really have 32-bit BFE instructions (and 16-bit on VI).
508 //
509 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
510 // effort to match them now. We want this to be false for i64 cases when the
511 // extraction isn't restricted to the upper or lower half. Ideally we would
512 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
513 // span the midpoint are probably relatively rare, so don't worry about them
514 // for now.
515 if (Subtarget->hasBFE())
517
518 // Clamp modifier on add/sub
519 if (Subtarget->hasIntClamp())
521
522 if (Subtarget->hasAddNoCarry())
523 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
524 Legal);
525
526 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
527 Custom);
528
529 // These are really only legal for ieee_mode functions. We should be avoiding
530 // them for functions that don't have ieee_mode enabled, so just say they are
531 // legal.
533 {MVT::f32, MVT::f64}, Legal);
534
535 if (Subtarget->haveRoundOpsF64())
537 Legal);
538 else
540 MVT::f64, Custom);
541
543 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
544 Legal);
545 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
546
549
550 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
551 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
552
553 // Custom lower these because we can't specify a rule based on an illegal
554 // source bf16.
557
558 if (Subtarget->has16BitInsts()) {
561 MVT::i16, Legal);
562
563 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
564
566 MVT::i16, Expand);
567
571 ISD::CTPOP},
572 MVT::i16, Promote);
573
575
576 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
577
579 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
581 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
582
586
588
589 // F16 - Constant Actions.
592
593 // F16 - Load/Store Actions.
595 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
597 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
598
599 // BF16 - Load/Store Actions.
601 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
603 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
604
605 // F16 - VOP1 Actions.
608 MVT::f16, Custom);
609
612
613 // F16 - VOP2 Actions.
614 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
615 Expand);
619
620 // F16 - VOP3 Actions.
622 if (STI.hasMadF16())
624
625 for (MVT VT :
626 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
627 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
628 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
629 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
630 switch (Op) {
631 case ISD::LOAD:
632 case ISD::STORE:
634 case ISD::BITCAST:
635 case ISD::UNDEF:
640 case ISD::IS_FPCLASS:
641 break;
645 break;
646 default:
648 break;
649 }
650 }
651 }
652
653 // v_perm_b32 can handle either of these.
654 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
656
657 // XXX - Do these do anything? Vector constants turn into build_vector.
658 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
659
660 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
661 Legal);
662
664 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
666 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
667
669 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
671 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
672
673 setOperationAction(ISD::AND, MVT::v2i16, Promote);
674 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
675 setOperationAction(ISD::OR, MVT::v2i16, Promote);
676 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
677 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
678 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
679
681 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
684 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
685 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
690 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
692 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
693
695 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
697 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
698 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
699 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
700
702 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
704 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
705
707 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
709 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
711 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
712
713 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
715 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
716 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
719
721 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
723 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
724 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
725 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
726
727 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
729 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
733
735 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
737 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
738 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
739 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
740
742 MVT::v2i32, Expand);
744
746 MVT::v4i32, Expand);
747
749 MVT::v8i32, Expand);
750
751 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
752 Subtarget->hasVOP3PInsts() ? Legal : Custom);
753
754 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
755 // This isn't really legal, but this avoids the legalizer unrolling it (and
756 // allows matching fneg (fabs x) patterns)
757 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
758
761
764 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
765 Custom);
766
768 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
769 Expand);
770
771 for (MVT Vec16 :
772 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
773 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
776 Vec16, Custom);
778 }
779 }
780
781 if (Subtarget->hasVOP3PInsts()) {
785 MVT::v2i16, Legal);
786
789 MVT::v2f16, Legal);
790
792 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
793
795 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
796 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
797 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
798 Custom);
799
800 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
801 // Split vector operations.
806 VT, Custom);
807
808 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
809 // Split vector operations.
811 VT, Custom);
812
813 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
814 Custom);
815
816 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
817 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
818 Custom);
819
820 if (Subtarget->hasPackedFP32Ops()) {
822 MVT::v2f32, Legal);
824 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
825 Custom);
826 }
827 }
828
830
831 if (Subtarget->has16BitInsts()) {
833 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
835 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
836 } else {
837 // Legalization hack.
838 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
839
841 }
842
844 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
845 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
846 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
847 MVT::v32f16, MVT::v32bf16},
848 Custom);
849
851
852 if (Subtarget->hasScalarSMulU64())
854
855 if (Subtarget->hasMad64_32())
857
858 if (Subtarget->hasPrefetch())
860
861 if (Subtarget->hasIEEEMinMax()) {
863 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
865 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
866 Custom);
867 } else {
868 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
869 if (Subtarget->hasMinimum3Maximum3F32())
871
872 if (Subtarget->hasMinimum3Maximum3PKF16())
874 }
875
877 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
878 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
879 MVT::i8},
880 Custom);
881
883 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
884 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
885 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
886 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
887 Custom);
888
890 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
891 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
892 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
893 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
894 Custom);
895
901
902 // TODO: Could move this to custom lowering, could benefit from combines on
903 // extract of relevant bits.
905
907
908 if (Subtarget->hasBF16ConversionInsts()) {
912 }
913
914 if (Subtarget->hasCvtPkF16F32Inst()) {
916 }
917
920 ISD::SUB,
922 ISD::MUL,
923 ISD::FADD,
924 ISD::FSUB,
925 ISD::FDIV,
926 ISD::FMUL,
933 ISD::FMA,
934 ISD::SMIN,
935 ISD::SMAX,
936 ISD::UMIN,
937 ISD::UMAX,
940 ISD::SMIN,
941 ISD::SMAX,
942 ISD::UMIN,
943 ISD::UMAX,
944 ISD::AND,
945 ISD::OR,
946 ISD::XOR,
947 ISD::SHL,
948 ISD::SRL,
949 ISD::SRA,
950 ISD::FSHR,
960
961 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
963
964 // All memory operations. Some folding on the pointer operand is done to help
965 // matching the constant offsets in the addressing modes.
990
991 // FIXME: In other contexts we pretend this is a per-function property.
993
995}
996
997const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
998
1000 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1001 return RCRegs;
1002}
1003
1004//===----------------------------------------------------------------------===//
1005// TargetLowering queries
1006//===----------------------------------------------------------------------===//
1007
1008// v_mad_mix* support a conversion from f16 to f32.
1009//
1010// There is only one special case when denormals are enabled we don't currently,
1011// where this is OK to use.
1012bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1013 EVT DestVT, EVT SrcVT) const {
1014 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1015 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1016 DestVT.getScalarType() == MVT::f32 &&
1017 SrcVT.getScalarType() == MVT::f16 &&
1018 // TODO: This probably only requires no input flushing?
1020}
1021
1023 LLT DestTy, LLT SrcTy) const {
1024 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1025 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1026 DestTy.getScalarSizeInBits() == 32 &&
1027 SrcTy.getScalarSizeInBits() == 16 &&
1028 // TODO: This probably only requires no input flushing?
1029 denormalModeIsFlushAllF32(*MI.getMF());
1030}
1031
1033 // SI has some legal vector types, but no legal vector operations. Say no
1034 // shuffles are legal in order to prefer scalarizing some vector operations.
1035 return false;
1036}
1037
1040 EVT VT) const {
1043
1044 if (VT.isVector()) {
1045 EVT ScalarVT = VT.getScalarType();
1046 unsigned Size = ScalarVT.getSizeInBits();
1047 if (Size == 16) {
1048 if (Subtarget->has16BitInsts()) {
1049 if (VT.isInteger())
1050 return MVT::v2i16;
1051 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1052 }
1053 return VT.isInteger() ? MVT::i32 : MVT::f32;
1054 }
1055
1056 if (Size < 16)
1057 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1058 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1059 }
1060
1061 if (VT.getSizeInBits() > 32)
1062 return MVT::i32;
1063
1065}
1066
1069 EVT VT) const {
1072
1073 if (VT.isVector()) {
1074 unsigned NumElts = VT.getVectorNumElements();
1075 EVT ScalarVT = VT.getScalarType();
1076 unsigned Size = ScalarVT.getSizeInBits();
1077
1078 // FIXME: Should probably promote 8-bit vectors to i16.
1079 if (Size == 16 && Subtarget->has16BitInsts())
1080 return (NumElts + 1) / 2;
1081
1082 if (Size <= 32)
1083 return NumElts;
1084
1085 if (Size > 32)
1086 return NumElts * ((Size + 31) / 32);
1087 } else if (VT.getSizeInBits() > 32)
1088 return (VT.getSizeInBits() + 31) / 32;
1089
1091}
1092
1094 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1095 unsigned &NumIntermediates, MVT &RegisterVT) const {
1096 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1097 unsigned NumElts = VT.getVectorNumElements();
1098 EVT ScalarVT = VT.getScalarType();
1099 unsigned Size = ScalarVT.getSizeInBits();
1100 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1101 // support, but unless we can properly handle 3-vectors, it will be still be
1102 // inconsistent.
1103 if (Size == 16 && Subtarget->has16BitInsts()) {
1104 if (ScalarVT == MVT::bf16) {
1105 RegisterVT = MVT::i32;
1106 IntermediateVT = MVT::v2bf16;
1107 } else {
1108 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1109 IntermediateVT = RegisterVT;
1110 }
1111 NumIntermediates = (NumElts + 1) / 2;
1112 return NumIntermediates;
1113 }
1114
1115 if (Size == 32) {
1116 RegisterVT = ScalarVT.getSimpleVT();
1117 IntermediateVT = RegisterVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1120 }
1121
1122 if (Size < 16 && Subtarget->has16BitInsts()) {
1123 // FIXME: Should probably form v2i16 pieces
1124 RegisterVT = MVT::i16;
1125 IntermediateVT = ScalarVT;
1126 NumIntermediates = NumElts;
1127 return NumIntermediates;
1128 }
1129
1130 if (Size != 16 && Size <= 32) {
1131 RegisterVT = MVT::i32;
1132 IntermediateVT = ScalarVT;
1133 NumIntermediates = NumElts;
1134 return NumIntermediates;
1135 }
1136
1137 if (Size > 32) {
1138 RegisterVT = MVT::i32;
1139 IntermediateVT = RegisterVT;
1140 NumIntermediates = NumElts * ((Size + 31) / 32);
1141 return NumIntermediates;
1142 }
1143 }
1144
1146 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1147}
1148
1150 const DataLayout &DL, Type *Ty,
1151 unsigned MaxNumLanes) {
1152 assert(MaxNumLanes != 0);
1153
1154 LLVMContext &Ctx = Ty->getContext();
1155 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1156 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1157 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1158 NumElts);
1159 }
1160
1161 return TLI.getValueType(DL, Ty);
1162}
1163
1164// Peek through TFE struct returns to only use the data size.
1166 const DataLayout &DL, Type *Ty,
1167 unsigned MaxNumLanes) {
1168 auto *ST = dyn_cast<StructType>(Ty);
1169 if (!ST)
1170 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1171
1172 // TFE intrinsics return an aggregate type.
1173 assert(ST->getNumContainedTypes() == 2 &&
1174 ST->getContainedType(1)->isIntegerTy(32));
1175 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1176}
1177
1178/// Map address space 7 to MVT::v5i32 because that's its in-memory
1179/// representation. This return value is vector-typed because there is no
1180/// MVT::i160 and it is not clear if one can be added. While this could
1181/// cause issues during codegen, these address space 7 pointers will be
1182/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1183/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1184/// modeling, to work.
1186 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1187 return MVT::v5i32;
1189 DL.getPointerSizeInBits(AS) == 192)
1190 return MVT::v6i32;
1192}
1193/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1194/// v8i32 when padding is added.
1195/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1196/// also v8i32 with padding.
1198 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1199 DL.getPointerSizeInBits(AS) == 160) ||
1201 DL.getPointerSizeInBits(AS) == 192))
1202 return MVT::v8i32;
1204}
1205
// Appears to be SITargetLowering::getTgtMemIntrinsic: populate Info (memVT,
// ptrVal, size/alignment, fallback address space, MMO flags) for AMDGPU
// memory intrinsics so the SelectionDAG builder can attach machine memory
// operands; returns true iff IntrID is recognized as a memory intrinsic.
// NOTE(review): the declaration's first line and many single lines (mostly
// Info.opc / Info.flags assignments that referenced cross-linked identifiers)
// were dropped in extraction -- visible as gaps in the embedded numbering.
// Treat missing lines as unknown rather than absent; confirm against upstream.
 1207                                          const CallInst &CI,
 1208                                          MachineFunction &MF,
 1209                                          unsigned IntrID) const {
 1211  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
 1213  if (CI.hasMetadata(LLVMContext::MD_nontemporal))
 1215  Info.flags |= getTargetMMOFlags(CI);
 1216
// Buffer/image ("rsrc") intrinsics: classify by memory effects from the
// intrinsic's attributes.
 1217  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
 1219    AttributeList Attr =
 1221    MemoryEffects ME = Attr.getMemoryEffects();
 1222    if (ME.doesNotAccessMemory())
 1223      return false;
 1224
 1225    // TODO: Should images get their own address space?
 1226    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
 1227
 1228    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
 1229    if (RsrcIntr->IsImage) {
 1232      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
 1233      Info.align.reset();
 1234    }
 1235
 1236    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
 1237    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
 1238      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
 1239        // We conservatively set the memory operand of a buffer intrinsic to the
 1240        // base resource pointer, so that we can access alias information about
 1241        // those pointers. Cases like "this points at the same value
 1242        // but with a different offset" are handled in
 1243        // areMemAccessesTriviallyDisjoint.
 1244        Info.ptrVal = RsrcArg;
 1245    }
 1246
// The cache-policy ("aux") operand is always last; s_buffer_prefetch_data
// has no such operand.
 1247    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
 1248    if (!IsSPrefetch) {
 1249      auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
 1250      if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
 1252    }
 1253
 1255    if (ME.onlyReadsMemory()) {
 1256      if (RsrcIntr->IsImage) {
 1257        unsigned MaxNumLanes = 4;
 1258
 1259        if (!BaseOpcode->Gather4) {
 1260          // If this isn't a gather, we may have excess loaded elements in the
 1261          // IR type. Check the dmask for the real number of elements loaded.
 1262          unsigned DMask =
 1263              cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
 1264          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 1265        }
 1266
 1267        Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
 1268                                             CI.getType(), MaxNumLanes);
 1269      } else {
 1270        Info.memVT =
 1272                                     std::numeric_limits<unsigned>::max());
 1273      }
 1274
 1275      // FIXME: What does alignment mean for an image?
 1278    } else if (ME.onlyWritesMemory()) {
 1280
 1281      Type *DataTy = CI.getArgOperand(0)->getType();
 1282      if (RsrcIntr->IsImage) {
 1283        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
 1284        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 1285        Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
 1286                                           DMaskLanes);
 1287      } else
 1288        Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 1289
 1291    } else {
 1292      // Atomic, NoReturn Sampler or prefetch
 1295      Info.flags |=
 1297
 1298      if (!IsSPrefetch)
 1300
 1301      switch (IntrID) {
 1302      default:
 1303        if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
 1304          // Fake memory access type for no return sampler intrinsics
 1305          Info.memVT = MVT::i32;
 1306        } else {
 1307          // XXX - Should this be volatile without known ordering?
 1309          Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
 1310        }
 1311        break;
 1312      case Intrinsic::amdgcn_raw_buffer_load_lds:
 1313      case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
 1314      case Intrinsic::amdgcn_struct_buffer_load_lds:
 1315      case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 1316        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 1317        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
 1318        Info.ptrVal = CI.getArgOperand(1);
 1319        return true;
 1320      }
 1321      case Intrinsic::amdgcn_raw_atomic_buffer_load:
 1322      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
 1323      case Intrinsic::amdgcn_struct_atomic_buffer_load:
 1324      case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
 1325        Info.memVT =
 1327                                     std::numeric_limits<unsigned>::max());
 1328        Info.flags &= ~MachineMemOperand::MOStore;
 1329        return true;
 1330      }
 1331      }
 1332    }
 1333    return true;
 1334  }
 1335
// Non-rsrc intrinsics handled individually.
 1336  switch (IntrID) {
 1337  case Intrinsic::amdgcn_ds_ordered_add:
 1338  case Intrinsic::amdgcn_ds_ordered_swap: {
 1340    Info.memVT = MVT::getVT(CI.getType());
 1341    Info.ptrVal = CI.getOperand(0);
 1342    Info.align.reset();
 1344
 1345    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
 1346    if (!Vol->isZero())
 1348
 1349    return true;
 1350  }
 1351  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
 1352  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
 1354    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
 1355    Info.ptrVal = nullptr;
 1356    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
 1358    return true;
 1359  }
 1360  case Intrinsic::amdgcn_ds_append:
 1361  case Intrinsic::amdgcn_ds_consume: {
 1363    Info.memVT = MVT::getVT(CI.getType());
 1364    Info.ptrVal = CI.getOperand(0);
 1365    Info.align.reset();
 1367
 1368    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
 1369    if (!Vol->isZero())
 1371
 1372    return true;
 1373  }
 1374  case Intrinsic::amdgcn_global_atomic_csub: {
 1376    Info.memVT = MVT::getVT(CI.getType());
 1377    Info.ptrVal = CI.getOperand(0);
 1378    Info.align.reset();
 1381    return true;
 1382  }
 1383  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
 1385    Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
 1386
 1387    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
 1388    Info.align.reset();
 1389    Info.flags |=
 1391    return true;
 1392  }
 1393  case Intrinsic::amdgcn_global_atomic_fmin_num:
 1394  case Intrinsic::amdgcn_global_atomic_fmax_num:
 1395  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
 1396  case Intrinsic::amdgcn_flat_atomic_fmin_num:
 1397  case Intrinsic::amdgcn_flat_atomic_fmax_num:
 1398  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
 1400    Info.memVT = MVT::getVT(CI.getType());
 1401    Info.ptrVal = CI.getOperand(0);
 1402    Info.align.reset();
 1406    return true;
 1407  }
 1408  case Intrinsic::amdgcn_global_load_tr_b64:
 1409  case Intrinsic::amdgcn_global_load_tr_b128:
 1410  case Intrinsic::amdgcn_ds_read_tr4_b64:
 1411  case Intrinsic::amdgcn_ds_read_tr6_b96:
 1412  case Intrinsic::amdgcn_ds_read_tr8_b64:
 1413  case Intrinsic::amdgcn_ds_read_tr16_b64: {
 1415    Info.memVT = MVT::getVT(CI.getType());
 1416    Info.ptrVal = CI.getOperand(0);
 1417    Info.align.reset();
 1419    return true;
 1420  }
 1421  case Intrinsic::amdgcn_ds_gws_init:
 1422  case Intrinsic::amdgcn_ds_gws_barrier:
 1423  case Intrinsic::amdgcn_ds_gws_sema_v:
 1424  case Intrinsic::amdgcn_ds_gws_sema_br:
 1425  case Intrinsic::amdgcn_ds_gws_sema_p:
 1426  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
 1428
 1429    const GCNTargetMachine &TM =
 1430        static_cast<const GCNTargetMachine &>(getTargetMachine());
 1431
 1433    Info.ptrVal = MFI->getGWSPSV(TM);
 1434
 1435    // This is an abstract access, but we need to specify a type and size.
 1436    Info.memVT = MVT::i32;
 1437    Info.size = 4;
 1438    Info.align = Align(4);
 1439
 1440    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
 1442    else
 1444    return true;
 1445  }
 1446  case Intrinsic::amdgcn_global_load_lds: {
 1448    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 1449    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
 1450    Info.ptrVal = CI.getArgOperand(1);
 1452    return true;
 1453  }
 1454  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
 1456
 1457    const GCNTargetMachine &TM =
 1458        static_cast<const GCNTargetMachine &>(getTargetMachine());
 1459
 1461    Info.ptrVal = MFI->getGWSPSV(TM);
 1462
 1463    // This is an abstract access, but we need to specify a type and size.
 1464    Info.memVT = MVT::i32;
 1465    Info.size = 4;
 1466    Info.align = Align(4);
 1467
 1469    return true;
 1470  }
 1471  case Intrinsic::amdgcn_s_prefetch_data: {
 1473    Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
 1474    Info.ptrVal = CI.getArgOperand(0);
 1476    return true;
 1477  }
 1478  default:
 1479    return false;
 1480  }
 1481}
1482
// Appears to be SITargetLowering::CollectTargetIntrinsicOperands (the first
// declaration line, embedded 1483, was lost in extraction). For
// amdgcn.addrspacecast.nonnull, append the source and destination address
// spaces as two extra i32 target constants, because the DAG's value types do
// not carry address-space information.
 1484    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
 1485  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
 1486  case Intrinsic::amdgcn_addrspacecast_nonnull: {
 1487    // The DAG's ValueType loses the addrspaces.
 1488    // Add them as 2 extra Constant operands "from" and "to".
 1489    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
 1490    unsigned DstAS = I.getType()->getPointerAddressSpace();
 1491    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
 1492    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
 1493    break;
 1494  }
 1495  default:
 1496    break;
 1497  }
 1498}
1499
// Appears to be SITargetLowering::getAddrModeArguments (the declaration's
// leading lines, embedded 1500-1501, were lost in extraction). For the listed
// AMDGPU memory intrinsics, report the pointer operand (into Ops) and the
// accessed type so addressing-mode optimizations (e.g. LSR) can reason about
// folding offsets. Returns false for intrinsics with no single pointer
// operand.
 1502                                            Type *&AccessTy) const {
 1503  Value *Ptr = nullptr;
 1504  switch (II->getIntrinsicID()) {
 1505  case Intrinsic::amdgcn_atomic_cond_sub_u32:
 1506  case Intrinsic::amdgcn_ds_append:
 1507  case Intrinsic::amdgcn_ds_consume:
 1508  case Intrinsic::amdgcn_ds_read_tr4_b64:
 1509  case Intrinsic::amdgcn_ds_read_tr6_b96:
 1510  case Intrinsic::amdgcn_ds_read_tr8_b64:
 1511  case Intrinsic::amdgcn_ds_read_tr16_b64:
 1512  case Intrinsic::amdgcn_ds_ordered_add:
 1513  case Intrinsic::amdgcn_ds_ordered_swap:
 1514  case Intrinsic::amdgcn_flat_atomic_fmax_num:
 1515  case Intrinsic::amdgcn_flat_atomic_fmin_num:
 1516  case Intrinsic::amdgcn_global_atomic_csub:
 1517  case Intrinsic::amdgcn_global_atomic_fmax_num:
 1518  case Intrinsic::amdgcn_global_atomic_fmin_num:
 1519  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
 1520  case Intrinsic::amdgcn_global_load_tr_b64:
 1521  case Intrinsic::amdgcn_global_load_tr_b128:
 1522    Ptr = II->getArgOperand(0);
 1523    break;
 1524  case Intrinsic::amdgcn_global_load_lds:
// global.load.lds takes (global ptr, lds ptr, ...); the global pointer is
// operand 1's sibling -- here the second operand is reported.
 1525    Ptr = II->getArgOperand(1);
 1526    break;
 1527  default:
 1528    return false;
 1529  }
 1530  AccessTy = II->getType();
 1531  Ops.push_back(Ptr);
 1532  return true;
 1533}
1534
// Appears to be SITargetLowering::isLegalFlatAddressingMode (the declaration's
// first line, embedded 1535, was lost in extraction). Decide whether AM is
// representable by a FLAT-family instruction for AddrSpace: no scale, and an
// immediate offset only when the subtarget supports flat offsets and the
// offset is legal for the chosen FLAT variant.
// NOTE(review): embedded lines 1544-1546, which compute FlatVariant from
// AddrSpace (global/scratch/flat), were also dropped -- confirm upstream.
 1536                                                     unsigned AddrSpace) const {
 1537  if (!Subtarget->hasFlatInstOffsets()) {
 1538    // Flat instructions do not have offsets, and only have the register
 1539    // address.
 1540    return AM.BaseOffs == 0 && AM.Scale == 0;
 1541  }
 1542
 1543  decltype(SIInstrFlags::FLAT) FlatVariant =
 1547
 1548  return AM.Scale == 0 &&
 1549         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
 1550                                  AM.BaseOffs, AddrSpace, FlatVariant));
 1551}
1552
// Appears to be SITargetLowering::isLegalGlobalAddressingMode (declaration
// line, embedded 1553, lost in extraction). NOTE(review): embedded lines 1555
// and 1567 were also dropped; 1555 presumably delegates to
// isLegalFlatAddressingMode for targets with flat-global instructions and
// 1567 is the restricted return inside the FLAT-for-global branch -- confirm
// against upstream.
 1554  if (Subtarget->hasFlatGlobalInsts())
 1556
 1557  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
 1558    // Assume the we will use FLAT for all global memory accesses
 1559    // on VI.
 1560    // FIXME: This assumption is currently wrong. On VI we still use
 1561    // MUBUF instructions for the r + i addressing mode. As currently
 1562    // implemented, the MUBUF instructions only work on buffer < 4GB.
 1563    // It may be possible to support > 4GB buffers with MUBUF instructions,
 1564    // by setting the stride value in the resource descriptor which would
 1565    // increase the size limit to (stride * 4GB). However, this is risky,
 1566    // because it has never been validated.
 1568  }
 1569
 1570  return isLegalMUBUFAddressingMode(AM);
 1571}
1572
1573bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1574 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1575 // additionally can do r + r + i with addr64. 32-bit has more addressing
1576 // mode options. Depending on the resource constant, it can also do
1577 // (i64 r0) + (i32 r1) * (i14 i).
1578 //
1579 // Private arrays end up using a scratch buffer most of the time, so also
1580 // assume those use MUBUF instructions. Scratch loads / stores are currently
1581 // implemented as mubuf instructions with offen bit set, so slightly
1582 // different than the normal addr64.
1583 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1584 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1585 return false;
1586
1587 // FIXME: Since we can split immediate into soffset and immediate offset,
1588 // would it make sense to allow any immediate?
1589
1590 switch (AM.Scale) {
1591 case 0: // r + i or just i, depending on HasBaseReg.
1592 return true;
1593 case 1:
1594 return true; // We have r + r or r + i.
1595 case 2:
1596 if (AM.HasBaseReg) {
1597 // Reject 2 * r + r.
1598 return false;
1599 }
1600
1601 // Allow 2 * r as r + r
1602 // Or 2 * r + i is allowed as r + r + i.
1603 return true;
1604 default: // Don't allow n * r
1605 return false;
1606 }
1607}
1608
// Appears to be SITargetLowering::isLegalAddressingMode (the declaration's
// first line, embedded 1609, was lost in extraction), dispatching the
// legality check on the address space: global, constant/scalar (SMRD/SMEM
// offset encodings per generation), private (scratch), LOCAL/REGION (DS), and
// flat/unknown. NOTE(review): several condition lines (embedded 1621-1623,
// 1639, 1664, 1684, 1705, 1711) were dropped in extraction; confirm the exact
// address-space tests against upstream.
 1610                                             const AddrMode &AM, Type *Ty,
 1611                                             unsigned AS,
 1612                                             Instruction *I) const {
 1613  // No global is ever allowed as a base.
 1614  if (AM.BaseGV)
 1615    return false;
 1616
 1617  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
 1618    return isLegalGlobalAddressingMode(AM);
 1619
 1620  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
 1624    // If the offset isn't a multiple of 4, it probably isn't going to be
 1625    // correctly aligned.
 1626    // FIXME: Can we get the real alignment here?
 1627    if (AM.BaseOffs % 4 != 0)
 1628      return isLegalMUBUFAddressingMode(AM);
 1629
 1630    if (!Subtarget->hasScalarSubwordLoads()) {
 1631      // There are no SMRD extloads, so if we have to do a small type access we
 1632      // will use a MUBUF load.
 1633      // FIXME?: We also need to do this if unaligned, but we don't know the
 1634      // alignment here.
 1635      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
 1636        return isLegalGlobalAddressingMode(AM);
 1637    }
 1638
 1640      // SMRD instructions have an 8-bit, dword offset on SI.
 1641      if (!isUInt<8>(AM.BaseOffs / 4))
 1642        return false;
 1643    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
 1644      // On CI+, this can also be a 32-bit literal constant offset. If it fits
 1645      // in 8-bits, it can use a smaller encoding.
 1646      if (!isUInt<32>(AM.BaseOffs / 4))
 1647        return false;
 1648    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
 1649      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
 1650      if (!isUInt<20>(AM.BaseOffs))
 1651        return false;
 1652    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
 1653      // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
 1654      // for S_BUFFER_* instructions).
 1655      if (!isInt<21>(AM.BaseOffs))
 1656        return false;
 1657    } else {
 1658      // On GFX12, all offsets are signed 24-bit in bytes.
 1659      if (!isInt<24>(AM.BaseOffs))
 1660        return false;
 1661    }
 1662
 1663    if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
 1665        AM.BaseOffs < 0) {
 1666      // Scalar (non-buffer) loads can only use a negative offset if
 1667      // soffset+offset is non-negative. Since the compiler can only prove that
 1668      // in a few special cases, it is safer to claim that negative offsets are
 1669      // not supported.
 1670      return false;
 1671    }
 1672
 1673    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
 1674      return true;
 1675
 1676    if (AM.Scale == 1 && AM.HasBaseReg)
 1677      return true;
 1678
 1679    return false;
 1680  }
 1681
 1682  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
 1683    return Subtarget->enableFlatScratch()
 1685               : isLegalMUBUFAddressingMode(AM);
 1686
 1687  if (AS == AMDGPUAS::LOCAL_ADDRESS ||
 1688      (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
 1689    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
 1690    // field.
 1691    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
 1692    // an 8-bit dword offset but we don't know the alignment here.
 1693    if (!isUInt<16>(AM.BaseOffs))
 1694      return false;
 1695
 1696    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
 1697      return true;
 1698
 1699    if (AM.Scale == 1 && AM.HasBaseReg)
 1700      return true;
 1701
 1702    return false;
 1703  }
 1704
 1706    // For an unknown address space, this usually means that this is for some
 1707    // reason being used for pure arithmetic, and not based on some addressing
 1708    // computation. We don't have instructions that compute pointers with any
 1709    // addressing modes, so treat them as having no offset like flat
 1710    // instructions.
 1712  }
 1713
 1714  // Assume a user alias of global for unknown address spaces.
 1715  return isLegalGlobalAddressingMode(AM);
 1716}
1717
// Appears to be SITargetLowering::canMergeStoresTo (declaration line, embedded
// 1718, lost in extraction): cap merged-store width by address space -- 4
// dwords for the first (dropped) AS check, the max private element size for
// PRIVATE, 2 dwords for the next (dropped) AS check, and unlimited otherwise.
// NOTE(review): the address-space condition lines 1720 and 1726 were dropped;
// confirm which spaces they test against upstream.
 1719                                        const MachineFunction &MF) const {
 1721    return (MemVT.getSizeInBits() <= 4 * 32);
 1722  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
 1723    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
 1724    return (MemVT.getSizeInBits() <= MaxPrivateBits);
 1725  }
 1727    return (MemVT.getSizeInBits() <= 2 * 32);
 1728  return true;
 1729}
1730
// Appears to be SITargetLowering::allowsMisalignedMemoryAccessesImpl (the
// declaration's first line, embedded 1731, was lost in extraction). Decide
// whether a Size-bit access in AddrSpace with the given Alignment is allowed,
// and optionally report a relative speed rank via IsFast (0 = slowest; see
// the in-body comments for the ranking semantics).
 1732    unsigned Size, unsigned AddrSpace, Align Alignment,
 1733    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
 1734  if (IsFast)
 1735    *IsFast = 0;
 1736
 1737  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
 1738      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
 1739    // Check if alignment requirements for ds_read/write instructions are
 1740    // disabled.
 1741    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
 1742      return false;
 1743
 1744    Align RequiredAlignment(
 1745        PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
 1746    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
 1747        Alignment < RequiredAlignment)
 1748      return false;
 1749
 1750    // Either, the alignment requirements are "enabled", or there is an
 1751    // unaligned LDS access related hardware bug though alignment requirements
 1752    // are "disabled". In either case, we need to check for proper alignment
 1753    // requirements.
 1754    //
 1755    switch (Size) {
 1756    case 64:
 1757      // SI has a hardware bug in the LDS / GDS bounds checking: if the base
 1758      // address is negative, then the instruction is incorrectly treated as
 1759      // out-of-bounds even if base + offsets is in bounds. Split vectorized
 1760      // loads here to avoid emitting ds_read2_b32. We may re-combine the
 1761      // load later in the SILoadStoreOptimizer.
 1762      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
 1763        return false;
 1764
 1765      // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
 1766      // can do a 4 byte aligned, 8 byte access in a single operation using
 1767      // ds_read2/write2_b32 with adjacent offsets.
 1768      RequiredAlignment = Align(4);
 1769
 1770      if (Subtarget->hasUnalignedDSAccessEnabled()) {
 1771        // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
 1772        // ds_write2_b32 depending on the alignment. In either case with either
 1773        // alignment there is no faster way of doing this.
 1774
 1775        // The numbers returned here and below are not additive, it is a 'speed
 1776        // rank'. They are just meant to be compared to decide if a certain way
 1777        // of lowering an operation is faster than another. For that purpose
 1778        // naturally aligned operation gets it bitsize to indicate that "it
 1779        // operates with a speed comparable to N-bit wide load". With the full
 1780        // alignment ds128 is slower than ds96 for example. If underaligned it
 1781        // is comparable to a speed of a single dword access, which would then
 1782        // mean 32 < 128 and it is faster to issue a wide load regardless.
 1783        // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
 1784        // wider load which will not be aligned anymore the latter is slower.
 1785        if (IsFast)
 1786          *IsFast = (Alignment >= RequiredAlignment) ? 64
 1787                    : (Alignment < Align(4))         ? 32
 1788                                                     : 1;
 1789        return true;
 1790      }
 1791
 1792      break;
 1793    case 96:
 1794      if (!Subtarget->hasDS96AndDS128())
 1795        return false;
 1796
 1797      // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
 1798      // gfx8 and older.
 1799
 1800      if (Subtarget->hasUnalignedDSAccessEnabled()) {
 1801        // Naturally aligned access is fastest. However, also report it is Fast
 1802        // if memory is aligned less than DWORD. A narrow load or store will be
 1803        // be equally slow as a single ds_read_b96/ds_write_b96, but there will
 1804        // be more of them, so overall we will pay less penalty issuing a single
 1805        // instruction.
 1806
 1807        // See comment on the values above.
 1808        if (IsFast)
 1809          *IsFast = (Alignment >= RequiredAlignment) ? 96
 1810                    : (Alignment < Align(4))         ? 32
 1811                                                     : 1;
 1812        return true;
 1813      }
 1814
 1815      break;
 1816    case 128:
 1817      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
 1818        return false;
 1819
 1820      // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
 1821      // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
 1822      // single operation using ds_read2/write2_b64.
 1823      RequiredAlignment = Align(8);
 1824
 1825      if (Subtarget->hasUnalignedDSAccessEnabled()) {
 1826        // Naturally aligned access is fastest. However, also report it is Fast
 1827        // if memory is aligned less than DWORD. A narrow load or store will be
 1828        // be equally slow as a single ds_read_b128/ds_write_b128, but there
 1829        // will be more of them, so overall we will pay less penalty issuing a
 1830        // single instruction.
 1831
 1832        // See comment on the values above.
 1833        if (IsFast)
 1834          *IsFast = (Alignment >= RequiredAlignment) ? 128
 1835                    : (Alignment < Align(4))         ? 32
 1836                                                     : 1;
 1837        return true;
 1838      }
 1839
 1840      break;
 1841    default:
 1842      if (Size > 32)
 1843        return false;
 1844
 1845      break;
 1846    }
 1847
 1848    // See comment on the values above.
 1849    // Note that we have a single-dword or sub-dword here, so if underaligned
 1850    // it is a slowest possible access, hence returned value is 0.
 1851    if (IsFast)
 1852      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
 1853
 1854    return Alignment >= RequiredAlignment ||
 1855           Subtarget->hasUnalignedDSAccessEnabled();
 1856  }
 1857
 1858  // FIXME: We have to be conservative here and assume that flat operations
 1859  // will access scratch.  If we had access to the IR function, then we
 1860  // could determine if any private memory was used in the function.
 1861  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
 1862      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
 1863    bool AlignedBy4 = Alignment >= Align(4);
 1864    if (IsFast)
 1865      *IsFast = AlignedBy4;
 1866
 1867    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
 1868  }
 1869
 1870  // So long as they are correct, wide global memory operations perform better
 1871  // than multiple smaller memory ops -- even when misaligned
 1872  if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
 1873    if (IsFast)
 1874      *IsFast = Size;
 1875
// NOTE(review): embedded line 1877 (the second operand of this ||, presumably
// an unaligned-buffer-access subtarget check) was dropped in extraction.
 1876    return Alignment >= Align(4) ||
 1878  }
 1879
 1880  // Smaller than dword value must be aligned.
 1881  if (Size < 32)
 1882    return false;
 1883
 1884  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
 1885  // byte-address are ignored, thus forcing Dword alignment.
 1886  // This applies to private, global, and constant memory.
 1887  if (IsFast)
 1888    *IsFast = 1;
 1889
 1890  return Size >= 32 && Alignment >= Align(4);
 1891}
1892
// Appears to be SITargetLowering::allowsMisalignedMemoryAccesses (declaration
// line 1893 and the delegating return's first line 1896 were dropped in
// extraction): thin wrapper that forwards to
// allowsMisalignedMemoryAccessesImpl with VT's size in bits.
 1894    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
 1895    unsigned *IsFast) const {
 1897                                            Alignment, Flags, IsFast);
 1898}
1899
// Appears to be SITargetLowering::getOptimalMemOpType (declaration line,
// embedded 1900, lost in extraction): pick a wide vector type for memcpy-like
// expansion instead of the default private-pointer-sized guess.
 1901    const MemOp &Op, const AttributeList &FuncAttributes) const {
 1902  // FIXME: Should account for address space here.
 1903
 1904  // The default fallback uses the private pointer size as a guess for a type to
 1905  // use. Make sure we switch these to 64-bit accesses.
 1906
 1907  if (Op.size() >= 16 &&
 1908      Op.isDstAligned(Align(4))) // XXX: Should only do for global
 1909    return MVT::v4i32;
 1910
 1911  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
 1912    return MVT::v2i32;
 1913
 1914  // Use the default.
 1915  return MVT::Other;
}
1917
// Appears to be SITargetLowering::isMemOpHasNoClobberedMemOperand (declaration
// line, embedded 1918, lost in extraction): true when the node's memory
// operand carries the target-private MONoClobber flag.
 1919  const MemSDNode *MemNode = cast<MemSDNode>(N);
 1920  return MemNode->getMemOperand()->getFlags() & MONoClobber;
 1921}
1922
// Appears to be SITargetLowering::isNonGlobalAddrSpace (declaration line 1923
// dropped in extraction; line 1925 -- the tail of this disjunction, presumably
// PRIVATE_ADDRESS -- was also dropped; confirm against upstream).
 1924  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
 1926}
1927
// Appears to be SITargetLowering::isFreeAddrSpaceCast (the declaration's first
// line, embedded 1928, was lost in extraction): a cast from FLAT is free (a
// truncate or no-op); otherwise defer to the target machine's no-op
// address-space-cast query.
 1929                                           unsigned DestAS) const {
 1930  // Flat -> private/local is a simple truncate.
 1931  // Flat -> global is no-op
 1932  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
 1933    return true;
 1934
 1935  const GCNTargetMachine &TM =
 1936      static_cast<const GCNTargetMachine &>(getTargetMachine());
 1937  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
 1938}
1939
// NOTE(review): heavily damaged by extraction -- the declaration (embedded
// 1940-1941) and both return statements (1944-1945) are missing. The surviving
// condition (fixed-width multi-element vector with <= 16-bit scalars) matches
// the shape of getPreferredVectorAction; confirm against upstream before
// editing.
 1942  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
 1943      VT.getScalarType().bitsLE(MVT::i16))
 1946}
1947
// Appears to be SITargetLowering::shouldConvertConstantLoadToIntImm (the
// declaration's first line, embedded 1948, was lost in extraction): always
// prefer materializing constants as immediates over loading them.
 1949                                                         Type *Ty) const {
 1950  // FIXME: Could be smarter if called for vector constants.
 1951  return true;
 1952}
1953
// Appears to be SITargetLowering::isExtractSubvectorCheap (declaration line
// 1954 and the guard condition line 1956 were dropped in extraction; 1956
// presumably checks legality of EXTRACT_SUBVECTOR for the result type).
// Only index-0 extraction is reported cheap.
 1955                                                  unsigned Index) const {
 1957    return false;
 1958
 1959  // TODO: Add more cases that are cheap.
 1960  return Index == 0;
 1961}
1962
1963bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
1964 // TODO: This should be more aggressive, particular for 16-bit element
1965 // vectors. However there are some mixed improvements and regressions.
1966 EVT EltTy = VT.getVectorElementType();
1967 return EltTy.getSizeInBits() % 32 == 0;
1968}
1969
// Appears to be SITargetLowering::isTypeDesirableForOp (declaration line,
// embedded 1970, lost in extraction): i16 is desirable only for load/store on
// subtargets with 16-bit instructions, and i1 is never desirable for setcc.
// NOTE(review): the final return (embedded 1986), presumably delegating to the
// base class, was also dropped.
 1971  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
 1972    switch (Op) {
 1973    case ISD::LOAD:
 1974    case ISD::STORE:
 1975      return true;
 1976    default:
 1977      return false;
 1978    }
 1979  }
 1980
 1981  // SimplifySetCC uses this function to determine whether or not it should
 1982  // create setcc with i1 operands.  We don't have instructions for i1 setcc.
 1983  if (VT == MVT::i1 && Op == ISD::SETCC)
 1984    return false;
 1985
 1987}
1988
// Build a pointer to byte Offset within the kernel argument segment: copy the
// kernarg segment pointer out of its live-in register and add Offset. If the
// kernel has no kernarg segment argument, return Offset as a bare constant.
// NOTE(review): embedded lines 1994-1996 (presumably MachineFunction/MRI and
// the PtrVT computation) and 2005-2006 were dropped in extraction.
1989SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
 1990                                                   const SDLoc &SL,
 1991                                                   SDValue Chain,
 1992                                                   uint64_t Offset) const {
 1993  const DataLayout &DL = DAG.getDataLayout();
 1997
 1998  auto [InputPtrReg, RC, ArgTy] =
 2000
 2001  // We may not have the kernarg segment argument if we have no kernel
 2002  // arguments.
 2003  if (!InputPtrReg)
 2004    return DAG.getConstant(Offset, SL, PtrVT);
 2006
 2007  SDValue BasePtr = DAG.getCopyFromReg(
 2008      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
 2009
 2010  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
 2011}
2012
// Return a pointer to the implicit argument area, expressed as a kernarg
// pointer at a computed offset. NOTE(review): embedded lines 2015-2016, which
// compute Offset (presumably from SIMachineFunctionInfo's explicit kernarg
// size), were dropped in extraction.
2013SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
 2014                                            const SDLoc &SL) const {
 2017  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
 2018}
2019
// Return the kernel's LDS size as an i32 constant when it is statically
// known, otherwise an empty SDValue. NOTE(review): embedded lines 2023 and
// 2025 (the Function lookup and the call producing KnownSize) were dropped in
// extraction.
2020SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
 2021                                         const SDLoc &SL) const {
 2022
 2024  std::optional<uint32_t> KnownSize =
 2026  if (KnownSize.has_value())
 2027    return DAG.getConstant(*KnownSize, SL, MVT::i32);
 2028  return SDValue();
}
2030
// Convert a loaded argument value of type MemVT to the expected type VT:
// narrow a widened vector, honor sext/zext argument flags with Assert nodes,
// then fp-round/extend or integer-extend/truncate as appropriate.
// NOTE(review): embedded lines 2037 and 2039-2040 (the widened-vector test and
// the NarrowedVT construction) were dropped in extraction.
2031SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
 2032                                         const SDLoc &SL, SDValue Val,
 2033                                         bool Signed,
 2034                                         const ISD::InputArg *Arg) const {
 2035  // First, if it is a widened vector, narrow it.
 2036  if (VT.isVector() &&
 2038    EVT NarrowedVT =
 2041    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
 2042                      DAG.getConstant(0, SL, MVT::i32));
 2043  }
 2044
 2045  // Then convert the vector elements or scalar value.
 2046  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
 2047    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
 2048    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
 2049  }
 2050
 2051  if (MemVT.isFloatingPoint())
 2052    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
 2053  else if (Signed)
 2054    Val = DAG.getSExtOrTrunc(Val, SL, VT);
 2055  else
 2056    Val = DAG.getZExtOrTrunc(Val, SL, VT);
 2057
 2058  return Val;
 2059}
2060
// Load a kernel argument from the kernarg segment at Offset. Sub-dword,
// under-aligned arguments are loaded as an aligned i32 and the relevant bits
// are shifted/truncated out so the load can merge with a neighbor; otherwise
// a plain typed load is used. Returns {value, chain}.
// NOTE(review): embedded lines 2065 (PtrInfo setup), 2081-2082 and 2096-2097
// (trailing MachineMemOperand-flag arguments of the two getLoad calls) were
// dropped in extraction.
2061SDValue SITargetLowering::lowerKernargMemParameter(
 2062    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
 2063    uint64_t Offset, Align Alignment, bool Signed,
 2064    const ISD::InputArg *Arg) const {
 2066
 2067  // Try to avoid using an extload by loading earlier than the argument address,
 2068  // and extracting the relevant bits. The load should hopefully be merged with
 2069  // the previous argument.
 2070  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
 2071    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
 2072    int64_t AlignDownOffset = alignDown(Offset, 4);
 2073    int64_t OffsetDiff = Offset - AlignDownOffset;
 2074
 2075    EVT IntVT = MemVT.changeTypeToInteger();
 2076
 2077    // TODO: If we passed in the base kernel offset we could have a better
 2078    // alignment than 4, but we don't really need it.
 2079    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
 2080    SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
 2083
 2084    SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
 2085    SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
 2086
 2087    SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
 2088    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
 2089    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
 2090
 2091    return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
 2092  }
 2093
 2094  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
 2095  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
 2098
 2099  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
 2100  return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
 2101}
2102
// Materialize an incoming stack-passed argument: byval arguments become a
// frame index; everything else becomes a fixed-object load with the extension
// kind implied by the CCValAssign location info.
// NOTE(review): embedded lines 2107 (MachineFunction lookup), 2126 (the
// ExtType declaration, presumably ISD::NON_EXTLOAD) and 2148 (the
// MachinePointerInfo argument of getExtLoad) were dropped in extraction.
2103SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
 2104                                              CCValAssign &VA, const SDLoc &SL,
 2105                                              SDValue Chain,
 2106                                              const ISD::InputArg &Arg) const {
 2108  MachineFrameInfo &MFI = MF.getFrameInfo();
 2109
 2110  if (Arg.Flags.isByVal()) {
 2111    unsigned Size = Arg.Flags.getByValSize();
 2112    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
 2113    return DAG.getFrameIndex(FrameIdx, MVT::i32);
 2114  }
 2115
 2116  unsigned ArgOffset = VA.getLocMemOffset();
 2117  unsigned ArgSize = VA.getValVT().getStoreSize();
 2118
 2119  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
 2120
 2121  // Create load nodes to retrieve arguments from the stack.
 2122  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
 2123  SDValue ArgValue;
 2124
 2125  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
 2127  MVT MemVT = VA.getValVT();
 2128
 2129  switch (VA.getLocInfo()) {
 2130  default:
 2131    break;
 2132  case CCValAssign::BCvt:
 2133    MemVT = VA.getLocVT();
 2134    break;
 2135  case CCValAssign::SExt:
 2136    ExtType = ISD::SEXTLOAD;
 2137    break;
 2138  case CCValAssign::ZExt:
 2139    ExtType = ISD::ZEXTLOAD;
 2140    break;
 2141  case CCValAssign::AExt:
 2142    ExtType = ISD::EXTLOAD;
 2143    break;
 2144  }
 2145
 2146  ArgValue = DAG.getExtLoad(
 2147      ExtType, SL, VA.getLocVT(), Chain, FIN,
 2149  return ArgValue;
 2150}
2151
// Fetch one of the special preloaded input values (workgroup IDs etc.) for
// the current function. On subtargets with architected SGPRs, workgroup IDs
// live in fixed TTMP registers (with masks separating packed X/Y/Z fields);
// otherwise the descriptor comes from SIMachineFunctionInfo. Returns a zero
// constant for a missing kernarg segment, undef when the corresponding
// amdgpu-no-* attribute promised the value is unused, and otherwise loads the
// register described by Reg.
// NOTE(review): embedded lines 2154 (the PVID parameter line), 2159 (CC
// lookup), 2171, 2173, 2178, 2183 (case labels of the switch) and 2196 were
// dropped in extraction.
2152SDValue SITargetLowering::getPreloadedValue(
 2153    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
 2155  const ArgDescriptor *Reg = nullptr;
 2156  const TargetRegisterClass *RC;
 2157  LLT Ty;
 2158
 2160  const ArgDescriptor WorkGroupIDX =
 2161      ArgDescriptor::createRegister(AMDGPU::TTMP9);
 2162  // If GridZ is not programmed in an entry function then the hardware will set
 2163  // it to all zeros, so there is no need to mask the GridY value in the low
 2164  // order bits.
 2165  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
 2166      AMDGPU::TTMP7,
 2167      AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
 2168  const ArgDescriptor WorkGroupIDZ =
 2169      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
 2170  if (Subtarget->hasArchitectedSGPRs() &&
 2172    switch (PVID) {
 2174      Reg = &WorkGroupIDX;
 2175      RC = &AMDGPU::SReg_32RegClass;
 2176      Ty = LLT::scalar(32);
 2177      break;
 2179      Reg = &WorkGroupIDY;
 2180      RC = &AMDGPU::SReg_32RegClass;
 2181      Ty = LLT::scalar(32);
 2182      break;
 2184      Reg = &WorkGroupIDZ;
 2185      RC = &AMDGPU::SReg_32RegClass;
 2186      Ty = LLT::scalar(32);
 2187      break;
 2188    default:
 2189      break;
 2190    }
 2191  }
 2192
 2193  if (!Reg)
 2194    std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
 2195  if (!Reg) {
 2197      // It's possible for a kernarg intrinsic call to appear in a kernel with
 2198      // no allocated segment, in which case we do not add the user sgpr
 2199      // argument, so just return null.
 2200      return DAG.getConstant(0, SDLoc(), VT);
 2201    }
 2202
 2203    // It's undefined behavior if a function marked with the amdgpu-no-*
 2204    // attributes uses the corresponding intrinsic.
 2205    return DAG.getUNDEF(VT);
 2206  }
 2207
 2208  return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
 2209}
2210
// Appears to be the static helper processPSInputArgs (its first declaration
// line, embedded 2211, was lost in extraction): walk the incoming arguments,
// tracking pixel-shader input allocation. Unused, unallocated PS inputs are
// recorded in Skipped; all other args are appended to Splits. Split arguments
// consume a single PSInputNum for the whole group.
 2212                                 CallingConv::ID CallConv,
 2213                                 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
 2214                                 FunctionType *FType,
 2215                                 SIMachineFunctionInfo *Info) {
 2216  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
 2217    const ISD::InputArg *Arg = &Ins[I];
 2218
 2219    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
 2220           "vector type argument should have been split");
 2221
 2222    // First check if it's a PS input addr.
 2223    if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
 2224        PSInputNum <= 15) {
 2225      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
 2226
 2227      // Inconveniently only the first part of the split is marked as isSplit,
 2228      // so skip to the end. We only want to increment PSInputNum once for the
 2229      // entire split argument.
 2230      if (Arg->Flags.isSplit()) {
 2231        while (!Arg->Flags.isSplitEnd()) {
 2232          assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
 2233                 "unexpected vector split in ps argument type");
 2234          if (!SkipArg)
 2235            Splits.push_back(*Arg);
 2236          Arg = &Ins[++I];
 2237        }
 2238      }
 2239
 2240      if (SkipArg) {
 2241        // We can safely skip PS inputs.
 2242        Skipped.set(Arg->getOrigArgIndex());
 2243        ++PSInputNum;
 2244        continue;
 2245      }
 2246
 2247      Info->markPSInputAllocated(PSInputNum);
 2248      if (Arg->Used)
 2249        Info->markPSInputEnabled(PSInputNum);
 2250
 2251      ++PSInputNum;
 2252    }
 2253
 2254    Splits.push_back(*Arg);
 2255  }
 2256}
2257
2258 // Allocate special inputs passed in VGPRs.
2260 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2261 SIMachineFunctionInfo &Info) const {
2262 const LLT S32 = LLT::scalar(32);
2264
2265 if (Info.hasWorkItemIDX()) {
2266 Register Reg = AMDGPU::VGPR0;
2267 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2268
2269 CCInfo.AllocateReg(Reg);
// When workitem IDs are packed (and Y is also present), X only occupies the
// low 10 bits of VGPR0; otherwise it owns the whole register.
2270 unsigned Mask =
2271 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2272 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2273 }
2274
2275 if (Info.hasWorkItemIDY()) {
2276 assert(Info.hasWorkItemIDX());
2277 if (Subtarget->hasPackedTID()) {
// Packed TID: Y lives in bits 10-19 of VGPR0 alongside X.
2278 Info.setWorkItemIDY(
2279 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2280 } else {
2281 unsigned Reg = AMDGPU::VGPR1;
2282 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2283
2284 CCInfo.AllocateReg(Reg);
2285 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2286 }
2287 }
2288
2289 if (Info.hasWorkItemIDZ()) {
2290 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2291 if (Subtarget->hasPackedTID()) {
// Packed TID: Z lives in bits 20-29 of VGPR0.
2292 Info.setWorkItemIDZ(
2293 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2294 } else {
2295 unsigned Reg = AMDGPU::VGPR2;
2296 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2297
2298 CCInfo.AllocateReg(Reg);
2299 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2300 }
2301 }
2302}
2303
2304// Try to allocate a VGPR at the end of the argument list, or if no argument
2305// VGPRs are left allocating a stack slot.
2306// If \p Mask is is given it indicates bitfield position in the register.
2307// If \p Arg is given use it with new ]p Mask instead of allocating new.
2308static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2309 ArgDescriptor Arg = ArgDescriptor()) {
2310 if (Arg.isSet())
2311 return ArgDescriptor::createArg(Arg, Mask);
2312
2313 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2314 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2315 if (RegIdx == ArgVGPRs.size()) {
2316 // Spill to stack required.
2317 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2318
2319 return ArgDescriptor::createStack(Offset, Mask);
2320 }
2321
2322 unsigned Reg = ArgVGPRs[RegIdx];
2323 Reg = CCInfo.AllocateReg(Reg);
2324 assert(Reg != AMDGPU::NoRegister);
2325
2326 MachineFunction &MF = CCInfo.getMachineFunction();
2327 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2328 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2329 return ArgDescriptor::createRegister(Reg, Mask);
2330}
2331
2333 const TargetRegisterClass *RC,
2334 unsigned NumArgRegs) {
// Only the first 32 registers of RC are considered candidates here;
// NumArgRegs is not consulted in this body — presumably callers always pass
// a count <= 32. TODO(review): confirm.
2335 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2336 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2337 if (RegIdx == ArgSGPRs.size())
2338 report_fatal_error("ran out of SGPRs for arguments");
2339
2340 unsigned Reg = ArgSGPRs[RegIdx];
// Allocation must succeed: getFirstUnallocated already proved it is free.
2341 Reg = CCInfo.AllocateReg(Reg);
2342 assert(Reg != AMDGPU::NoRegister);
2343
2344 MachineFunction &MF = CCInfo.getMachineFunction();
2345 MF.addLiveIn(Reg, RC);
2347}
2348
2349 // If this has a fixed position, we still should allocate the register in the
2350 // CCInfo state. Technically we could get away with this for values passed
2351 // outside of the normal argument range.
2353 const TargetRegisterClass *RC,
2354 MCRegister Reg) {
// Mark the fixed register as used in the CC state so later allocations skip
// it; the fixed register is expected to be free at this point.
2355 Reg = CCInfo.AllocateReg(Reg);
2356 assert(Reg != AMDGPU::NoRegister);
2357 MachineFunction &MF = CCInfo.getMachineFunction();
2358 MF.addLiveIn(Reg, RC);
2359}
2360
2361static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2362 if (Arg) {
2363 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2364 Arg.getRegister());
2365 } else
2366 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2367}
2368
2369static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2370 if (Arg) {
2371 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2372 Arg.getRegister());
2373 } else
2374 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2375}
2376
2377 /// Allocate implicit function VGPR arguments at the end of allocated user
2378 /// arguments.
2380 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2381 SIMachineFunctionInfo &Info) const {
// Each workitem ID is a 10-bit field. Threading the previously allocated Arg
// into the next call lets Y and Z reuse X's register with a shifted mask
// (see allocateVGPR32Input's Arg parameter).
2382 const unsigned Mask = 0x3ff;
2383 ArgDescriptor Arg;
2384
2385 if (Info.hasWorkItemIDX()) {
2386 Arg = allocateVGPR32Input(CCInfo, Mask);
2387 Info.setWorkItemIDX(Arg);
2388 }
2389
2390 if (Info.hasWorkItemIDY()) {
2391 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2392 Info.setWorkItemIDY(Arg);
2393 }
2394
2395 if (Info.hasWorkItemIDZ())
2396 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2397}
2398
2399 /// Allocate implicit function VGPR arguments in fixed registers.
2401 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2402 SIMachineFunctionInfo &Info) const {
2403 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2404 if (!Reg)
// NOTE(review): message typo "failed to allocated" — left as-is since it is a
// runtime string.
2405 report_fatal_error("failed to allocated VGPR for implicit arguments");
2406
// All three workitem IDs are packed into VGPR31, 10 bits each:
// X in [9:0], Y in [19:10], Z in [29:20].
2407 const unsigned Mask = 0x3ff;
2408 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2409 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2410 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2411}
2412
2414 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2415 SIMachineFunctionInfo &Info) const {
// Allocate the special input SGPRs a non-kernel function needs. Each helper
// either honors a fixed register already recorded in ArgInfo or grabs the
// next free argument SGPR (see allocateSGPR32Input/allocateSGPR64Input).
2416 auto &ArgInfo = Info.getArgInfo();
2417 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2418
2419 // TODO: Unify handling with private memory pointers.
2420 if (UserSGPRInfo.hasDispatchPtr())
2421 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2422
2423 if (UserSGPRInfo.hasQueuePtr())
2424 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2425
2426 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2427 // constant offset from the kernarg segment.
2428 if (Info.hasImplicitArgPtr())
2429 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2430
2431 if (UserSGPRInfo.hasDispatchID())
2432 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2433
2434 // flat_scratch_init is not applicable for non-kernel functions.
2435
2436 if (Info.hasWorkGroupIDX())
2437 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2438
2439 if (Info.hasWorkGroupIDY())
2440 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2441
2442 if (Info.hasWorkGroupIDZ())
2443 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2444
2445 if (Info.hasLDSKernelId())
2446 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2447}
2448
2449 // Allocate special inputs passed in user SGPRs.
2451 MachineFunction &MF,
2452 const SIRegisterInfo &TRI,
2453 SIMachineFunctionInfo &Info) const {
// Reserve the kernel's user SGPR preloads in ABI order; each Info.addXXX()
// call records the assigned register, which is then marked live-in and
// allocated in the CC state.
2454 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2455 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2456 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2457 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2459 }
2460
2461 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2462 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2463 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2464 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2465 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2466 }
2467
2468 if (UserSGPRInfo.hasDispatchPtr()) {
2469 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2470 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2471 CCInfo.AllocateReg(DispatchPtrReg);
2472 }
2473
2474 if (UserSGPRInfo.hasQueuePtr()) {
2475 Register QueuePtrReg = Info.addQueuePtr(TRI);
2476 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2477 CCInfo.AllocateReg(QueuePtrReg);
2478 }
2479
2480 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2482 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2483 CCInfo.AllocateReg(InputPtrReg);
2484
// Type the kernarg pointer as a 64-bit constant-address-space pointer so
// GlobalISel can use the live-in vreg directly.
2485 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2486 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2487 }
2488
2489 if (UserSGPRInfo.hasDispatchID()) {
2490 Register DispatchIDReg = Info.addDispatchID(TRI);
2491 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2492 CCInfo.AllocateReg(DispatchIDReg);
2493 }
2494
2495 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2496 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2497 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2498 CCInfo.AllocateReg(FlatScratchInitReg);
2499 }
2500
2501 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2502 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2503 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2504 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2505 }
2506
2507 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2508 // these from the dispatch pointer.
2509}
2510
2511 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2512 // sequential starting from the first argument.
2514 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2516 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2517 Function &F = MF.getFunction();
2518 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2519 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2520 bool InPreloadSequence = true;
2521 unsigned InIdx = 0;
2522 bool AlignedForImplictArgs = false;
2523 unsigned ImplicitArgOffset = 0;
2524 for (auto &Arg : F.args()) {
// Preloading stops at the first argument without the inreg attribute, or
// once an earlier argument could not be preloaded.
2525 if (!InPreloadSequence || !Arg.hasInRegAttr())
2526 break;
2527
2528 unsigned ArgIdx = Arg.getArgNo();
2529 // Don't preload non-original args or parts not in the current preload
2530 // sequence.
2531 if (InIdx < Ins.size() &&
2532 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2533 break;
2534
// Consume every Ins piece that belongs to this IR argument.
2535 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2536 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2537 InIdx++) {
// NOTE(review): the assert indexes ArgLocs by ArgIdx while the body uses
// ArgLocs[InIdx]; verify these coincide for the first split piece.
2538 assert(ArgLocs[ArgIdx].isMemLoc());
2539 auto &ArgLoc = ArgLocs[InIdx];
2540 const Align KernelArgBaseAlign = Align(16);
2541 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2542 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2543 unsigned NumAllocSGPRs =
2544 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2545
2546 // Fix alignment for hidden arguments.
2547 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2548 if (!AlignedForImplictArgs) {
2549 ImplicitArgOffset =
2550 alignTo(LastExplicitArgOffset,
2551 Subtarget->getAlignmentForImplicitArgPtr()) -
2552 LastExplicitArgOffset;
2553 AlignedForImplictArgs = true;
2554 }
2555 ArgOffset += ImplicitArgOffset;
2556 }
2557
2558 // Arg is preloaded into the previous SGPR.
2559 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2560 assert(InIdx >= 1 && "No previous SGPR");
2561 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2562 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2563 continue;
2564 }
2565
2566 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2567 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2568 // Check for free user SGPRs for preloading.
2569 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2570 InPreloadSequence = false;
2571 break;
2572 }
2573
2574 // Preload this argument.
2575 const TargetRegisterClass *RC =
2576 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2577 SmallVectorImpl<MCRegister> *PreloadRegs =
2578 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2579
// A multi-register preload is recorded as individual 32-bit SGPRs.
2580 if (PreloadRegs->size() > 1)
2581 RC = &AMDGPU::SGPR_32RegClass;
2582 for (auto &Reg : *PreloadRegs) {
2583 assert(Reg);
2584 MF.addLiveIn(Reg, RC);
2585 CCInfo.AllocateReg(Reg);
2586 }
2587
2588 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2589 }
2590 }
2591}
2592
2594 const SIRegisterInfo &TRI,
2595 SIMachineFunctionInfo &Info) const {
// Reserve a user SGPR carrying the LDS kernel ID when the function needs it.
2596 // Always allocate this last since it is a synthetic preload.
2597 if (Info.hasLDSKernelId()) {
2598 Register Reg = Info.addLDSKernelId();
2599 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2600 CCInfo.AllocateReg(Reg);
2601 }
2602}
2603
2604 // Allocate special input registers that are initialized per-wave.
2607 CallingConv::ID CallConv,
2608 bool IsShader) const {
2609 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2610 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2611 // Note: user SGPRs are handled by the front-end for graphics shaders
2612 // Pad up the used user SGPRs with dead inputs.
2613
2614 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2615 // before enabling architected SGPRs for workgroup IDs.
2616 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2617
2618 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2619 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2620 // rely on it to reach 16 since if we end up having no stack usage, it will
2621 // not really be added.
2622 unsigned NumRequiredSystemSGPRs =
2623 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2624 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2625 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2626 Register Reg = Info.addReservedUserSGPR();
2627 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2628 CCInfo.AllocateReg(Reg);
2629 }
2630 }
2631
// With architected SGPRs the workgroup IDs are not passed as system SGPRs
// (they presumably arrive in ttmp registers instead — compare
// getPreloadedValue's TTMP handling; confirm).
2632 if (!HasArchitectedSGPRs) {
2633 if (Info.hasWorkGroupIDX()) {
2634 Register Reg = Info.addWorkGroupIDX();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasWorkGroupIDY()) {
2640 Register Reg = Info.addWorkGroupIDY();
2641 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2642 CCInfo.AllocateReg(Reg);
2643 }
2644
2645 if (Info.hasWorkGroupIDZ()) {
2646 Register Reg = Info.addWorkGroupIDZ();
2647 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2648 CCInfo.AllocateReg(Reg);
2649 }
2650 }
2651
2652 if (Info.hasWorkGroupInfo()) {
2653 Register Reg = Info.addWorkGroupInfo();
2654 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2655 CCInfo.AllocateReg(Reg);
2656 }
2657
2658 if (Info.hasPrivateSegmentWaveByteOffset()) {
2659 // Scratch wave offset passed in system SGPR.
2660 unsigned PrivateSegmentWaveByteOffsetReg;
2661
2662 if (IsShader) {
2663 PrivateSegmentWaveByteOffsetReg =
2664 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2665
2666 // This is true if the scratch wave byte offset doesn't have a fixed
2667 // location.
2668 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2669 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2670 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2671 }
2672 } else
2673 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2674
2675 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2676 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2677 }
2678
2679 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2680 Info.getNumPreloadedSGPRs() >= 16);
2681}
2682
2684 MachineFunction &MF,
2685 const SIRegisterInfo &TRI,
2686 SIMachineFunctionInfo &Info) {
2687 // Now that we've figured out where the scratch register inputs are, see if
2688 // we should reserve the arguments and use them directly.
2689 MachineFrameInfo &MFI = MF.getFrameInfo();
2690 bool HasStackObjects = MFI.hasStackObjects();
2691 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2692
2693 // Record that we know we have non-spill stack objects so we don't need to
2694 // check all stack objects later.
2695 if (HasStackObjects)
2696 Info.setHasNonSpillStackObjects(true);
2697
2698 // Everything live out of a block is spilled with fast regalloc, so it's
2699 // almost certain that spilling will be required.
2700 if (TM.getOptLevel() == CodeGenOptLevel::None)
2701 HasStackObjects = true;
2702
2703 // For now assume stack access is needed in any callee functions, so we need
2704 // the scratch registers to pass in.
2705 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2706
2707 if (!ST.enableFlatScratch()) {
2708 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2709 // If we have stack objects, we unquestionably need the private buffer
2710 // resource. For the Code Object V2 ABI, this will be the first 4 user
2711 // SGPR inputs. We can reserve those and use them directly.
2712
2713 Register PrivateSegmentBufferReg =
2715 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2716 } else {
2717 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2718 // We tentatively reserve the last registers (skipping the last registers
2719 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2720 // we'll replace these with the ones immediately after those which were
2721 // really allocated. In the prologue copies will be inserted from the
2722 // argument to these reserved registers.
2723
2724 // Without HSA, relocations are used for the scratch pointer and the
2725 // buffer resource setup is always inserted in the prologue. Scratch wave
2726 // offset is still in an input SGPR.
2727 Info.setScratchRSrcReg(ReservedBufferReg);
2728 }
2729 }
2730
2732
2733 // For entry functions we have to set up the stack pointer if we use it,
2734 // whereas non-entry functions get this "for free". This means there is no
2735 // intrinsic advantage to using S32 over S34 in cases where we do not have
2736 // calls but do need a frame pointer (i.e. if we are requested to have one
2737 // because frame pointer elimination is disabled). To keep things simple we
2738 // only ever use S32 as the call ABI stack pointer, and so using it does not
2739 // imply we need a separate frame pointer.
2740 //
2741 // Try to use s32 as the SP, but move it if it would interfere with input
2742 // arguments. This won't work with calls though.
2743 //
2744 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2745 // registers.
2746 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2747 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2748 } else {
2750
2751 if (MFI.hasCalls())
2752 report_fatal_error("call in graphics shader with too many input SGPRs");
2753
// Fall back to the first SGPR that is not already an input.
2754 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2755 if (!MRI.isLiveIn(Reg)) {
2756 Info.setStackPtrOffsetReg(Reg);
2757 break;
2758 }
2759 }
2760
2761 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2762 report_fatal_error("failed to find register for SP");
2763 }
2764
2765 // hasFP should be accurate for entry functions even before the frame is
2766 // finalized, because it does not rely on the known stack size, only
2767 // properties like whether variable sized objects are present.
2768 if (ST.getFrameLowering()->hasFP(MF)) {
2769 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2770 }
2771}
2772
// Split-CSR (callee-saved registers saved via copies) applies only to
// callable functions, never to entry points.
2775 return !Info->isEntryFunction();
2776}
2777
2779
2781 MachineBasicBlock *Entry,
2782 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
// For each callee-saved register handled via copy: copy it into a fresh
// virtual register in the entry block, then copy it back immediately before
// each exit block's terminator.
2784
2785 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2786 if (!IStart)
2787 return;
2788
2789 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2790 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2791 MachineBasicBlock::iterator MBBI = Entry->begin();
2792 for (const MCPhysReg *I = IStart; *I; ++I) {
// Only 32- and 64-bit scalar CSRs are expected here.
2793 const TargetRegisterClass *RC = nullptr;
2794 if (AMDGPU::SReg_64RegClass.contains(*I))
2795 RC = &AMDGPU::SGPR_64RegClass;
2796 else if (AMDGPU::SReg_32RegClass.contains(*I))
2797 RC = &AMDGPU::SGPR_32RegClass;
2798 else
2799 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2800
2801 Register NewVR = MRI->createVirtualRegister(RC);
2802 // Create copy from CSR to a virtual register.
2803 Entry->addLiveIn(*I);
2804 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2805 .addReg(*I);
2806
2807 // Insert the copy-back instructions right before the terminator.
2808 for (auto *Exit : Exits)
2809 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2810 TII->get(TargetOpcode::COPY), *I)
2811 .addReg(NewVR);
2812 }
2813}
2814
2816 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2817 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2818 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2820
2822 const Function &Fn = MF.getFunction();
2825
// Graphics calling conventions are not supported on amdhsa; diagnose and
// bail out with the unmodified entry node.
2826 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2827 DiagnosticInfoUnsupported NoGraphicsHSA(
2828 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2829 DAG.getContext()->diagnose(NoGraphicsHSA);
2830 return DAG.getEntryNode();
2831 }
2832
2835 BitVector Skipped(Ins.size());
2836 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2837 *DAG.getContext());
2838
// Classify the calling convention once up front.
2839 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2840 bool IsKernel = AMDGPU::isKernel(CallConv);
2841 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2842
2843 if (IsGraphics) {
2844 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2845 assert(!UserSGPRInfo.hasDispatchPtr() &&
2846 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2847 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2848 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2849 (void)UserSGPRInfo;
2850 if (!Subtarget->enableFlatScratch())
2851 assert(!UserSGPRInfo.hasFlatScratchInit());
2852 if ((CallConv != CallingConv::AMDGPU_CS &&
2853 CallConv != CallingConv::AMDGPU_Gfx) ||
2854 !Subtarget->hasArchitectedSGPRs())
2855 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2856 !Info->hasWorkGroupIDZ());
2857 }
2858
2859 if (CallConv == CallingConv::AMDGPU_PS) {
2860 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2861
2862 // At least one interpolation mode must be enabled or else the GPU will
2863 // hang.
2864 //
2865 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2866 // set PSInputAddr, the user wants to enable some bits after the compilation
2867 // based on run-time states. Since we can't know what the final PSInputEna
2868 // will look like, so we shouldn't do anything here and the user should take
2869 // responsibility for the correct programming.
2870 //
2871 // Otherwise, the following restrictions apply:
2872 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2873 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2874 // enabled too.
2875 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2876 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2877 CCInfo.AllocateReg(AMDGPU::VGPR0);
2878 CCInfo.AllocateReg(AMDGPU::VGPR1);
2879 Info->markPSInputAllocated(0);
2880 Info->markPSInputEnabled(0);
2881 }
2882 if (Subtarget->isAmdPalOS()) {
2883 // For isAmdPalOS, the user does not enable some bits after compilation
2884 // based on run-time states; the register values being generated here are
2885 // the final ones set in hardware. Therefore we need to apply the
2886 // workaround to PSInputAddr and PSInputEnable together. (The case where
2887 // a bit is set in PSInputAddr but not PSInputEnable is where the
2888 // frontend set up an input arg for a particular interpolation mode, but
2889 // nothing uses that input arg. Really we should have an earlier pass
2890 // that removes such an arg.)
2891 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2892 if ((PsInputBits & 0x7F) == 0 ||
2893 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2894 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2895 }
2896 } else if (IsKernel) {
2897 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2898 } else {
2899 Splits.append(Ins.begin(), Ins.end());
2900 }
2901
2902 if (IsKernel)
2903 analyzeFormalArgumentsCompute(CCInfo, Ins);
2904
2905 if (IsEntryFunc) {
2906 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2907 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2908 if (IsKernel && Subtarget->hasKernargPreload())
2909 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2910
2911 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2912 } else if (!IsGraphics) {
2913 // For the fixed ABI, pass workitem IDs in the last argument register.
2914 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2915
2916 // FIXME: Sink this into allocateSpecialInputSGPRs
2917 if (!Subtarget->enableFlatScratch())
2918 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2919
2920 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2921 }
2922
2923 if (!IsKernel) {
2924 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2925 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2926 }
2927
2929
2930 // FIXME: This is the minimum kernel argument alignment. We should improve
2931 // this to the maximum alignment of the arguments.
2932 //
2933 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2934 // kern arg offset.
2935 const Align KernelArgBaseAlign = Align(16);
2936
2937 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2938 const ISD::InputArg &Arg = Ins[i];
2939 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2940 InVals.push_back(DAG.getUNDEF(Arg.VT));
2941 continue;
2942 }
2943
2944 CCValAssign &VA = ArgLocs[ArgIdx++];
2945 MVT VT = VA.getLocVT();
2946
// Kernel (entry) arguments live in the kernarg segment rather than in
// registers, unless they were preloaded into user SGPRs.
2947 if (IsEntryFunc && VA.isMemLoc()) {
2948 VT = Ins[i].VT;
2949 EVT MemVT = VA.getLocVT();
2950
2951 const uint64_t Offset = VA.getLocMemOffset();
2952 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2953
2954 if (Arg.Flags.isByRef()) {
2955 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2956
2957 const GCNTargetMachine &TM =
2958 static_cast<const GCNTargetMachine &>(getTargetMachine());
2959 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2960 Arg.Flags.getPointerAddrSpace())) {
2963 }
2964
2965 InVals.push_back(Ptr);
2966 continue;
2967 }
2968
2969 SDValue NewArg;
2970 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2971 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2972 // In this case the argument is packed into the previous preload SGPR.
2973 int64_t AlignDownOffset = alignDown(Offset, 4);
2974 int64_t OffsetDiff = Offset - AlignDownOffset;
2975 EVT IntVT = MemVT.changeTypeToInteger();
2976
2980 Register Reg =
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2982
2983 assert(Reg);
2984 Register VReg = MRI.getLiveInVirtReg(Reg);
2985 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2986
// Shift the packed value down to bit 0 before truncating to MemVT.
2987 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2988 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2989
2990 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2991 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2992 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2993 Ins[i].Flags.isSExt(), &Ins[i]);
2994
2995 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2996 } else {
3000 const SmallVectorImpl<MCRegister> &PreloadRegs =
3001 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3002
3003 SDValue Copy;
3004 if (PreloadRegs.size() == 1) {
3005 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3006 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3007 NewArg = DAG.getCopyFromReg(
3008 Chain, DL, VReg,
3010 TRI->getRegSizeInBits(*RC)));
3011
3012 } else {
3013 // If the kernarg alignment does not match the alignment of the SGPR
3014 // tuple RC that can accommodate this argument, it will be built up
3015 // via copies from the individual SGPRs that the argument was
3016 // preloaded to.
3018 for (auto Reg : PreloadRegs) {
3019 Register VReg = MRI.getLiveInVirtReg(Reg);
3020 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3021 Elts.push_back(Copy);
3022 }
3023 NewArg =
3024 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3025 PreloadRegs.size()),
3026 DL, Elts);
3027 }
3028
3029 // If the argument was preloaded to multiple consecutive 32-bit
3030 // registers because of misalignment between addressable SGPR tuples
3031 // and the argument size, we can still assume that because of kernarg
3032 // segment alignment restrictions that NewArg's size is the same as
3033 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3034 // truncate since we cannot preload to less than a single SGPR and the
3035 // MemVT may be smaller.
3036 EVT MemVTInt =
3038 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3039 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3040
3041 NewArg = DAG.getBitcast(MemVT, NewArg);
3042 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3043 Ins[i].Flags.isSExt(), &Ins[i]);
3044 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3045 }
3046 } else {
3047 // Hidden arguments that are in the kernel signature must be preloaded
3048 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3049 // the argument list and is not preloaded.
3050 if (Arg.isOrigArg()) {
3051 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3052 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3053 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3054 *OrigArg->getParent(),
3055 "hidden argument in kernel signature was not preloaded",
3056 DL.getDebugLoc());
3057 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3058 }
3059 }
3060
3061 NewArg =
3062 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3063 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3064 }
3065 Chains.push_back(NewArg.getValue(1));
3066
3067 auto *ParamTy =
3068 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3070 ParamTy &&
3071 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3072 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3073 // On SI local pointers are just offsets into LDS, so they are always
3074 // less than 16-bits. On CI and newer they could potentially be
3075 // real pointers, so we can't guarantee their size.
3076 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3077 DAG.getValueType(MVT::i16));
3078 }
3079
3080 InVals.push_back(NewArg);
3081 continue;
3082 }
// Non-entry functions take stack-located arguments from the caller's frame.
3083 if (!IsEntryFunc && VA.isMemLoc()) {
3084 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3085 InVals.push_back(Val);
3086 if (!Arg.Flags.isByVal())
3087 Chains.push_back(Val.getValue(1));
3088 continue;
3089 }
3090
3091 assert(VA.isRegLoc() && "Parameter must be in a register!");
3092
3093 Register Reg = VA.getLocReg();
3094 const TargetRegisterClass *RC = nullptr;
3095 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3096 RC = &AMDGPU::VGPR_32RegClass;
3097 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3098 RC = &AMDGPU::SGPR_32RegClass;
3099 else
3100 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3101 EVT ValVT = VA.getValVT();
3102
3103 Reg = MF.addLiveIn(Reg, RC);
3104 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3105
3106 if (Arg.Flags.isSRet()) {
3107 // The return object should be reasonably addressable.
3108
3109 // FIXME: This helps when the return is a real sret. If it is an
3110 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3111 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3112 unsigned NumBits =
3114 Val = DAG.getNode(
3115 ISD::AssertZext, DL, VT, Val,
3116 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3117 }
3118
3119 // If this is an 8 or 16-bit value, it is really passed promoted
3120 // to 32 bits. Insert an assert[sz]ext to capture this, then
3121 // truncate to the right size.
3122 switch (VA.getLocInfo()) {
3123 case CCValAssign::Full:
3124 break;
3125 case CCValAssign::BCvt:
3126 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3127 break;
3128 case CCValAssign::SExt:
3129 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3130 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3131 break;
3132 case CCValAssign::ZExt:
3133 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3134 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3135 break;
3136 case CCValAssign::AExt:
3137 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3138 break;
3139 default:
3140 llvm_unreachable("Unknown loc info!");
3141 }
3142
3143 InVals.push_back(Val);
3144 }
3145
3146 // Start adding system SGPRs.
3147 if (IsEntryFunc)
3148 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3149
3150 // DAG.getPass() returns nullptr when using new pass manager.
3151 // TODO: Use DAG.getMFAM() to access analysis result.
3152 if (DAG.getPass()) {
3153 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3154 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3155 }
3156
3157 unsigned StackArgSize = CCInfo.getStackSize();
3158 Info->setBytesInStackArgArea(StackArgSize);
3159
3160 return Chains.empty() ? Chain
3161 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3162}
3163
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
// Decide whether this function's return values can all be lowered in
// registers. Returning false makes SelectionDAG demote the return to an
// sret-style stack return instead.
    CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
    const Type *RetTy) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  // Run the return calling convention over Outs; any value the CC cannot
  // place forces the stack-return path.
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
    return false;

  // We must use the stack if return would require unavailable registers.
  // The CC may have assigned VGPRs above this function's VGPR budget.
  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
      return false;

  return true;
}
3190
// Lower an IR 'ret' for SI. Kernels delegate to the common AMDGPU path;
// shader and callable-function returns copy each return value into the
// physical register chosen by the return calling convention and emit either
// an end-of-wave or a normal return node.
SDValue
                              bool isVarArg,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {

  if (AMDGPU::isKernel(CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  // A void-returning shader ends the wave rather than returning to a caller.
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Glue;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    // Promote/bitcast the value to its assigned location type before the
    // copy into the physical return register.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    // Chain and glue the copies together so they stay adjacent to the return.
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  // Non-entry functions also list callee-saved registers restored via copies
  // as implicit return operands.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
        TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Glue);

  // NOTE(review): the line choosing the non-ENDPGM return opcode is not
  // visible in this excerpt — confirm against the full file.
  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
3285
// Lower the result values of a call: copy each returned value out of the
// physical register assigned by the return calling convention, undo any
// promotion the CC applied, and append the results to InVals.
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (CCValAssign VA : RVLocs) {
    SDValue Val;

    if (VA.isRegLoc()) {
      // Thread the chain and glue through each copy so the copies stay
      // attached to the call node.
      Val =
          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
    } else if (VA.isMemLoc()) {
      report_fatal_error("TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    // Undo the CC's promotion: assert the known extension, then truncate
    // back to the original value type.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}
3341
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
// For each implicit ABI input the callee may use (dispatch ptr, queue ptr,
// workgroup IDs, workitem IDs, ...), forward the caller's incoming value —
// or a synthesized/undef value — into the callee's expected register or
// stack slot.
    CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CB)
    return;

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  const Function &F = DAG.getMachineFunction().getFunction();

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // Prefer the callee's precise argument-usage info when it is available
  // from the analysis; otherwise the fixed-ABI default is used.
  const AMDGPUFunctionArgInfo *CalleeArgInfo =
  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
    // DAG.getPass() returns nullptr when using new pass manager.
    // TODO: Use DAG.getMFAM() to access analysis result.
    if (DAG.getPass()) {
      auto &ArgUsageInfo =
      CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    }
  }

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  // Each implicit input is paired with the "amdgpu-no-*" attribute that, when
  // present on the call site, proves the callee does not need it.
  // clang-format off
  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
    {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
    {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
    {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
    {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
    {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
  };
  // clang-format on

  for (auto [InputID, Attr] : ImplicitAttrs) {
    // If the callee does not use the attribute value, skip copying the value.
    if (CLI.CB->hasFnAttr(Attr))
      continue;

    const auto [OutgoingArg, ArgRC, ArgTy] =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const auto [IncomingArg, IncomingArgRC, Ty] =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      InputReg = getImplicitArgPtr(DAG, DL);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      std::optional<uint32_t> Id =
      if (Id.has_value()) {
        InputReg = DAG.getConstant(*Id, DL, ArgVT);
      } else {
        InputReg = DAG.getUNDEF(ArgVT);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI is
      // requiring it. We just need to allocate the register appropriately.
      InputReg = DAG.getUNDEF(ArgVT);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      // The callee expects this input on the stack instead of in a register.
      unsigned SpecialArgOffset =
          CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
      SDValue ArgStore =
          storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
      MemOpChains.push_back(ArgStore);
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.

  auto [OutgoingArg, ArgRC, Ty] =
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, Ty) =
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, Ty) =
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX = std::get<0>(
  const ArgDescriptor *IncomingArgY = std::get<0>(
  const ArgDescriptor *IncomingArgZ = std::get<0>(

  SDValue InputReg;
  SDLoc SL;

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  // If incoming ids are not packed we need to pack them.
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
      NeedWorkItemIDX) {
    // A max workitem ID of 0 means the X id is a known constant zero.
    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
    } else {
      InputReg = DAG.getConstant(0, DL, MVT::i32);
    }
  }

  // Y occupies bits [19:10] of the packed id register.
  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
    InputReg = InputReg.getNode()
                   ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
                   : Y;
  }

  // Z occupies bits [29:20] of the packed id register.
  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
    InputReg = InputReg.getNode()
                   ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
                   : Z;
  }

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      InputReg = DAG.getUNDEF(MVT::i32);
    } else {
      // Workitem ids are already packed, any of present incoming arguments
      // will carry all required fields.
      ArgDescriptor IncomingArg =
          ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
                                   : IncomingArgY ? *IncomingArgY
                                                  : *IncomingArgZ,
                                   ~0u);
      InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

    CCInfo.AllocateReg(OutgoingArg->getRegister());
  } else {
    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
    if (InputReg) {
      SDValue ArgStore =
          storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
      MemOpChains.push_back(ArgStore);
    }
  }
}
3531
// Guaranteed tail-call optimization is only possible for the 'fast'
// calling convention.
  return CC == CallingConv::Fast;
}
3535
/// Return true if we might ever do TCO for calls with this calling convention.
  switch (CC) {
  case CallingConv::C:
    return true;
  default:
    // Anything else is only tail-callable when guaranteed TCO applies.
    return canGuaranteeTCO(CC);
  }
}
3546
// Decide whether this call site may be emitted as a tail call: the target
// must be uniform, the calling conventions and preserved-register sets must
// be compatible, and the outgoing arguments must fit the caller's existing
// incoming-argument area without requiring divergent values in SGPRs.
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  // Chain calls are always lowered as tail calls.
  if (AMDGPU::isChainCC(CalleeCC))
    return true;

  if (!mayTailCallThisCC(CalleeCC))
    return false;

  // For a divergent call target, we need to do a waterfall loop over the
  // possible callees which precludes us from using a simple jump.
  if (Callee->isDivergent())
    return false;

  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  // byval caller arguments live in the caller's stack area the callee would
  // reuse, so reject them conservatively.
  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  // FIXME: We are not allocating special input registers, so we will be
  // deciding based on incorrect register assignments.
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    // FIXME: What about inreg arguments that end up passed in memory?
    if (!CCVA.isRegLoc())
      continue;

    // If we are passing an argument in an SGPR, and the value is divergent,
    // this call requires a waterfall loop.
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      LLVM_DEBUG(
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
                 << printReg(CCVA.getLocReg(), TRI) << '\n');
      return false;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}
3642
// IR-level query: only calls explicitly marked 'tail' are candidates for
// tail-call emission; the enclosing function is also checked before
// accepting.
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
    return false;
  return true;
}
3652
// The wave scratch offset register is used as the global base pointer.
// Lower an outgoing call: pop the EXEC operand for chain calls, run the
// calling convention over the user arguments, pass the implicit special
// inputs, copy register arguments (readfirstlane'ing uniform SGPR args),
// store stack arguments, and finally emit either a TC_RETURN (tail call)
// or a CALL node followed by result copies.
                                    SmallVectorImpl<SDValue> &InVals) const {
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsChainCallConv = AMDGPU::isChainCC(CallConv);

  SelectionDAG &DAG = CLI.DAG;

  TargetLowering::ArgListEntry RequestedExec;
  if (IsChainCallConv) {
    // The last argument should be the value that we need to put in EXEC.
    // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
    // don't treat it like the rest of the arguments.
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
      return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();
    CLI.OutVals.pop_back();

    // A 64-bit EXEC mask is split into two 32-bit pieces; pop both.
    if (RequestedExec.Ty->isIntegerTy(64)) {
      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();
      CLI.OutVals.pop_back();
    }

    assert(CLI.Outs.back().OrigArgIndex != 2 &&
           "Haven't popped all the pieces of the EXEC mask");
  }

  const SDLoc &DL = CLI.DL;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;

  // A call to an undef/null target cannot execute; drop it, producing undef
  // results for any expected return values.
  if (Callee.isUndef() || isNullConstant(Callee)) {
    if (!CLI.IsTailCall) {
      for (ISD::InputArg &Arg : CLI.Ins)
        InVals.push_back(DAG.getUNDEF(Arg.VT));
    }

    return Chain;
  }

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call to variadic function ");
  }

  if (!CLI.CB)
    report_fatal_error("unsupported libcall legalization");

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported required tail call to function ");
  }

  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
                                                   Outs, OutVals, Ins, DAG);
    // musttail and chain calls have no non-tail fallback; failing here is
    // fatal.
    if (!IsTailCall &&
        ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail or on llvm.amdgcn.cs.chain");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  SmallVector<SDValue, 8> MemOpChains;

  // Analyze operands of the call, assigning locations to each operand.
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
  }

  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getStackSize();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

  if (!IsSibCall || IsChainCallConv) {
    // Without flat scratch, the callee receives the scratch resource
    // descriptor in a fixed SGPR quad.
    if (!Subtarget->enableFlatScratch()) {
      SmallVector<SDValue, 4> CopyFromChains;

      // In the HSA case, this should be an identity copy.
      SDValue ScratchRSrcReg =
          DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
                              ScratchRSrcReg);
      CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
      Chain = DAG.getTokenFactor(DL, CopyFromChains);
    }
  }

  // Entries in RegsToPass before this index are ABI special inputs; entries
  // added below are user arguments.
  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[i];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
      MaybeAlign Alignment;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                                          : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Alignment =
            Flags.isByVal()
                ? Flags.getNonZeroByValAlign()
                : commonAlignment(Subtarget->getStackAlignment(), Offset);

        // Tail-call stores land in the caller's fixed incoming-argument
        // slots, offset by FPDiff.
        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
      } else {
        // Stores to the argument stack area are relative to the stack pointer.
        SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
                                        MVT::i32);
        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Alignment =
            commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
      }

      // byval arguments are copied wholesale into the outgoing slot.
      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy =
            DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
                          Outs[i].Flags.getNonZeroByValAlign(),
                          /*isVol = */ false, /*AlwaysInline = */ true,
                          /*CI=*/nullptr, std::nullopt, DstInfo,

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store =
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);

  SDValue TokenGlue;
  if (CLI.ConvergenceControlToken) {
    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
      // For chain calls, the inreg arguments are required to be
      // uniform. Speculatively Insert a readfirstlane in case we cannot prove
      // they are uniform.
      //
      // For other calls, if an inreg arguments is known to be uniform,
      // speculatively insert a readfirstlane in case it is in a VGPR.
      //
      // FIXME: We need to execute this in a waterfall loop if it is a divergent
      // value, so let that continue to produce invalid code.

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(TokenGlue);
                          ReadfirstlaneArgs);
    }

    Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
    InGlue = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
    InGlue = Chain.getValue(1);
  }

  std::vector<SDValue> Ops({Chain});

  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = GSD->getGlobal();
    Ops.push_back(Callee);
    Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
  } else {
    if (IsTailCall) {
      // isEligibleForTailCallOptimization considered whether the call target is
      // divergent, but we may still end up with a uniform value in a VGPR.
      // Insert a readfirstlane just in case.
      SDValue ReadFirstLaneID =
          DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
      Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
                           ReadfirstlaneArgs);
    }

    Ops.push_back(Callee);
    Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
  }

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
  }

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &[Reg, Val] : RegsToPass)
    Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  // Fold the convergence token (and any existing glue) into a single glue
  // operand for the call.
  if (SDValue Token = CLI.ConvergenceControlToken) {
    GlueOps.push_back(Token);
    if (InGlue)
      GlueOps.push_back(InGlue);

    InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
                                        MVT::Glue, GlueOps),
                     0);
  }

  if (InGlue)
    Ops.push_back(InGlue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    unsigned OPC = AMDGPUISD::TC_RETURN;
    switch (CallConv) {
      break;
      break;
    }

    return DAG.getNode(OPC, DL, MVT::Other, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
  if (!Ins.empty())
    InGlue = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, /*IsThisReturn=*/false, SDValue());
}
4038
4039// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4040// except for:
4041// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4042// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
//
// Produces two results: the (possibly realigned) old stack pointer, which is
// the base address of the new allocation, and the chain coming out of
// CALLSEQ_END. The SP register itself is advanced past the wave-scaled size.
// The whole sequence is bracketed by CALLSEQ_START/CALLSEQ_END so nothing
// else reorders around the SP update.
4044 SelectionDAG &DAG) const {
4045 const MachineFunction &MF = DAG.getMachineFunction();
4047
4048 SDLoc dl(Op);
4049 EVT VT = Op.getValueType();
4050 SDValue Chain = Op.getOperand(0);
4051 Register SPReg = Info->getStackPtrOffsetReg();
4052
4053 // Chain the dynamic stack allocation so that it doesn't modify the stack
4054 // pointer when other instructions are using the stack.
4055 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4056
4057 SDValue Size = Op.getOperand(1);
4058 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4059 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4060
4061 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4063 "Stack grows upwards for AMDGPU");
4064
4065 Chain = BaseAddr.getValue(1);
4066 Align StackAlign = TFL->getStackAlign();
4067 if (Alignment > StackAlign) {
// Over-aligned alloca: round SP up to the requested alignment. The
// alignment is shifted by the wavefront size log2, mirroring the wave-size
// scaling applied to the allocation size below.
4068 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4069 << Subtarget->getWavefrontSizeLog2();
4070 uint64_t StackAlignMask = ScaledAlignment - 1;
// SP = (SP + mask) & -scaled_alignment, i.e. round up to the boundary.
4071 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4072 DAG.getConstant(StackAlignMask, dl, VT));
4073 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4074 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4075 }
4076
4077 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4078 SDValue NewSP;
4079 if (isa<ConstantSDNode>(Size)) {
4080 // For constant sized alloca, scale alloca size by wave-size
4081 SDValue ScaledSize = DAG.getNode(
4082 ISD::SHL, dl, VT, Size,
4083 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4084 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4085 } else {
4086 // For dynamic sized alloca, perform wave-wide reduction to get max of
4087 // alloca size(divergent) and then scale it by wave-size
4088 SDValue WaveReduction =
4089 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4090 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4091 Size, DAG.getConstant(0, dl, MVT::i32));
4092 SDValue ScaledSize = DAG.getNode(
4093 ISD::SHL, dl, VT, Size,
4094 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4095 NewSP =
4096 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
// The add above happened in a VGPR; readfirstlane moves the (now uniform,
// thanks to the wave_reduce_umax) result back to a scalar register before
// it is copied into the SP register.
4097 SDValue ReadFirstLaneID =
4098 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4099 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4100 NewSP);
4101 }
4102
4103 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4104 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4105
4106 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4107}
4108
// Lowers a stack-save style query: copy the stack pointer out and convert it
// from the wave-uniform scalar form to the swizzled per-lane vector address
// via AMDGPUISD::WAVE_ADDRESS. Result 0 is the address, result 1 the chain.
4110 if (Op.getValueType() != MVT::i32)
4111 return Op; // Defer to cannot select error.
4113
4114 SDLoc SL(Op);
4115
4116 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4117
4118 // Convert from wave uniform to swizzled vector address. This should protect
4119 // from any edge cases where the stacksave result isn't directly used with
4120 // stackrestore.
4121 SDValue VectorAddress =
4122 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4123 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4124}
4125
// Reads the MODE register's combined f32/f64-f16 rounding field with
// s_getreg and maps the raw 4-bit hardware value to the FLT_ROUNDS-style
// result through a 64-bit constant lookup table (4 bits per entry).
// Result 0 is the rounding-mode value, result 1 the chain from the getreg.
4127 SelectionDAG &DAG) const {
4128 SDLoc SL(Op);
4129 assert(Op.getValueType() == MVT::i32);
4130
4131 uint32_t BothRoundHwReg =
4133 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4134
4135 SDValue IntrinID =
4136 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4137 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4138 Op.getOperand(0), IntrinID, GetRoundBothImm);
4139
4140 // There are two rounding modes, one for f32 and one for f64/f16. We only
4141 // report in the standard value range if both are the same.
4142 //
4143 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4144 // ties away from zero is not supported, and the other values are rotated by
4145 // 1.
4146 //
4147 // If the two rounding modes are not the same, report a target defined value.
4148
4149 // Mode register rounding mode fields:
4150 //
4151 // [1:0] Single-precision round mode.
4152 // [3:2] Double/Half-precision round mode.
4153 //
4154 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4155 //
4156 // Hardware Spec
4157 // Toward-0 3 0
4158 // Nearest Even 0 1
4159 // +Inf 1 2
4160 // -Inf 2 3
4161 // NearestAway0 N/A 4
4162 //
4163 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4164 // table we can index by the raw hardware mode.
4165 //
4166 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4167
4168 SDValue BitTable =
4170
// Each table entry is 4 bits wide, so the bit offset is the raw mode * 4.
4171 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4172 SDValue RoundModeTimesNumBits =
4173 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4174
4175 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4176 // knew only one mode was demanded.
4177 SDValue TableValue =
4178 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4179 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4180
4181 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4182 SDValue TableEntry =
4183 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4184
4185 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4186 // if it's an extended value.
4187 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4188 SDValue IsStandardValue =
4189 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4190 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4191 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4192 TableEntry, EnumOffset);
4193
4194 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4195}
4196
// Sets both hardware rounding-mode fields from an i32 FLT_ROUNDS-style
// value: maps the value to the 4-bit MODE.fp_round encoding (constant-folded
// when possible, otherwise via a shifted lookup table) and writes it with
// s_setreg. Returns the chain of the setreg.
4198 SelectionDAG &DAG) const {
4199 SDLoc SL(Op);
4200
4201 SDValue NewMode = Op.getOperand(1);
4202 assert(NewMode.getValueType() == MVT::i32);
4203
4204 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4205 // hardware MODE.fp_round values.
4206 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
// Constant input: clamp and translate at compile time.
4207 uint32_t ClampedVal = std::min(
4208 static_cast<uint32_t>(ConstMode->getZExtValue()),
4210 NewMode = DAG.getConstant(
4211 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4212 } else {
4213 // If we know the input can only be one of the supported standard modes in
4214 // the range 0-3, we can use a simplified mapping to hardware values.
4215 KnownBits KB = DAG.computeKnownBits(NewMode);
// >= 30 leading zero bits means the value is provably < 4.
4216 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4217 // The supported standard values are 0-3. The extended values start at 8. We
4218 // need to offset by 4 if the value is in the extended range.
4219
4220 if (UseReducedTable) {
4221 // Truncate to the low 32-bits.
// Note: only the low 16 bits are actually kept (0xffff) — 4 standard
// modes x 4 bits per table entry.
4222 SDValue BitTable = DAG.getConstant(
4223 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4224
4225 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4226 SDValue RoundModeTimesNumBits =
4227 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4228
4229 NewMode =
4230 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4231
4232 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4233 // the table extracted bits into inline immediates.
4234 } else {
4235 // table_index = umin(value, value - 4)
4236 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4237 SDValue BitTable =
4239
// The umin(value, value - 4) trick folds the extended range (>= 8) down
// next to the standard range; unsigned wrap handles values < 4.
4240 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4241 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4242 SDValue IndexVal =
4243 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4244
4245 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4246 SDValue RoundModeTimesNumBits =
4247 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4248
4249 SDValue TableValue =
4250 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4251 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4252
4253 // No need to mask out the high bits since the setreg will ignore them
4254 // anyway.
4255 NewMode = TruncTable;
4256 }
4257
4258 // Insert a readfirstlane in case the value is a VGPR. We could do this
4259 // earlier and keep more operations scalar, but that interferes with
4260 // combining the source.
4261 SDValue ReadFirstLaneID =
4262 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4263 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4264 ReadFirstLaneID, NewMode);
4265 }
4266
4267 // N.B. The setreg will be later folded into s_round_mode on supported
4268 // targets.
4269 SDValue IntrinID =
4270 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4271 uint32_t BothRoundHwReg =
4273 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4274
4275 SDValue SetReg =
4276 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4277 IntrinID, RoundBothImm, NewMode);
4278
4279 return SetReg;
4280}
4281
// Keeps the memory operation only when its pointer is uniform and in one of
// the accepted address spaces; otherwise returns SDValue() so the generic
// handling drops/expands it.
// NOTE(review): the defining line and the address-space case labels sit on
// lines not visible in this listing — confirm which spaces are accepted.
4283 if (Op->isDivergent())
4284 return SDValue();
4285
4286 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4291 break;
4292 default:
4293 return SDValue();
4294 }
4295
4296 return Op;
4297}
4298
4299// Work around DAG legality rules only based on the result type.
//
// For (strict_)fp_extend from bf16: bitcast the source to the matching
// integer type and emit BF16_TO_FP instead. Non-bf16 sources pass through
// untouched. The strict variant is not implemented yet (unreachable).
4301 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4302 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4303 EVT SrcVT = Src.getValueType();
4304
4305 if (SrcVT.getScalarType() != MVT::bf16)
4306 return Op;
4307
4308 SDLoc SL(Op);
// Reinterpret the bf16 (or vector of bf16) payload as integers so the
// conversion node can consume it.
4309 SDValue BitCast =
4310 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4311
4312 EVT DstVT = Op.getValueType();
4313 if (IsStrict)
4314 llvm_unreachable("Need STRICT_BF16_TO_FP");
4315
4316 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4317}
4318
// Reads the FP environment as an i64: two s_getreg reads (a MODE field and a
// TRAP field, encodings on lines not visible here) are packed into a v2i32
// with the mode word in element 0, then bitcast to i64. Result 0 is the
// packed value, result 1 a TokenFactor of both read chains.
4320 SDLoc SL(Op);
4321 if (Op.getValueType() != MVT::i64)
4322 return Op;
4323
4324 uint32_t ModeHwReg =
4326 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4327 uint32_t TrapHwReg =
4329 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4330
4331 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4332 SDValue IntrinID =
4333 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4334 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4335 Op.getOperand(0), IntrinID, ModeHwRegImm);
4336 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4337 Op.getOperand(0), IntrinID, TrapHwRegImm);
// Both reads hang off the same incoming chain; join their output chains.
4338 SDValue TokenReg =
4339 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4340 GetTrapReg.getValue(1));
4341
4342 SDValue CvtPtr =
4343 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4344 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4345
4346 return DAG.getMergeValues({Result, TokenReg}, SL);
4347}
4348
// Writes the FP environment from an i64: split it into two i32 halves
// (element 0 = mode word, element 1 = trap word, matching the packing in the
// GET path above in the original file), force each into an SGPR with
// readfirstlane, and write them back with two s_setreg calls. Returns a
// TokenFactor joining both setreg chains.
4350 SDLoc SL(Op);
4351 if (Op.getOperand(1).getValueType() != MVT::i64)
4352 return Op;
4353
4354 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4355 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4356 DAG.getConstant(0, SL, MVT::i32));
4357 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4358 DAG.getConstant(1, SL, MVT::i32));
4359
// s_setreg needs scalar operands; readfirstlane covers the case where the
// extracted values live in VGPRs.
4360 SDValue ReadFirstLaneID =
4361 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4362 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4363 ReadFirstLaneID, NewModeReg);
4364 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4365 ReadFirstLaneID, NewTrapReg);
4366
4367 unsigned ModeHwReg =
4369 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4370 unsigned TrapHwReg =
4372 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4373
4374 SDValue IntrinID =
4375 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
// Both writes use the incoming chain; ordering between them is left to the
// final TokenFactor.
4376 SDValue SetModeReg =
4377 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4378 IntrinID, ModeHwRegImm, NewModeReg);
4379 SDValue SetTrapReg =
4380 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4381 IntrinID, TrapHwRegImm, NewTrapReg);
4382 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4383}
4384
// Resolves a named physical register (for read_register/write_register style
// queries). Rejects unknown names, flat_scratch registers on subtargets
// without them, and name/type-size mismatches — all via report_fatal_error
// (the calls themselves sit on lines stripped from this listing).
4386 const MachineFunction &MF) const {
4388 .Case("m0", AMDGPU::M0)
4389 .Case("exec", AMDGPU::EXEC)
4390 .Case("exec_lo", AMDGPU::EXEC_LO)
4391 .Case("exec_hi", AMDGPU::EXEC_HI)
4392 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4393 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4394 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4395 .Default(Register());
4396
4397 if (Reg == AMDGPU::NoRegister) {
4399 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4400 }
4401
4402 if (!Subtarget->hasFlatScrRegister() &&
4403 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4404 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4405 "\" for subtarget."));
4406 }
4407
// The 32-bit halves only match 32-bit requests; the full exec/flat_scratch
// pairs only match 64-bit requests.
4408 switch (Reg) {
4409 case AMDGPU::M0:
4410 case AMDGPU::EXEC_LO:
4411 case AMDGPU::EXEC_HI:
4412 case AMDGPU::FLAT_SCR_LO:
4413 case AMDGPU::FLAT_SCR_HI:
4414 if (VT.getSizeInBits() == 32)
4415 return Reg;
4416 break;
4417 case AMDGPU::EXEC:
4418 case AMDGPU::FLAT_SCR:
4419 if (VT.getSizeInBits() == 64)
4420 return Reg;
4421 break;
4422 default:
4423 llvm_unreachable("missing register type checking");
4424 }
4425
4427 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4428}
4429
4430// If kill is not the last instruction, split the block so kill is always a
4431// proper terminator.
//
// Splits the block right at \p MI and rewrites the pseudo's descriptor to
// its terminator form; returns the newly created successor block.
4434 MachineBasicBlock *BB) const {
4435 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4437 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4438 return SplitBB;
4439}
4440
4441// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4442// \p MI will be the only instruction in the loop body block. Otherwise, it will
4443// be the first instruction in the remainder block.
4444//
4445/// \returns { LoopBody, Remainder }
4446static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4450
4451 // To insert the loop we need to split the block. Move everything after this
4452 // point to a new block, and insert a new empty block between the two.
4454 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4456 ++MBBI;
4457
4458 MF->insert(MBBI, LoopBB);
4459 MF->insert(MBBI, RemainderBB);
4460
// The loop body branches back to itself (the back edge) and falls through
// to the remainder when done.
4461 LoopBB->addSuccessor(LoopBB);
4462 LoopBB->addSuccessor(RemainderBB);
4463
4464 // Move the rest of the block into a new block.
4465 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4466
4467 if (InstInLoop) {
4468 auto Next = std::next(I);
4469
4470 // Move instruction to loop body.
4471 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4472
4473 // Move the rest of the block.
4474 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4475 } else {
4476 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4477 }
4478
4479 MBB.addSuccessor(LoopBB);
4480
4481 return std::pair(LoopBB, RemainderBB);
4482}
4483
4484/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
///
/// The S_WAITCNT is built right after \p MI and the [MI, waitcnt] pair is
/// then finalized into a single bundle so later passes treat them as a unit.
4486 MachineBasicBlock *MBB = MI.getParent();
4488 auto I = MI.getIterator();
4489 auto E = std::next(I);
4490
4491 // clang-format off
4492 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4493 .addImm(0);
4494 // clang-format on
4495
4496 MIBundleBuilder Bundler(*MBB, I, E);
4497 finalizeBundle(*MBB, Bundler.begin());
4498}
4499
// Emits a retry loop around \p MI: each iteration clears TRAP_STS.MEM_VIOL
// before the instruction, re-reads it afterwards, and branches back to the
// loop head while the bit reads non-zero. Returns the remainder block that
// follows the loop.
4502 MachineBasicBlock *BB) const {
4503 const DebugLoc &DL = MI.getDebugLoc();
4504
4506
4508
4509 // Apparently kill flags are only valid if the def is in the same block?
4510 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4511 Src->setIsKill(false);
4512
// MI becomes the sole instruction of the loop body block.
4513 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4514
4515 MachineBasicBlock::iterator I = LoopBB->end();
4516
4517 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4519
4520 // Clear TRAP_STS.MEM_VIOL
4521 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4522 .addImm(0)
4523 .addImm(EncodedReg);
4524
4526
4527 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4528
4529 // Load and check TRAP_STS.MEM_VIOL
4530 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4531 .addImm(EncodedReg)
4532
4533 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4534 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4535 .addReg(Reg, RegState::Kill)
4536 .addImm(0);
4537 // clang-format off
4538 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4539 .addMBB(LoopBB);
4540 // clang-format on
4541
4542 return RemainderBB;
4543}
4544
4545// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4546// wavefront. If the value is uniform and just happens to be in a VGPR, this
4547// will only do one iteration. In the worst case, this will loop 64 times.
4548//
4549// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
//
// Classic "waterfall" structure: readfirstlane one candidate index, mask
// EXEC down to the lanes that match it, run the body for those lanes, then
// xor the handled lanes out of EXEC and branch back while any remain.
// Returns the insertion point (at the terminator) for the loop body.
4552 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4553 const DebugLoc &DL, const MachineOperand &Idx,
4554 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4555 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4556 Register &SGPRIdxReg) {
4557
4558 MachineFunction *MF = OrigBB.getParent();
4559 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4560 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4562
4563 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4564 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4565 Register NewExec = MRI.createVirtualRegister(BoolRC);
4566 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4567 Register CondReg = MRI.createVirtualRegister(BoolRC);
4568
// Loop-carried value: initial value from the preheader, partial result
// from the previous iteration.
4569 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4570 .addReg(InitReg)
4571 .addMBB(&OrigBB)
4572 .addReg(ResultReg)
4573 .addMBB(&LoopBB);
4574
// Loop-carried exec mask of lanes still to be processed.
4575 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4576 .addReg(InitSaveExecReg)
4577 .addMBB(&OrigBB)
4578 .addReg(NewExec)
4579 .addMBB(&LoopBB);
4580
4581 // Read the next variant <- also loop target.
4582 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4583 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4584
4585 // Compare the just read M0 value to all possible Idx values.
4586 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4587 .addReg(CurrentIdxReg)
4588 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4589
4590 // Update EXEC, save the original EXEC value to VCC.
4591 BuildMI(LoopBB, I, DL,
4592 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4593 : AMDGPU::S_AND_SAVEEXEC_B64),
4594 NewExec)
4595 .addReg(CondReg, RegState::Kill);
4596
4597 MRI.setSimpleHint(NewExec, CondReg);
4598
4599 if (UseGPRIdxMode) {
// GPR-index mode: hand the (possibly offset) scalar index back to the
// caller instead of writing M0.
4600 if (Offset == 0) {
4601 SGPRIdxReg = CurrentIdxReg;
4602 } else {
4603 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4604 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4605 .addReg(CurrentIdxReg, RegState::Kill)
4606 .addImm(Offset);
4607 }
4608 } else {
4609 // Move index from VCC into M0
4610 if (Offset == 0) {
4611 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4612 .addReg(CurrentIdxReg, RegState::Kill);
4613 } else {
4614 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4615 .addReg(CurrentIdxReg, RegState::Kill)
4616 .addImm(Offset);
4617 }
4618 }
4619
4620 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4621 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4622 MachineInstr *InsertPt =
4623 BuildMI(LoopBB, I, DL,
4624 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4625 : AMDGPU::S_XOR_B64_term),
4626 Exec)
4627 .addReg(Exec)
4628 .addReg(NewExec);
4629
4630 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4631 // s_cbranch_scc0?
4632
4633 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4634 // clang-format off
4635 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4636 .addMBB(&LoopBB);
4637 // clang-format on
4638
4639 return InsertPt->getIterator();
4640}
4641
4642// This has slightly sub-optimal regalloc when the source vector is killed by
4643// the read. The register allocator does not understand that the kill is
4644// per-workitem, so is kept alive for the whole loop so we end up not re-using a
4645// subregister from it, using 1 more VGPR than necessary. This was saved when
4646// this was expanded after register allocation.
//
// Sets up the waterfall loop around \p MI: saves EXEC, splits the block,
// emits the loop via emitLoadM0FromVGPRLoop, and appends a landing-pad
// block that restores the saved EXEC mask before the remainder runs.
// Returns the insertion point inside the loop body.
4649 unsigned InitResultReg, unsigned PhiReg, int Offset,
4650 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4652 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4653 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4655 const DebugLoc &DL = MI.getDebugLoc();
4657
4658 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4659 Register DstReg = MI.getOperand(0).getReg();
4660 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4661 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4662 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4663 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4664
4665 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4666
4667 // Save the EXEC mask
4668 // clang-format off
4669 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4670 .addReg(Exec);
4671 // clang-format on
4672
4673 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4674
4675 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4676
4677 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4678 InitResultReg, DstReg, PhiReg, TmpExec,
4679 Offset, UseGPRIdxMode, SGPRIdxReg);
4680
// Insert a landing pad between the loop and the remainder so EXEC is
// restored on the single loop exit edge.
4681 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4683 ++MBBI;
4684 MF->insert(MBBI, LandingPad);
4685 LoopBB->removeSuccessor(RemainderBB);
4686 LandingPad->addSuccessor(RemainderBB);
4687 LoopBB->addSuccessor(LandingPad);
4688 MachineBasicBlock::iterator First = LandingPad->begin();
4689 // clang-format off
4690 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4691 .addReg(SaveExec);
4692 // clang-format on
4693
4694 return InsPt;
4695}
4696
4697// Returns subreg index, offset
//
// For an in-range constant offset the dynamic part is folded away entirely:
// the matching 32-bit channel subregister is returned with a zero residual
// offset. Out-of-range (or negative) offsets keep sub0 and leave the offset
// to be applied dynamically.
4698static std::pair<unsigned, int>
4700 const TargetRegisterClass *SuperRC, unsigned VecReg,
4701 int Offset) {
// Number of 32-bit channels in the super-register.
4702 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4703
4704 // Skip out of bounds offsets, or else we would end up using an undefined
4705 // register.
4706 if (Offset >= NumElts || Offset < 0)
4707 return std::pair(AMDGPU::sub0, Offset);
4708
4709 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4710}
4711
// Materializes the (scalar) index operand of \p MI, plus a constant
// \p Offset, directly into M0 — a plain s_mov when the offset is zero,
// otherwise an s_add_i32.
4714 int Offset) {
4715 MachineBasicBlock *MBB = MI.getParent();
4716 const DebugLoc &DL = MI.getDebugLoc();
4718
4719 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4720
4721 assert(Idx->getReg() != AMDGPU::NoRegister);
4722
4723 if (Offset == 0) {
4724 // clang-format off
4725 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4726 .add(*Idx);
4727 // clang-format on
4728 } else {
4729 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4730 .add(*Idx)
4731 .addImm(Offset);
4732 }
4733}
4734
// Returns an SGPR holding the index operand of \p MI plus \p Offset.
// With a zero offset the index register is reused as-is; otherwise the sum
// is computed into a fresh SReg_32_XM0 temporary.
4737 int Offset) {
4738 MachineBasicBlock *MBB = MI.getParent();
4739 const DebugLoc &DL = MI.getDebugLoc();
4741
4742 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4743
4744 if (Offset == 0)
4745 return Idx->getReg();
4746
4747 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4748 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4749 .add(*Idx)
4750 .addImm(Offset);
4751 return Tmp;
4752}
4753
// Expands an indirect vector-element *read* pseudo: dst = src_vec[idx+offset].
// Fast path: a uniform (SGPR) index uses either GPR-index mode or
// v_movrels_b32 directly in the current block. Slow path: a divergent (VGPR)
// index requires the waterfall loop built by loadM0FromVGPR. Returns the
// block in which lowering ended.
4756 const GCNSubtarget &ST) {
4757 const SIInstrInfo *TII = ST.getInstrInfo();
4758 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4761
4762 Register Dst = MI.getOperand(0).getReg();
4763 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4764 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4765 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4766
4767 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4768 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4769
// Fold a constant offset into the subregister selection where possible.
4770 unsigned SubReg;
4771 std::tie(SubReg, Offset) =
4772 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4773
4774 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4775
4776 // Check for a SGPR index.
4777 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4779 const DebugLoc &DL = MI.getDebugLoc();
4780
4781 if (UseGPRIdxMode) {
4782 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4783 // to avoid interfering with other uses, so probably requires a new
4784 // optimization pass.
4786
4787 const MCInstrDesc &GPRIDXDesc =
4788 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4789 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4790 .addReg(SrcReg)
4791 .addReg(Idx)
4792 .addImm(SubReg);
4793 } else {
4795
// M0-relative read; the implicit use keeps the whole source vector alive.
4796 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4797 .addReg(SrcReg, 0, SubReg)
4798 .addReg(SrcReg, RegState::Implicit);
4799 }
4800
4801 MI.eraseFromParent();
4802
4803 return &MBB;
4804 }
4805
4806 // Control flow needs to be inserted if indexing with a VGPR.
4807 const DebugLoc &DL = MI.getDebugLoc();
4809
4810 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4811 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4812
4813 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4814
4815 Register SGPRIdxReg;
4816 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4817 UseGPRIdxMode, SGPRIdxReg);
4818
4819 MachineBasicBlock *LoopBB = InsPt->getParent();
4820
4821 if (UseGPRIdxMode) {
4822 const MCInstrDesc &GPRIDXDesc =
4823 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4824
4825 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4826 .addReg(SrcReg)
4827 .addReg(SGPRIdxReg)
4828 .addImm(SubReg);
4829 } else {
4830 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4831 .addReg(SrcReg, 0, SubReg)
4832 .addReg(SrcReg, RegState::Implicit);
4833 }
4834
4835 MI.eraseFromParent();
4836
4837 return LoopBB;
4838}
4839
// Expands an indirect vector-element *write* pseudo:
// dst = src_vec with element [idx+offset] replaced by val.
// No index register at all degenerates to a plain INSERT_SUBREG; a uniform
// (SGPR) index uses GPR-index mode or a movreld pseudo in place; a divergent
// (VGPR) index goes through the loadM0FromVGPR waterfall loop. Returns the
// block in which lowering ended.
4842 const GCNSubtarget &ST) {
4843 const SIInstrInfo *TII = ST.getInstrInfo();
4844 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4847
4848 Register Dst = MI.getOperand(0).getReg();
4849 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4850 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4851 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4852 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4853 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4854 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4855
4856 // This can be an immediate, but will be folded later.
4857 assert(Val->getReg());
4858
4859 unsigned SubReg;
4860 std::tie(SubReg, Offset) =
4861 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4862 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4863
4864 if (Idx->getReg() == AMDGPU::NoRegister) {
// Fully constant index: the subreg choice above already encodes it, so a
// plain INSERT_SUBREG suffices.
4866 const DebugLoc &DL = MI.getDebugLoc();
4867
4868 assert(Offset == 0);
4869
4870 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4871 .add(*SrcVec)
4872 .add(*Val)
4873 .addImm(SubReg);
4874
4875 MI.eraseFromParent();
4876 return &MBB;
4877 }
4878
4879 // Check for a SGPR index.
4880 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4882 const DebugLoc &DL = MI.getDebugLoc();
4883
4884 if (UseGPRIdxMode) {
4886
4887 const MCInstrDesc &GPRIDXDesc =
4888 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4889 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4890 .addReg(SrcVec->getReg())
4891 .add(*Val)
4892 .addReg(Idx)
4893 .addImm(SubReg);
4894 } else {
4896
4897 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4898 TRI.getRegSizeInBits(*VecRC), 32, false);
4899 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4900 .addReg(SrcVec->getReg())
4901 .add(*Val)
4902 .addImm(SubReg);
4903 }
4904 MI.eraseFromParent();
4905 return &MBB;
4906 }
4907
4908 // Control flow needs to be inserted if indexing with a VGPR.
// The value is re-read every waterfall iteration, so its kill flags are no
// longer accurate.
4909 if (Val->isReg())
4910 MRI.clearKillFlags(Val->getReg());
4911
4912 const DebugLoc &DL = MI.getDebugLoc();
4913
4914 Register PhiReg = MRI.createVirtualRegister(VecRC);
4915
4916 Register SGPRIdxReg;
4917 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4918 UseGPRIdxMode, SGPRIdxReg);
4919 MachineBasicBlock *LoopBB = InsPt->getParent();
4920
4921 if (UseGPRIdxMode) {
4922 const MCInstrDesc &GPRIDXDesc =
4923 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4924
4925 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4926 .addReg(PhiReg)
4927 .add(*Val)
4928 .addReg(SGPRIdxReg)
4929 .addImm(SubReg);
4930 } else {
4931 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4932 TRI.getRegSizeInBits(*VecRC), 32, false);
4933 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4934 .addReg(PhiReg)
4935 .add(*Val)
4936 .addImm(SubReg);
4937 }
4938
4939 MI.eraseFromParent();
4940 return LoopBB;
4941}
4942
4945 const GCNSubtarget &ST,
4946 unsigned Opc) {
4948 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4949 const DebugLoc &DL = MI.getDebugLoc();
4950 const SIInstrInfo *TII = ST.getInstrInfo();
4951
4952 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4953 Register SrcReg = MI.getOperand(1).getReg();
4954 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4955 Register DstReg = MI.getOperand(0).getReg();
4956 MachineBasicBlock *RetBB = nullptr;
4957 if (isSGPR) {
4958 // These operations with a uniform value i.e. SGPR are idempotent.
4959 // Reduced value will be same as given sgpr.
4960 // clang-format off
4961 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4962 .addReg(SrcReg);
4963 // clang-format on
4964 RetBB = &BB;
4965 } else {
4966 // TODO: Implement DPP Strategy and switch based on immediate strategy
4967 // operand. For now, for all the cases (default, Iterative and DPP we use
4968 // iterative approach by default.)
4969
4970 // To reduce the VGPR using iterative approach, we need to iterate
4971 // over all the active lanes. Lowering consists of ComputeLoop,
4972 // which iterate over only active lanes. We use copy of EXEC register
4973 // as induction variable and every active lane modifies it using bitset0
4974 // so that we will get the next active lane for next iteration.
4976 Register SrcReg = MI.getOperand(1).getReg();
4977
4978 // Create Control flow for loop
4979 // Split MI's Machine Basic block into For loop
4980 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4981
4982 // Create virtual registers required for lowering.
4983 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4984 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4985 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4986 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4987
4988 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4989 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4990 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4991
4992 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4993 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4994
4995 bool IsWave32 = ST.isWave32();
4996 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4997 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4998
4999 // Create initail values of induction variable from Exec, Accumulator and
5000 // insert branch instr to newly created ComputeBlockk
5001 uint32_t InitalValue =
5002 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5003 auto TmpSReg =
5004 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5005 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5006 .addImm(InitalValue);
5007 // clang-format off
5008 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5009 .addMBB(ComputeLoop);
5010 // clang-format on
5011
5012 // Start constructing ComputeLoop
5013 I = ComputeLoop->end();
5014 auto Accumulator =
5015 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5016 .addReg(InitalValReg)
5017 .addMBB(&BB);
5018 auto ActiveBits =
5019 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5020 .addReg(TmpSReg->getOperand(0).getReg())
5021 .addMBB(&BB);
5022
5023 // Perform the computations
5024 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5025 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5026 .addReg(ActiveBits->getOperand(0).getReg());
5027 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5028 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5029 .addReg(SrcReg)
5030 .addReg(FF1->getOperand(0).getReg());
5031 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5032 .addReg(Accumulator->getOperand(0).getReg())
5033 .addReg(LaneValue->getOperand(0).getReg());
5034
5035 // Manipulate the iterator to get the next active lane
5036 unsigned BITSETOpc =
5037 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5038 auto NewActiveBits =
5039 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5040 .addReg(FF1->getOperand(0).getReg())
5041 .addReg(ActiveBits->getOperand(0).getReg());
5042
5043 // Add phi nodes
5044 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5045 .addMBB(ComputeLoop);
5046 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5047 .addMBB(ComputeLoop);
5048
5049 // Creating branching
5050 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5051 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5052 .addReg(NewActiveBits->getOperand(0).getReg())
5053 .addImm(0);
5054 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5055 .addMBB(ComputeLoop);
5056
5057 RetBB = ComputeEnd;
5058 }
5059 MI.eraseFromParent();
5060 return RetBB;
5061}
5062
5065 MachineBasicBlock *BB) const {
  // NOTE(review): this listing is a doxygen rendering; the function's opening
  // signature line (presumably SITargetLowering::EmitInstrWithCustomInserter)
  // and several hyperlinked declaration lines (the TII / MRI / MFI / Info /
  // MII locals referenced below, plus a few conditions) were dropped by the
  // extractor — confirm every gap against upstream SIISelLowering.cpp.
  // This hook expands pseudo instructions after instruction selection by
  // building real MachineInstrs and erasing the pseudo.
5066
5068 MachineFunction *MF = BB->getParent();
5070
5071 switch (MI.getOpcode()) {
5072 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5073 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5074 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5075 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5076 case AMDGPU::S_UADDO_PSEUDO:
5077 case AMDGPU::S_USUBO_PSEUDO: {
  // Scalar add/sub with overflow: the 32-bit op sets SCC, and the overflow
  // result (Dest1) is materialized from SCC with S_CSELECT.
5078 const DebugLoc &DL = MI.getDebugLoc();
5079 MachineOperand &Dest0 = MI.getOperand(0);
5080 MachineOperand &Dest1 = MI.getOperand(1);
5081 MachineOperand &Src0 = MI.getOperand(2);
5082 MachineOperand &Src1 = MI.getOperand(3);
5083
5084 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5085 ? AMDGPU::S_ADD_I32
5086 : AMDGPU::S_SUB_I32;
5087 // clang-format off
5088 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5089 .add(Src0)
5090 .add(Src1);
5091 // clang-format on
5092
5093 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5094 .addImm(1)
5095 .addImm(0);
5096
5097 MI.eraseFromParent();
5098 return BB;
5099 }
5100 case AMDGPU::S_ADD_U64_PSEUDO:
5101 case AMDGPU::S_SUB_U64_PSEUDO: {
5102 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5103 // For GFX12, we emit s_add_u64 and s_sub_u64.
5104 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5106 const DebugLoc &DL = MI.getDebugLoc();
5107 MachineOperand &Dest = MI.getOperand(0);
5108 MachineOperand &Src0 = MI.getOperand(1);
5109 MachineOperand &Src1 = MI.getOperand(2);
5110 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5111 if (Subtarget->hasScalarAddSub64()) {
5112 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5113 // clang-format off
5114 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5115 .add(Src0)
5116 .add(Src1);
5117 // clang-format on
5118 } else {
  // No native 64-bit scalar add/sub: split both sources into sub0/sub1,
  // do lo op (sets SCC carry), hi op with carry, then reassemble.
5119 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5120 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5121
5122 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5123 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5124
5125 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5126 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5127 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5128 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5129
5130 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5131 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5132 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5133 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5134
5135 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5136 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5137 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5138 .add(Src0Sub0)
5139 .add(Src1Sub0);
5140 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5141 .add(Src0Sub1)
5142 .add(Src1Sub1);
5143 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5144 .addReg(DestSub0)
5145 .addImm(AMDGPU::sub0)
5146 .addReg(DestSub1)
5147 .addImm(AMDGPU::sub1);
5148 }
5149 MI.eraseFromParent();
5150 return BB;
5151 }
5152 case AMDGPU::V_ADD_U64_PSEUDO:
5153 case AMDGPU::V_SUB_U64_PSEUDO: {
5155 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5157 const DebugLoc &DL = MI.getDebugLoc();
5158
5159 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5160
5161 MachineOperand &Dest = MI.getOperand(0);
5162 MachineOperand &Src0 = MI.getOperand(1);
5163 MachineOperand &Src1 = MI.getOperand(2);
5164
  // Prefer a single v_lshl_add_u64 (shift amount 0) when available.
5165 if (IsAdd && ST.hasLshlAddB64()) {
5166 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5167 Dest.getReg())
5168 .add(Src0)
5169 .addImm(0)
5170 .add(Src1);
5171 TII->legalizeOperands(*Add);
5172 MI.eraseFromParent();
5173 return BB;
5174 }
5175
5176 const auto *CarryRC = TRI->getWaveMaskRegClass();
5177
5178 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5179 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5180
5181 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5182 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5183
5184 const TargetRegisterClass *Src0RC = Src0.isReg()
5185 ? MRI.getRegClass(Src0.getReg())
5186 : &AMDGPU::VReg_64RegClass;
5187 const TargetRegisterClass *Src1RC = Src1.isReg()
5188 ? MRI.getRegClass(Src1.getReg())
5189 : &AMDGPU::VReg_64RegClass;
5190
5191 const TargetRegisterClass *Src0SubRC =
5192 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5193 const TargetRegisterClass *Src1SubRC =
5194 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5195
5196 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5197 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5198 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5199 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5200
5201 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5202 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5203 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5204 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5205
5206 unsigned LoOpc =
5207 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5208 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5209 .addReg(CarryReg, RegState::Define)
5210 .add(SrcReg0Sub0)
5211 .add(SrcReg1Sub0)
5212 .addImm(0); // clamp bit
5213
5214 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5215 MachineInstr *HiHalf =
5216 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5217 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5218 .add(SrcReg0Sub1)
5219 .add(SrcReg1Sub1)
5220 .addReg(CarryReg, RegState::Kill)
5221 .addImm(0); // clamp bit
5222
5223 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5224 .addReg(DestSub0)
5225 .addImm(AMDGPU::sub0)
5226 .addReg(DestSub1)
5227 .addImm(AMDGPU::sub1);
5228 TII->legalizeOperands(*LoHalf);
5229 TII->legalizeOperands(*HiHalf);
5230 MI.eraseFromParent();
5231 return BB;
5232 }
5233 case AMDGPU::S_ADD_CO_PSEUDO:
5234 case AMDGPU::S_SUB_CO_PSEUDO: {
5235 // This pseudo has a chance to be selected
5236 // only from uniform add/subcarry node. All the VGPR operands
5237 // therefore assumed to be splat vectors.
5239 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5240 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5242 const DebugLoc &DL = MI.getDebugLoc();
5243 MachineOperand &Dest = MI.getOperand(0);
5244 MachineOperand &CarryDest = MI.getOperand(1);
5245 MachineOperand &Src0 = MI.getOperand(2);
5246 MachineOperand &Src1 = MI.getOperand(3);
5247 MachineOperand &Src2 = MI.getOperand(4);
5248 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5249 ? AMDGPU::S_ADDC_U32
5250 : AMDGPU::S_SUBB_U32;
  // Any operand that landed in a VGPR is assumed uniform (splat), so it is
  // moved to an SGPR via v_readfirstlane before the scalar op.
5251 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5252 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5253 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5254 .addReg(Src0.getReg());
5255 Src0.setReg(RegOp0);
5256 }
5257 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5258 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5260 .addReg(Src1.getReg());
5261 Src1.setReg(RegOp1);
5262 }
5263 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5264 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5265 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5266 .addReg(Src2.getReg());
5267 Src2.setReg(RegOp2);
5268 }
5269
5270 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5271 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5272 assert(WaveSize == 64 || WaveSize == 32);
5273
  // Set SCC from the incoming carry mask (Src2 != 0) before the addc/subb.
5274 if (WaveSize == 64) {
5275 if (ST.hasScalarCompareEq64()) {
5276 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5277 .addReg(Src2.getReg())
5278 .addImm(0);
5279 } else {
5280 const TargetRegisterClass *SubRC =
5281 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5282 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5283 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5284 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5285 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5286 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5287
5288 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5289 .add(Src2Sub0)
5290 .add(Src2Sub1);
5291
5292 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5293 .addReg(Src2_32, RegState::Kill)
5294 .addImm(0);
5295 }
5296 } else {
5297 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5298 .addReg(Src2.getReg())
5299 .addImm(0);
5300 }
5301
5302 // clang-format off
5303 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5304 .add(Src0)
5305 .add(Src1);
5306 // clang-format on
5307
5308 unsigned SelOpc =
5309 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5310
5311 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5312 .addImm(-1)
5313 .addImm(0);
5314
5315 MI.eraseFromParent();
5316 return BB;
5317 }
5318 case AMDGPU::SI_INIT_M0: {
5319 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5320 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5321 .add(MI.getOperand(0));
5322 MI.eraseFromParent();
5323 return BB;
5324 }
5325 case AMDGPU::GET_GROUPSTATICSIZE: {
5326 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5327 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5328 DebugLoc DL = MI.getDebugLoc();
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5330 .add(MI.getOperand(0))
5331 .addImm(MFI->getLDSSize());
5332 MI.eraseFromParent();
5333 return BB;
5334 }
5335 case AMDGPU::GET_SHADERCYCLESHILO: {
5338 const DebugLoc &DL = MI.getDebugLoc();
5339 // The algorithm is:
5340 //
5341 // hi1 = getreg(SHADER_CYCLES_HI)
5342 // lo1 = getreg(SHADER_CYCLES_LO)
5343 // hi2 = getreg(SHADER_CYCLES_HI)
5344 //
5345 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5346 // Otherwise there was overflow and the result is hi2:0. In both cases the
5347 // result should represent the actual time at some point during the sequence
5348 // of three getregs.
5349 using namespace AMDGPU::Hwreg;
5350 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5351 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5352 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5353 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5354 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5355 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5356 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5357 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5358 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5359 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5360 .addReg(RegHi1)
5361 .addReg(RegHi2);
5362 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5364 .addReg(RegLo1)
5365 .addImm(0);
5366 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5367 .add(MI.getOperand(0))
5368 .addReg(RegLo)
5369 .addImm(AMDGPU::sub0)
5370 .addReg(RegHi2)
5371 .addImm(AMDGPU::sub1);
5372 MI.eraseFromParent();
5373 return BB;
5374 }
5375 case AMDGPU::SI_INDIRECT_SRC_V1:
5376 case AMDGPU::SI_INDIRECT_SRC_V2:
5377 case AMDGPU::SI_INDIRECT_SRC_V4:
5378 case AMDGPU::SI_INDIRECT_SRC_V8:
5379 case AMDGPU::SI_INDIRECT_SRC_V9:
5380 case AMDGPU::SI_INDIRECT_SRC_V10:
5381 case AMDGPU::SI_INDIRECT_SRC_V11:
5382 case AMDGPU::SI_INDIRECT_SRC_V12:
5383 case AMDGPU::SI_INDIRECT_SRC_V16:
5384 case AMDGPU::SI_INDIRECT_SRC_V32:
5385 return emitIndirectSrc(MI, *BB, *getSubtarget());
5386 case AMDGPU::SI_INDIRECT_DST_V1:
5387 case AMDGPU::SI_INDIRECT_DST_V2:
5388 case AMDGPU::SI_INDIRECT_DST_V4:
5389 case AMDGPU::SI_INDIRECT_DST_V8:
5390 case AMDGPU::SI_INDIRECT_DST_V9:
5391 case AMDGPU::SI_INDIRECT_DST_V10:
5392 case AMDGPU::SI_INDIRECT_DST_V11:
5393 case AMDGPU::SI_INDIRECT_DST_V12:
5394 case AMDGPU::SI_INDIRECT_DST_V16:
5395 case AMDGPU::SI_INDIRECT_DST_V32:
5396 return emitIndirectDst(MI, *BB, *getSubtarget())
5397 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5398 case AMDGPU::SI_KILL_I1_PSEUDO:
5399 return splitKillBlock(MI, BB);
5400 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
  // 64-bit select: split into two 32-bit v_cndmask halves sharing one
  // condition mask copy, then reassemble with REG_SEQUENCE.
5402 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5403 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5404
5405 Register Dst = MI.getOperand(0).getReg();
5406 const MachineOperand &Src0 = MI.getOperand(1);
5407 const MachineOperand &Src1 = MI.getOperand(2);
5408 const DebugLoc &DL = MI.getDebugLoc();
5409 Register SrcCond = MI.getOperand(3).getReg();
5410
5411 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5412 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5413 const auto *CondRC = TRI->getWaveMaskRegClass();
5414 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5415
5416 const TargetRegisterClass *Src0RC = Src0.isReg()
5417 ? MRI.getRegClass(Src0.getReg())
5418 : &AMDGPU::VReg_64RegClass;
5419 const TargetRegisterClass *Src1RC = Src1.isReg()
5420 ? MRI.getRegClass(Src1.getReg())
5421 : &AMDGPU::VReg_64RegClass;
5422
5423 const TargetRegisterClass *Src0SubRC =
5424 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5425 const TargetRegisterClass *Src1SubRC =
5426 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5427
5428 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5429 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5430 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5431 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5432
5433 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5434 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5435 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5436 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5437
5438 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5439 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5440 .addImm(0)
5441 .add(Src0Sub0)
5442 .addImm(0)
5443 .add(Src1Sub0)
5444 .addReg(SrcCondCopy);
5445 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5446 .addImm(0)
5447 .add(Src0Sub1)
5448 .addImm(0)
5449 .add(Src1Sub1)
5450 .addReg(SrcCondCopy);
5451
5452 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5453 .addReg(DstLo)
5454 .addImm(AMDGPU::sub0)
5455 .addReg(DstHi)
5456 .addImm(AMDGPU::sub1);
5457 MI.eraseFromParent();
5458 return BB;
5459 }
5460 case AMDGPU::SI_BR_UNDEF: {
5462 const DebugLoc &DL = MI.getDebugLoc();
5463 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5464 .add(MI.getOperand(0));
5465 Br->getOperand(1).setIsUndef(); // read undef SCC
5466 MI.eraseFromParent();
5467 return BB;
5468 }
5469 case AMDGPU::ADJCALLSTACKUP:
5470 case AMDGPU::ADJCALLSTACKDOWN: {
5472 MachineInstrBuilder MIB(*MF, &MI);
5473 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5474 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5475 return BB;
5476 }
5477 case AMDGPU::SI_CALL_ISEL: {
5479 const DebugLoc &DL = MI.getDebugLoc();
5480
5481 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5482
5484 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5485
5486 for (const MachineOperand &MO : MI.operands())
5487 MIB.add(MO);
5488
5489 MIB.cloneMemRefs(MI);
5490 MI.eraseFromParent();
5491 return BB;
5492 }
5493 case AMDGPU::V_ADD_CO_U32_e32:
5494 case AMDGPU::V_SUB_CO_U32_e32:
5495 case AMDGPU::V_SUBREV_CO_U32_e32: {
5496 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5497 const DebugLoc &DL = MI.getDebugLoc();
5498 unsigned Opc = MI.getOpcode();
5499
5500 bool NeedClampOperand = false;
5501 if (TII->pseudoToMCOpcode(Opc) == -1) {
5502 Opc = AMDGPU::getVOPe64(Opc);
5503 NeedClampOperand = true;
5504 }
5505
5506 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5507 if (TII->isVOP3(*I)) {
5508 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5509 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5510 I.addReg(TRI->getVCC(), RegState::Define);
5511 }
5512 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5513 if (NeedClampOperand)
5514 I.addImm(0); // clamp bit for e64 encoding
5515
5516 TII->legalizeOperands(*I);
5517
5518 MI.eraseFromParent();
5519 return BB;
5520 }
5521 case AMDGPU::V_ADDC_U32_e32:
5522 case AMDGPU::V_SUBB_U32_e32:
5523 case AMDGPU::V_SUBBREV_U32_e32:
5524 // These instructions have an implicit use of vcc which counts towards the
5525 // constant bus limit.
5526 TII->legalizeOperands(MI);
5527 return BB;
5528 case AMDGPU::DS_GWS_INIT:
5529 case AMDGPU::DS_GWS_SEMA_BR:
5530 case AMDGPU::DS_GWS_BARRIER:
5531 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5532 [[fallthrough]];
5533 case AMDGPU::DS_GWS_SEMA_V:
5534 case AMDGPU::DS_GWS_SEMA_P:
5535 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5536 // A s_waitcnt 0 is required to be the instruction immediately following.
5537 if (getSubtarget()->hasGWSAutoReplay()) {
  // NOTE(review): a statement line (5538) was dropped by the doc extractor
  // here — confirm against upstream before relying on this path.
5539 return BB;
5540 }
5541
5542 return emitGWSMemViolTestLoop(MI, BB);
5543 case AMDGPU::S_SETREG_B32: {
5544 // Try to optimize cases that only set the denormal mode or rounding mode.
5545 //
5546 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5547 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5548 // instead.
5549 //
5550 // FIXME: This could be predicates on the immediate, but tablegen doesn't
5551 // allow you to have a no side effect instruction in the output of a
5552 // sideeffecting pattern.
5553 auto [ID, Offset, Width] =
5554 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
  // NOTE(review): the condition line (5555) guarding this early return was
  // dropped by the doc extractor — confirm against upstream.
5556 return BB;
5557
5558 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5559 const unsigned SetMask = WidthMask << Offset;
5560
5561 if (getSubtarget()->hasDenormModeInst()) {
5562 unsigned SetDenormOp = 0;
5563 unsigned SetRoundOp = 0;
5564
5565 // The dedicated instructions can only set the whole denorm or round mode
5566 // at once, not a subset of bits in either.
5567 if (SetMask ==
5569 // If this fully sets both the round and denorm mode, emit the two
5570 // dedicated instructions for these.
5571 SetRoundOp = AMDGPU::S_ROUND_MODE;
5572 SetDenormOp = AMDGPU::S_DENORM_MODE;
5573 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5574 SetRoundOp = AMDGPU::S_ROUND_MODE;
5575 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5576 SetDenormOp = AMDGPU::S_DENORM_MODE;
5577 }
5578
5579 if (SetRoundOp || SetDenormOp) {
5581 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5582 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5583 unsigned ImmVal = Def->getOperand(1).getImm();
5584 if (SetRoundOp) {
5585 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5586 .addImm(ImmVal & 0xf);
5587
5588 // If we also have the denorm mode, get just the denorm mode bits.
5589 ImmVal >>= 4;
5590 }
5591
5592 if (SetDenormOp) {
5593 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5594 .addImm(ImmVal & 0xf);
5595 }
5596
5597 MI.eraseFromParent();
5598 return BB;
5599 }
5600 }
5601 }
5602
5603 // If only FP bits are touched, used the no side effects pseudo.
5604 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5605 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5606 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5607
5608 return BB;
5609 }
5610 case AMDGPU::S_INVERSE_BALLOT_U32:
5611 case AMDGPU::S_INVERSE_BALLOT_U64:
5612 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5613 // necessary. After that they are equivalent to a COPY.
5614 MI.setDesc(TII->get(AMDGPU::COPY));
5615 return BB;
5616 case AMDGPU::ENDPGM_TRAP: {
5617 const DebugLoc &DL = MI.getDebugLoc();
5618 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5619 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5620 MI.addOperand(MachineOperand::CreateImm(0));
5621 return BB;
5622 }
5623
5624 // We need a block split to make the real endpgm a terminator. We also don't
5625 // want to break phis in successor blocks, so we can't just delete to the
5626 // end of the block.
5627
5628 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5630 MF->push_back(TrapBB);
5631 // clang-format off
5632 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5633 .addImm(0);
5634 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5635 .addMBB(TrapBB);
5636 // clang-format on
5637
5638 BB->addSuccessor(TrapBB);
5639 MI.eraseFromParent();
5640 return SplitBB;
5641 }
5642 case AMDGPU::SIMULATED_TRAP: {
5643 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5645 MachineBasicBlock *SplitBB =
5646 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5647 MI.eraseFromParent();
5648 return SplitBB;
5649 }
5650 default:
5651 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5652 if (!MI.mayStore())
  // NOTE(review): the statement guarded by !MI.mayStore() (line 5653) and
  // the fallthrough return (line 5656) were dropped by the doc extractor —
  // confirm against upstream.
5654 return BB;
5655 }
5657 }
5658}
5659
// Tail of a predicate (presumably SITargetLowering::enableAggressiveFMAFusion;
// the defining line 5660 was dropped by the doc extractor — confirm upstream).
// Unconditionally opts in to aggressive fsub->fma unfolding.
5661 // This currently forces unfolding various combinations of fsub into fma with
5662 // free fneg'd operands. As long as we have fast FMA (controlled by
5663 // isFMAFasterThanFMulAndFAdd), we should perform these.
5664
5665 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5666 // most of these combines appear to be cycle neutral but save on instruction
5667 // count / code size.
5668 return true;
5669}
5670
5672
// Result type for setcc: scalar compares produce i1; vector compares produce a
// vector of i1 with one element per compared lane. (The signature head, line
// 5673, which declares Ctx, was dropped by the doc extractor.)
5674 EVT VT) const {
5675 if (!VT.isVector()) {
5676 return MVT::i1;
5677 }
5678 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5679}
5680
// Shift-amount type: keep i16 shifts in i16, widen everything else to i32.
// (The signature head, line 5681, was dropped by the doc extractor.)
5682 // TODO: Should i16 be used always if legal? For now it would force VALU
5683 // shifts.
5684 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5685}
5686
// Preferred shift-amount element size: 16-bit when the value is <= 16 bits and
// the subtarget has 16-bit instructions, otherwise 32-bit. (The signature
// head, line 5687, was dropped by the doc extractor.)
5688 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5689 ? Ty.changeElementSize(16)
5690 : Ty.changeElementSize(32);
5691}
5692
5693 // Answering this is somewhat tricky and depends on the specific device which
5694 // have different rates for fma or all f64 operations.
5695 //
5696 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5697 // regardless of which device (although the number of cycles differs between
5698 // devices), so it is always profitable for f64.
5699 //
5700 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5701 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
5702 // which we can always do even without fused FP ops since it returns the same
5703 // result as the separate operations and since it is always full
5704 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5705 // however does not support denormals, so we do report fma as faster if we have
5706 // a fast fma device and require denormals.
5707 //
// NOTE(review): the defining line (5708) was dropped by the doc extractor;
// from the body this is the (MachineFunction &MF, EVT) overload — confirm.
5709 EVT VT) const {
5710 VT = VT.getScalarType();
5711
5712 switch (VT.getSimpleVT().SimpleTy) {
5713 case MVT::f32: {
5714 // If mad is not available this depends only on if f32 fma is full rate.
5715 if (!Subtarget->hasMadMacF32Insts())
5716 return Subtarget->hasFastFMAF32();
5717
5718 // Otherwise f32 mad is always full rate and returns the same result as
5719 // the separate operations so should be preferred over fma.
5720 // However does not support denormals.
  // NOTE(review): the condition guarding the next return (line 5721,
  // presumably an f32 denormal-mode check) was dropped by the doc extractor
  // — confirm against upstream.
5722 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5723
5724 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5725 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5726 }
5727 case MVT::f64:
5728 return true;
5729 case MVT::f16:
5730 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5731 default:
5732 break;
5733 }
5734
5735 return false;
5736}
5737
// GlobalISel (LLT) overload: delegates to the MVT-based query by scalar bit
// width; anything other than 16/32/64-bit scalars answers false. (The
// signature head, line 5738, was dropped by the doc extractor.)
5739 LLT Ty) const {
5740 switch (Ty.getScalarSizeInBits()) {
5741 case 16:
5742 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5743 case 32:
5744 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5745 case 64:
5746 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5747 default:
5748 break;
5749 }
5750
5751 return false;
5752}
5753
5754 // Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5755 // specific details.
// NOTE(review): the defining line (5756) was dropped by the doc extractor;
// this is the IR-type overload. The declarations of the local `Mode` used in
// the 16- and 32-bit cases (lines 5760 and 5768) were also dropped — confirm
// against upstream.
5757 Type *Ty) const {
5758 switch (Ty->getScalarSizeInBits()) {
5759 case 16: {
5761 return Subtarget->has16BitInsts() &&
5762 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5763 }
5764 case 32: {
5765 if (!Subtarget->hasMadMacF32Insts())
5766 return Subtarget->hasFastFMAF32();
5767
5769 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5770 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5771
5772 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5773 }
5774 case 64:
5775 return true;
5776 default:
5777 break;
5778 }
5779
5780 return false;
5781}
5782
// FMAD (unfused mad) legality for GlobalISel: only scalar f16/f32, and only
// when the function's FP mode flushes denormals (v_mad/v_mac flush). (The
// signature head, line 5783, was dropped by the doc extractor.)
5784 if (!Ty.isScalar())
5785 return false;
5786
5787 if (Ty.getScalarSizeInBits() == 16)
5788 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5789 if (Ty.getScalarSizeInBits() == 32)
5790 return Subtarget->hasMadMacF32Insts() &&
5791 denormalModeIsFlushAllF32(*MI.getMF());
5792
5793 return false;
5794}
5795
// SelectionDAG variant of the FMAD legality check. NOTE(review): the signature
// head (line 5796) and the second halves of both return expressions (lines
// 5803 and 5806 — presumably the denormalModeIsFlushAll* calls, mirroring the
// MIR variant above) were dropped by the doc extractor — confirm upstream.
5797 const SDNode *N) const {
5798 // TODO: Check future ftz flag
5799 // v_mad_f32/v_mac_f32 do not support denormals.
5800 EVT VT = N->getValueType(0);
5801 if (VT == MVT::f32)
5802 return Subtarget->hasMadMacF32Insts() &&
5804 if (VT == MVT::f16) {
5805 return Subtarget->hasMadF16() &&
5807 }
5808
5809 return false;
5810}
5811
5812//===----------------------------------------------------------------------===//
5813// Custom DAG Lowering Operations
5814//===----------------------------------------------------------------------===//
5815
5816 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5817 // wider vector type is legal.
// Splits a one-operand vector op into low/high halves, applies the op to each
// half (propagating node flags), and reconcatenates. (The signature head,
// line 5818, was dropped by the doc extractor.)
5819 SelectionDAG &DAG) const {
5820 unsigned Opc = Op.getOpcode();
5821 EVT VT = Op.getValueType();
5822 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5823 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5824 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5825 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5826
5827 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5828
5829 SDLoc SL(Op);
5830 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5831 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5832
5833 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5834}
5835
5836 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5837 // wider vector type is legal.
// Two-operand sibling of splitUnaryVectorOp: splits both operands, applies the
// op per half with flags preserved, and reconcatenates. (The signature head,
// line 5838, was dropped by the doc extractor.)
5839 SelectionDAG &DAG) const {
5840 unsigned Opc = Op.getOpcode();
5841 EVT VT = Op.getValueType();
5842 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5843 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5844 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5845 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5846
5847 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5848 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5849
5850 SDLoc SL(Op);
5851
5852 SDValue OpLo =
5853 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5854 SDValue OpHi =
5855 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5856
5857 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5858}
5859
// Three-operand sibling of splitUnaryVectorOp/splitBinaryVectorOp: splits all
// operands, applies the op per half, and reconcatenates. Operand 0 may be a
// non-vector (e.g. a scalar selector), in which case it is reused unsplit for
// both halves. (The signature head, line 5860, was dropped by the extractor.)
5861 SelectionDAG &DAG) const {
5862 unsigned Opc = Op.getOpcode();
5863 EVT VT = Op.getValueType();
5864 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5865 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5866 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5867 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5868 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5869 VT == MVT::v32bf16);
5870
5871 SDValue Op0 = Op.getOperand(0);
5872 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5873 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5874 : std::pair(Op0, Op0);
5875
5876 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5877 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5878
5879 SDLoc SL(Op);
5880 auto ResVT = DAG.GetSplitDestVTs(VT);
5881
5882 SDValue OpLo =
5883 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5884 SDValue OpHi =
5885 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5886
5887 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5888}
5889
5891 switch (Op.getOpcode()) {
5892 default:
5894 case ISD::BRCOND:
5895 return LowerBRCOND(Op, DAG);
5896 case ISD::RETURNADDR:
5897 return LowerRETURNADDR(Op, DAG);
5898 case ISD::LOAD: {
5899 SDValue Result = LowerLOAD(Op, DAG);
5900 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5901 "Load should return a value and a chain");
5902 return Result;
5903 }
5904 case ISD::FSQRT: {
5905 EVT VT = Op.getValueType();
5906 if (VT == MVT::f32)
5907 return lowerFSQRTF32(Op, DAG);
5908 if (VT == MVT::f64)
5909 return lowerFSQRTF64(Op, DAG);
5910 return SDValue();
5911 }
5912 case ISD::FSIN:
5913 case ISD::FCOS:
5914 return LowerTrig(Op, DAG);
5915 case ISD::SELECT:
5916 return LowerSELECT(Op, DAG);
5917 case ISD::FDIV:
5918 return LowerFDIV(Op, DAG);
5919 case ISD::FFREXP:
5920 return LowerFFREXP(Op, DAG);
5922 return LowerATOMIC_CMP_SWAP(Op, DAG);
5923 case ISD::STORE:
5924 return LowerSTORE(Op, DAG);
5925 case ISD::GlobalAddress: {
5928 return LowerGlobalAddress(MFI, Op, DAG);
5929 }
5931 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5933 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5935 return LowerINTRINSIC_VOID(Op, DAG);
5936 case ISD::ADDRSPACECAST:
5937 return lowerADDRSPACECAST(Op, DAG);
5939 return lowerINSERT_SUBVECTOR(Op, DAG);
5941 return lowerINSERT_VECTOR_ELT(Op, DAG);
5943 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5945 return lowerVECTOR_SHUFFLE(Op, DAG);
5947 return lowerSCALAR_TO_VECTOR(Op, DAG);
5948 case ISD::BUILD_VECTOR:
5949 return lowerBUILD_VECTOR(Op, DAG);
5950 case ISD::FP_ROUND:
5952 return lowerFP_ROUND(Op, DAG);
5953 case ISD::TRAP:
5954 return lowerTRAP(Op, DAG);
5955 case ISD::DEBUGTRAP:
5956 return lowerDEBUGTRAP(Op, DAG);
5957 case ISD::ABS:
5958 case ISD::FABS:
5959 case ISD::FNEG:
5960 case ISD::FCANONICALIZE:
5961 case ISD::BSWAP:
5962 return splitUnaryVectorOp(Op, DAG);
5963 case ISD::FMINNUM:
5964 case ISD::FMAXNUM:
5965 return lowerFMINNUM_FMAXNUM(Op, DAG);
5966 case ISD::FLDEXP:
5967 case ISD::STRICT_FLDEXP:
5968 return lowerFLDEXP(Op, DAG);
5969 case ISD::FMA:
5970 return splitTernaryVectorOp(Op, DAG);
5971 case ISD::FP_TO_SINT:
5972 case ISD::FP_TO_UINT:
5973 return LowerFP_TO_INT(Op, DAG);
5974 case ISD::SHL:
5975 case ISD::SRA:
5976 case ISD::SRL:
5977 case ISD::ADD:
5978 case ISD::SUB:
5979 case ISD::SMIN:
5980 case ISD::SMAX:
5981 case ISD::UMIN:
5982 case ISD::UMAX:
5983 case ISD::FADD:
5984 case ISD::FMUL:
5985 case ISD::FMINNUM_IEEE:
5986 case ISD::FMAXNUM_IEEE:
5987 case ISD::FMINIMUM:
5988 case ISD::FMAXIMUM:
5989 case ISD::FMINIMUMNUM:
5990 case ISD::FMAXIMUMNUM:
5991 case ISD::UADDSAT:
5992 case ISD::USUBSAT:
5993 case ISD::SADDSAT:
5994 case ISD::SSUBSAT:
5995 return splitBinaryVectorOp(Op, DAG);
5996 case ISD::MUL:
5997 return lowerMUL(Op, DAG);
5998 case ISD::SMULO:
5999 case ISD::UMULO:
6000 return lowerXMULO(Op, DAG);
6001 case ISD::SMUL_LOHI:
6002 case ISD::UMUL_LOHI:
6003 return lowerXMUL_LOHI(Op, DAG);
6005 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6006 case ISD::STACKSAVE:
6007 return LowerSTACKSAVE(Op, DAG);
6008 case ISD::GET_ROUNDING:
6009 return lowerGET_ROUNDING(Op, DAG);
6010 case ISD::SET_ROUNDING:
6011 return lowerSET_ROUNDING(Op, DAG);
6012 case ISD::PREFETCH:
6013 return lowerPREFETCH(Op, DAG);
6014 case ISD::FP_EXTEND:
6016 return lowerFP_EXTEND(Op, DAG);
6017 case ISD::GET_FPENV:
6018 return lowerGET_FPENV(Op, DAG);
6019 case ISD::SET_FPENV:
6020 return lowerSET_FPENV(Op, DAG);
6021 }
6022 return SDValue();
6023}
6024
6025// Used for D16: Casts the result of an instruction into the right vector,
6026// packs values if loads return unpacked values.
6028 const SDLoc &DL, SelectionDAG &DAG,
6029 bool Unpacked) {
6030 if (!LoadVT.isVector())
6031 return Result;
6032
6033 // Cast back to the original packed type or to a larger type that is a
6034 // multiple of 32 bit for D16. Widening the return type is a required for
6035 // legalization.
6036 EVT FittingLoadVT = LoadVT;
6037 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6038 FittingLoadVT =
6040 LoadVT.getVectorNumElements() + 1);
6041 }
6042
6043 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6044 // Truncate to v2i16/v4i16.
6045 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6046
6047 // Workaround legalizer not scalarizing truncate after vector op
6048 // legalization but not creating intermediate vector trunc.
6050 DAG.ExtractVectorElements(Result, Elts);
6051 for (SDValue &Elt : Elts)
6052 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6053
6054 // Pad illegal v1i16/v3fi6 to v4i16
6055 if ((LoadVT.getVectorNumElements() % 2) == 1)
6056 Elts.push_back(DAG.getUNDEF(MVT::i16));
6057
6058 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6059
6060 // Bitcast to original type (v2f16/v4f16).
6061 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6062 }
6063
6064 // Cast back to the original packed type.
6065 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6066}
6067
6068SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6069 SelectionDAG &DAG,
6071 bool IsIntrinsic) const {
6072 SDLoc DL(M);
6073
6074 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6075 EVT LoadVT = M->getValueType(0);
6076
6077 EVT EquivLoadVT = LoadVT;
6078 if (LoadVT.isVector()) {
6079 if (Unpacked) {
6080 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6081 LoadVT.getVectorNumElements());
6082 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6083 // Widen v3f16 to legal type
6084 EquivLoadVT =
6086 LoadVT.getVectorNumElements() + 1);
6087 }
6088 }
6089
6090 // Change from v4f16/v2f16 to EquivLoadVT.
6091 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6092
6094 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6095 M->getMemoryVT(), M->getMemOperand());
6096
6097 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6098
6099 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6100}
6101
6102SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6103 SelectionDAG &DAG,
6104 ArrayRef<SDValue> Ops) const {
6105 SDLoc DL(M);
6106 EVT LoadVT = M->getValueType(0);
6107 EVT EltType = LoadVT.getScalarType();
6108 EVT IntVT = LoadVT.changeTypeToInteger();
6109
6110 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6111
6112 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6113 bool IsTFE = M->getNumValues() == 3;
6114
6115 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6119
6120 if (IsD16) {
6121 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6122 }
6123
6124 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6125 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6126 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6127 IsTFE);
6128
6129 if (isTypeLegal(LoadVT)) {
6130 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6131 M->getMemOperand(), DAG);
6132 }
6133
6134 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6135 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6136 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6137 M->getMemOperand(), DAG);
6138 return DAG.getMergeValues(
6139 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6140 DL);
6141}
6142
6144 SelectionDAG &DAG) {
6145 EVT VT = N->getValueType(0);
6146 unsigned CondCode = N->getConstantOperandVal(3);
6147 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6148 return DAG.getUNDEF(VT);
6149
6150 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6151
6152 SDValue LHS = N->getOperand(1);
6153 SDValue RHS = N->getOperand(2);
6154
6155 SDLoc DL(N);
6156
6157 EVT CmpVT = LHS.getValueType();
6158 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6159 unsigned PromoteOp =
6161 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6162 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6163 }
6164
6165 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6166
6167 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6168 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6169
6170 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6171 DAG.getCondCode(CCOpcode));
6172 if (VT.bitsEq(CCVT))
6173 return SetCC;
6174 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6175}
6176
6178 SelectionDAG &DAG) {
6179 EVT VT = N->getValueType(0);
6180
6181 unsigned CondCode = N->getConstantOperandVal(3);
6182 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6183 return DAG.getUNDEF(VT);
6184
6185 SDValue Src0 = N->getOperand(1);
6186 SDValue Src1 = N->getOperand(2);
6187 EVT CmpVT = Src0.getValueType();
6188 SDLoc SL(N);
6189
6190 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6191 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6192 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6193 }
6194
6195 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6196 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6197 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6198 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6199 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6200 DAG.getCondCode(CCOpcode));
6201 if (VT.bitsEq(CCVT))
6202 return SetCC;
6203 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6204}
6205
6207 SelectionDAG &DAG) {
6208 EVT VT = N->getValueType(0);
6209 SDValue Src = N->getOperand(1);
6210 SDLoc SL(N);
6211
6212 if (Src.getOpcode() == ISD::SETCC) {
6213 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6214 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6215 Src.getOperand(1), Src.getOperand(2));
6216 }
6217 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6218 // (ballot 0) -> 0
6219 if (Arg->isZero())
6220 return DAG.getConstant(0, SL, VT);
6221
6222 // (ballot 1) -> EXEC/EXEC_LO
6223 if (Arg->isOne()) {
6224 Register Exec;
6225 if (VT.getScalarSizeInBits() == 32)
6226 Exec = AMDGPU::EXEC_LO;
6227 else if (VT.getScalarSizeInBits() == 64)
6228 Exec = AMDGPU::EXEC;
6229 else
6230 return SDValue();
6231
6232 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6233 }
6234 }
6235
6236 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6237 // ISD::SETNE)
6238 return DAG.getNode(
6239 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6240 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6241}
6242
6244 SelectionDAG &DAG) {
6245 EVT VT = N->getValueType(0);
6246 unsigned ValSize = VT.getSizeInBits();
6247 unsigned IID = N->getConstantOperandVal(0);
6248 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6249 IID == Intrinsic::amdgcn_permlanex16;
6250 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6251 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6252 SDLoc SL(N);
6253 MVT IntVT = MVT::getIntegerVT(ValSize);
6254 const GCNSubtarget *ST = TLI.getSubtarget();
6255 unsigned SplitSize = 32;
6256 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6257 ST->hasDPALU_DPP() &&
6258 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6259 SplitSize = 64;
6260
6261 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6262 SDValue Src2, MVT ValT) -> SDValue {
6264 switch (IID) {
6265 case Intrinsic::amdgcn_permlane16:
6266 case Intrinsic::amdgcn_permlanex16:
6267 case Intrinsic::amdgcn_update_dpp:
6268 Operands.push_back(N->getOperand(6));
6269 Operands.push_back(N->getOperand(5));
6270 Operands.push_back(N->getOperand(4));
6271 [[fallthrough]];
6272 case Intrinsic::amdgcn_writelane:
6273 Operands.push_back(Src2);
6274 [[fallthrough]];
6275 case Intrinsic::amdgcn_readlane:
6276 case Intrinsic::amdgcn_set_inactive:
6277 case Intrinsic::amdgcn_set_inactive_chain_arg:
6278 case Intrinsic::amdgcn_mov_dpp8:
6279 Operands.push_back(Src1);
6280 [[fallthrough]];
6281 case Intrinsic::amdgcn_readfirstlane:
6282 case Intrinsic::amdgcn_permlane64:
6283 Operands.push_back(Src0);
6284 break;
6285 default:
6286 llvm_unreachable("unhandled lane op");
6287 }
6288
6289 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6290 std::reverse(Operands.begin(), Operands.end());
6291
6292 if (SDNode *GL = N->getGluedNode()) {
6293 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6294 GL = GL->getOperand(0).getNode();
6295 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6296 SDValue(GL, 0)));
6297 }
6298
6299 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6300 };
6301
6302 SDValue Src0 = N->getOperand(1);
6303 SDValue Src1, Src2;
6304 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6305 IID == Intrinsic::amdgcn_mov_dpp8 ||
6306 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6307 Src1 = N->getOperand(2);
6308 if (IID == Intrinsic::amdgcn_writelane ||
6309 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6310 Src2 = N->getOperand(3);
6311 }
6312
6313 if (ValSize == SplitSize) {
6314 // Already legal
6315 return SDValue();
6316 }
6317
6318 if (ValSize < 32) {
6319 bool IsFloat = VT.isFloatingPoint();
6320 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6321 SL, MVT::i32);
6322
6323 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6324 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6325 SL, MVT::i32);
6326 }
6327
6328 if (IID == Intrinsic::amdgcn_writelane) {
6329 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6330 SL, MVT::i32);
6331 }
6332
6333 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6334 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6335 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6336 }
6337
6338 if (ValSize % SplitSize != 0)
6339 return SDValue();
6340
6341 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6342 EVT VT = N->getValueType(0);
6343 unsigned NE = VT.getVectorNumElements();
6344 EVT EltVT = VT.getVectorElementType();
6346 unsigned NumOperands = N->getNumOperands();
6347 SmallVector<SDValue, 4> Operands(NumOperands);
6348 SDNode *GL = N->getGluedNode();
6349
6350 // only handle convergencectrl_glue
6352
6353 for (unsigned i = 0; i != NE; ++i) {
6354 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6355 ++j) {
6356 SDValue Operand = N->getOperand(j);
6357 EVT OperandVT = Operand.getValueType();
6358 if (OperandVT.isVector()) {
6359 // A vector operand; extract a single element.
6360 EVT OperandEltVT = OperandVT.getVectorElementType();
6361 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6362 Operand, DAG.getVectorIdxConstant(i, SL));
6363 } else {
6364 // A scalar operand; just use it as is.
6365 Operands[j] = Operand;
6366 }
6367 }
6368
6369 if (GL)
6370 Operands[NumOperands - 1] =
6371 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6372 SDValue(GL->getOperand(0).getNode(), 0));
6373
6374 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6375 }
6376
6377 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6378 return DAG.getBuildVector(VecVT, SL, Scalars);
6379 };
6380
6381 if (VT.isVector()) {
6382 switch (MVT::SimpleValueType EltTy =
6384 case MVT::i32:
6385 case MVT::f32:
6386 if (SplitSize == 32) {
6387 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6388 return unrollLaneOp(LaneOp.getNode());
6389 }
6390 [[fallthrough]];
6391 case MVT::i16:
6392 case MVT::f16:
6393 case MVT::bf16: {
6394 unsigned SubVecNumElt =
6395 SplitSize / VT.getVectorElementType().getSizeInBits();
6396 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6398 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6399 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6400 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6401 DAG.getConstant(EltIdx, SL, MVT::i32));
6402
6403 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6404 IsPermLane16)
6405 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6406 DAG.getConstant(EltIdx, SL, MVT::i32));
6407
6408 if (IID == Intrinsic::amdgcn_writelane)
6409 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6410 DAG.getConstant(EltIdx, SL, MVT::i32));
6411
6412 Pieces.push_back(
6413 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6414 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6415 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6416 EltIdx += SubVecNumElt;
6417 }
6418 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6419 }
6420 default:
6421 // Handle all other cases by bitcasting to i32 vectors
6422 break;
6423 }
6424 }
6425
6426 MVT VecVT =
6427 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6428 Src0 = DAG.getBitcast(VecVT, Src0);
6429
6430 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6431 Src1 = DAG.getBitcast(VecVT, Src1);
6432
6433 if (IID == Intrinsic::amdgcn_writelane)
6434 Src2 = DAG.getBitcast(VecVT, Src2);
6435
6436 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6437 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6438 return DAG.getBitcast(VT, UnrolledLaneOp);
6439}
6440
6443 SelectionDAG &DAG) const {
6444 switch (N->getOpcode()) {
6446 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6447 Results.push_back(Res);
6448 return;
6449 }
6451 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6452 Results.push_back(Res);
6453 return;
6454 }
6456 unsigned IID = N->getConstantOperandVal(0);
6457 switch (IID) {
6458 case Intrinsic::amdgcn_make_buffer_rsrc:
6459 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6460 return;
6461 case Intrinsic::amdgcn_cvt_pkrtz: {
6462 SDValue Src0 = N->getOperand(1);
6463 SDValue Src1 = N->getOperand(2);
6464 SDLoc SL(N);
6465 SDValue Cvt =
6466 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6467 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6468 return;
6469 }
6470 case Intrinsic::amdgcn_cvt_pknorm_i16:
6471 case Intrinsic::amdgcn_cvt_pknorm_u16:
6472 case Intrinsic::amdgcn_cvt_pk_i16:
6473 case Intrinsic::amdgcn_cvt_pk_u16: {
6474 SDValue Src0 = N->getOperand(1);
6475 SDValue Src1 = N->getOperand(2);
6476 SDLoc SL(N);
6477 unsigned Opcode;
6478
6479 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6481 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6483 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6485 else
6487
6488 EVT VT = N->getValueType(0);
6489 if (isTypeLegal(VT))
6490 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6491 else {
6492 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6493 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6494 }
6495 return;
6496 }
6497 case Intrinsic::amdgcn_s_buffer_load: {
6498 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6499 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6500 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6501 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6502 // s_buffer_load_i8.
6503 if (!Subtarget->hasScalarSubwordLoads())
6504 return;
6505 SDValue Op = SDValue(N, 0);
6506 SDValue Rsrc = Op.getOperand(1);
6507 SDValue Offset = Op.getOperand(2);
6508 SDValue CachePolicy = Op.getOperand(3);
6509 EVT VT = Op.getValueType();
6510 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6511 SDLoc DL(Op);
6513 const DataLayout &DataLayout = DAG.getDataLayout();
6514 Align Alignment =
6520 VT.getStoreSize(), Alignment);
6521 SDValue LoadVal;
6522 if (!Offset->isDivergent()) {
6523 SDValue Ops[] = {Rsrc, // source register
6524 Offset, CachePolicy};
6525 SDValue BufferLoad =
6527 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6528 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6529 } else {
6530 SDValue Ops[] = {
6531 DAG.getEntryNode(), // Chain
6532 Rsrc, // rsrc
6533 DAG.getConstant(0, DL, MVT::i32), // vindex
6534 {}, // voffset
6535 {}, // soffset
6536 {}, // offset
6537 CachePolicy, // cachepolicy
6538 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6539 };
6540 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6541 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6542 }
6543 Results.push_back(LoadVal);
6544 return;
6545 }
6546 }
6547 break;
6548 }
6550 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6551 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6552 // FIXME: Hacky
6553 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6554 Results.push_back(Res.getOperand(I));
6555 }
6556 } else {
6557 Results.push_back(Res);
6558 Results.push_back(Res.getValue(1));
6559 }
6560 return;
6561 }
6562
6563 break;
6564 }
6565 case ISD::SELECT: {
6566 SDLoc SL(N);
6567 EVT VT = N->getValueType(0);
6568 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6569 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6570 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6571
6572 EVT SelectVT = NewVT;
6573 if (NewVT.bitsLT(MVT::i32)) {
6574 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6575 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6576 SelectVT = MVT::i32;
6577 }
6578
6579 SDValue NewSelect =
6580 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6581
6582 if (NewVT != SelectVT)
6583 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6584 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6585 return;
6586 }
6587 case ISD::FNEG: {
6588 if (N->getValueType(0) != MVT::v2f16)
6589 break;
6590
6591 SDLoc SL(N);
6592 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6593
6594 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6595 DAG.getConstant(0x80008000, SL, MVT::i32));
6596 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6597 return;
6598 }
6599 case ISD::FABS: {
6600 if (N->getValueType(0) != MVT::v2f16)
6601 break;
6602
6603 SDLoc SL(N);
6604 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6605
6606 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6607 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6608 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6609 return;
6610 }
6611 case ISD::FSQRT: {
6612 if (N->getValueType(0) != MVT::f16)
6613 break;
6614 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6615 break;
6616 }
6617 default:
6619 break;
6620 }
6621}
6622
6623/// Helper function for LowerBRCOND
6624static SDNode *findUser(SDValue Value, unsigned Opcode) {
6625
6626 for (SDUse &U : Value->uses()) {
6627 if (U.get() != Value)
6628 continue;
6629
6630 if (U.getUser()->getOpcode() == Opcode)
6631 return U.getUser();
6632 }
6633 return nullptr;
6634}
6635
6636unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6637 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6638 switch (Intr->getConstantOperandVal(1)) {
6639 case Intrinsic::amdgcn_if:
6640 return AMDGPUISD::IF;
6641 case Intrinsic::amdgcn_else:
6642 return AMDGPUISD::ELSE;
6643 case Intrinsic::amdgcn_loop:
6644 return AMDGPUISD::LOOP;
6645 case Intrinsic::amdgcn_end_cf:
6646 llvm_unreachable("should not occur");
6647 default:
6648 return 0;
6649 }
6650 }
6651
6652 // break, if_break, else_break are all only used as inputs to loop, not
6653 // directly as branch conditions.
6654 return 0;
6655}
6656
6658 const Triple &TT = getTargetMachine().getTargetTriple();
6662}
6663
6665 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6666 return false;
6667
6668 // FIXME: Either avoid relying on address space here or change the default
6669 // address space for functions to avoid the explicit check.
6670 return (GV->getValueType()->isFunctionTy() ||
6673}
6674
6676 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6677}
6678
6680 if (!GV->hasExternalLinkage())
6681 return true;
6682
6683 const auto OS = getTargetMachine().getTargetTriple().getOS();
6684 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6685}
6686
6687/// This transforms the control flow intrinsics to get the branch destination as
6688/// last parameter, also switches branch target with BR if the need arise
6689SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6690 SDLoc DL(BRCOND);
6691
6692 SDNode *Intr = BRCOND.getOperand(1).getNode();
6693 SDValue Target = BRCOND.getOperand(2);
6694 SDNode *BR = nullptr;
6695 SDNode *SetCC = nullptr;
6696
6697 if (Intr->getOpcode() == ISD::SETCC) {
6698 // As long as we negate the condition everything is fine
6699 SetCC = Intr;
6700 Intr = SetCC->getOperand(0).getNode();
6701
6702 } else {
6703 // Get the target from BR if we don't negate the condition
6704 BR = findUser(BRCOND, ISD::BR);
6705 assert(BR && "brcond missing unconditional branch user");
6706 Target = BR->getOperand(1);
6707 }
6708
6709 unsigned CFNode = isCFIntrinsic(Intr);
6710 if (CFNode == 0) {
6711 // This is a uniform branch so we don't need to legalize.
6712 return BRCOND;
6713 }
6714
6715 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6716 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6717
6718 assert(!SetCC ||
6719 (SetCC->getConstantOperandVal(1) == 1 &&
6720 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6721 ISD::SETNE));
6722
6723 // operands of the new intrinsic call
6725 if (HaveChain)
6726 Ops.push_back(BRCOND.getOperand(0));
6727
6728 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6729 Ops.push_back(Target);
6730
6731 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6732
6733 // build the new intrinsic call
6734 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6735
6736 if (!HaveChain) {
6737 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6738
6739 Result = DAG.getMergeValues(Ops, DL).getNode();
6740 }
6741
6742 if (BR) {
6743 // Give the branch instruction our target
6744 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6745 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6746 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6747 }
6748
6749 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6750
6751 // Copy the intrinsic results to registers
6752 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6754 if (!CopyToReg)
6755 continue;
6756
6757 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6758 SDValue(Result, i - 1), SDValue());
6759
6760 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6761 }
6762
6763 // Remove the old intrinsic from the chain
6764 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6765 Intr->getOperand(0));
6766
6767 return Chain;
6768}
6769
6770SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6771 MVT VT = Op.getSimpleValueType();
6772 SDLoc DL(Op);
6773 // Checking the depth
6774 if (Op.getConstantOperandVal(0) != 0)
6775 return DAG.getConstant(0, DL, VT);
6776
6779 // Check for kernel and shader functions
6780 if (Info->isEntryFunction())
6781 return DAG.getConstant(0, DL, VT);
6782
6783 MachineFrameInfo &MFI = MF.getFrameInfo();
6784 // There is a call to @llvm.returnaddress in this function
6785 MFI.setReturnAddressIsTaken(true);
6786
6788 // Get the return address reg and mark it as an implicit live-in
6789 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6790 getRegClassFor(VT, Op.getNode()->isDivergent()));
6791
6792 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6793}
6794
6795SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6796 const SDLoc &DL, EVT VT) const {
6797 return Op.getValueType().bitsLE(VT)
6798 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6799 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6800 DAG.getTargetConstant(0, DL, MVT::i32));
6801}
6802
6803SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6804 assert(Op.getValueType() == MVT::f16 &&
6805 "Do not know how to custom lower FP_ROUND for non-f16 type");
6806
6807 SDValue Src = Op.getOperand(0);
6808 EVT SrcVT = Src.getValueType();
6809 if (SrcVT != MVT::f64)
6810 return Op;
6811
6812 // TODO: Handle strictfp
6813 if (Op.getOpcode() != ISD::FP_ROUND)
6814 return Op;
6815
6816 SDLoc DL(Op);
6817
6818 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6819 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6820 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6821}
6822
6823SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6824 SelectionDAG &DAG) const {
6825 EVT VT = Op.getValueType();
6826 const MachineFunction &MF = DAG.getMachineFunction();
6828 bool IsIEEEMode = Info->getMode().IEEE;
6829
6830 // FIXME: Assert during selection that this is only selected for
6831 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6832 // mode functions, but this happens to be OK since it's only done in cases
6833 // where there is known no sNaN.
6834 if (IsIEEEMode)
6835 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6836
6837 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6838 VT == MVT::v16bf16)
6839 return splitBinaryVectorOp(Op, DAG);
6840 return Op;
6841}
6842
6843SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6844 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6845 EVT VT = Op.getValueType();
6846 assert(VT == MVT::f16);
6847
6848 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6849 EVT ExpVT = Exp.getValueType();
6850 if (ExpVT == MVT::i16)
6851 return Op;
6852
6853 SDLoc DL(Op);
6854
6855 // Correct the exponent type for f16 to i16.
6856 // Clamp the range of the exponent to the instruction's range.
6857
6858 // TODO: This should be a generic narrowing legalization, and can easily be
6859 // for GlobalISel.
6860
6861 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6862 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6863
6864 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6865 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6866
6867 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6868
6869 if (IsStrict) {
6870 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6871 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6872 }
6873
6874 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6875}
6876
6878 switch (Op->getOpcode()) {
6879 case ISD::SRA:
6880 case ISD::SMIN:
6881 case ISD::SMAX:
6882 return ISD::SIGN_EXTEND;
6883 case ISD::SRL:
6884 case ISD::UMIN:
6885 case ISD::UMAX:
6886 return ISD::ZERO_EXTEND;
6887 case ISD::ADD:
6888 case ISD::SUB:
6889 case ISD::AND:
6890 case ISD::OR:
6891 case ISD::XOR:
6892 case ISD::SHL:
6893 case ISD::SELECT:
6894 case ISD::MUL:
6895 // operation result won't be influenced by garbage high bits.
6896 // TODO: are all of those cases correct, and are there more?
6897 return ISD::ANY_EXTEND;
6898 case ISD::SETCC: {
6899 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6901 }
6902 default:
6903 llvm_unreachable("unexpected opcode!");
6904 }
6905}
6906
// Promote a narrow (sub-32-bit) operation to i32 so it can execute on the
// scalar unit: extend both operands to i32 with an opcode-appropriate
// extension, rebuild the operation at the wider type, and truncate the result
// back. SETCC needs no truncation since it produces i1 (or an i1 vector)
// either way.
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);

  // For SETCC, the interesting type is the compared operands' type, not the
  // i1 result type.
  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
                                 : Op->getOperand(0).getValueType();
  auto ExtTy = OpTy.changeElementType(MVT::i32);

  // Bail out before op legalization, or when the target reports that staying
  // at the narrow type is the profitable choice.
  if (DCI.isBeforeLegalizeOps() ||
      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
    return SDValue();

  auto &DAG = DCI.DAG;

  SDLoc DL(Op);
  SDValue LHS;
  SDValue RHS;
  // SELECT's data operands are 1 and 2; operand 0 is the condition and is
  // not promoted.
  if (Opc == ISD::SELECT) {
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
  } else {
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  }

  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});

  // Special case: for shifts, the RHS always needs a zext.
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
  else
    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});

  // setcc always return i1/i1 vec so no need to truncate after.
  if (Opc == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
  }

  // For other ops, we extend the operation's return type as well so we need to
  // truncate back to the original type.
  SDValue NewVal;
  if (Opc == ISD::SELECT)
    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
  else
    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});

  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
}
6962
// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  //    multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  //    operands are zero-extended/sign-extended from 32-bits, then we split the
  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  //    possible to check if the operands are zero-extended or sign-extended in
  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  //    If the cost model decides that we have to use vector registers, then
  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  //    SIInstrInfo.cpp .

  // Divergent case (2. above) is handled by the default expansion.
  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  // 33 sign bits on a 64-bit value means the value fits in 32 bits when
  // sign-extended, so a signed 32x32->64 multiply is exact.
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
7022
// Lower [SU]MULO: produce the low product plus an explicitly computed
// overflow bit, merged as a two-result node.
SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  bool isSigned = Op.getOpcode() == ISD::SMULO;

  if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
    const APInt &C = RHSC->getAPIntValue();
    // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
    if (C.isPowerOf2()) {
      // smulo(x, signed_min) is same as umulo(x, signed_min).
      bool UseArithShift = isSigned && !C.isMinSignedValue();
      SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
      SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
      // Overflow iff shifting the product back down fails to recover LHS.
      SDValue Overflow =
          DAG.getSetCC(SL, MVT::i1,
                       DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
                                   Result, ShiftAmt),
                       LHS, ISD::SETNE);
      return DAG.getMergeValues({Result, Overflow}, SL);
    }
  }

  // General case: compute low and high halves of the full-width product.
  SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
  SDValue Top =
      DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);

  // No overflow iff the high half equals the sign-extension of the low half
  // (signed) or zero (unsigned).
  SDValue Sign = isSigned
                     ? DAG.getNode(ISD::SRA, SL, VT, Result,
                                   DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                                   SL, MVT::i32))
                     : DAG.getConstant(0, SL, VT);
  SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);

  return DAG.getMergeValues({Result, Overflow}, SL);
}
7060
7061SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7062 if (Op->isDivergent()) {
7063 // Select to V_MAD_[IU]64_[IU]32.
7064 return Op;
7065 }
7066 if (Subtarget->hasSMulHi()) {
7067 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7068 return SDValue();
7069 }
7070 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7071 // calculate the high part, so we might as well do the whole thing with
7072 // V_MAD_[IU]64_[IU]32.
7073 return Op;
7074}
7075
// Lower ISD::TRAP. Without an enabled trap handler the wave is simply
// terminated (lowerTrapEndpgm). Otherwise use the HSA trap lowering: the
// doorbell-ID-capable path (lowerTrapHsa) or the queue-pointer path
// (lowerTrapHsaQueuePtr).
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  if (!Subtarget->isTrapHandlerEnabled() ||
    return lowerTrapEndpgm(Op, DAG);

  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);
}
7084
7085SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7086 SDLoc SL(Op);
7087 SDValue Chain = Op.getOperand(0);
7088 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7089}
7090
// Load an implicit kernel argument (identified by \p Param) of type \p VT
// from the implicit kernarg segment. The load is anchored on the entry token.
SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             const SDLoc &DL, Align Alignment,
                                             ImplicitParameter Param) const {
  // Pointer into the kernarg segment at the parameter's offset.
  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
}
7103
// HSA trap lowering for targets without direct doorbell-ID access: the trap
// handler ABI requires the queue pointer to be live in SGPR0_SGPR1 when the
// trap instruction executes.
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    // Pre-V5: the queue pointer arrives in a user SGPR pair.
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // Copy the queue pointer into SGPR0_SGPR1 and glue it to the trap so the
  // copy cannot be separated from the trap instruction.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
7139
// HSA trap lowering for targets that can fetch the doorbell ID themselves:
// emit the trap directly (no queue pointer setup needed).
SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
  if (Subtarget->hasPrivEnabledTrap2NopBug())
    return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);

  SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
7153
// Lower ISD::DEBUGTRAP. When no trap handler is available this cannot be
// honored, so a diagnostic is emitted and the op degrades to a no-op
// (returning just the chain).
SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  if (!Subtarget->isTrapHandlerEnabled() ||
                                  "debugtrap handler not supported",
                                  Op.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = MF.getFunction().getContext();
    // Warn rather than error: dropping a debugtrap is survivable.
    Ctx.diagnose(NoTrap);
    return Chain;
  }

  uint64_t TrapID =
  SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
7174
// Return the high 32 bits of the flat aperture base for the LOCAL or PRIVATE
// address space \p AS, as an i32. Sourced from the aperture registers when
// available, from the implicit kernargs on code object V5+, or from the
// amd_queue_t structure via the queue pointer otherwise.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, directly emit a 64 bit mov from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    //    s_mov_b64 s[6:7], src_shared_base
    //    v_mov_b32_e32 v1, s7
    //
    // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister directly (instead of extracting the HI 32 bits) which is an
    // artificial (unusable) register.
    //  Register TableGen definitions would need an overhaul to get rid of the
    //  artificial "HI" aperture registers and prevent this kind of issue from
    //  happening.
    SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
                                     DAG.getRegister(ApertureRegNo, MVT::i64));
    return DAG.getNode(
        ISD::TRUNCATE, DL, MVT::i32,
        DAG.getNode(ISD::SRL, DL, MVT::i64,
                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getUNDEF(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
}
7243
/// Return true if the value is a known valid address, such that a null check is
/// not necessary.
/// Frame indexes, global addresses, and basic-block addresses are always
/// valid; an integer constant is non-null when it differs from the address
/// space's null value. Everything else conservatively returns false.
                          const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))
    return true;

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  // TODO: Search through arithmetic, handle arguments and loads
  // marked nonnull.
  return false;
}
7259
// Lower ADDRSPACECAST (and the amdgcn.addrspacecast.nonnull intrinsic).
// Handles flat <-> local/private (with null-pointer remapping unless the
// source is known non-null) and 32-bit-constant <-> 64-bit address
// conversions; anything else is diagnosed as invalid.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    // The nonnull intrinsic carries the address spaces as constant operands
    // and guarantees the pointer is not null.
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // The segment pointer is the low 32 bits of the flat pointer.
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      // Flat null must map to the segment's null value, so select between
      // the truncated pointer and segment null.
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {

      // Combine the 32-bit segment offset (low half) with the segment's
      // aperture base (high half) into a 64-bit flat pointer.
      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
      SDValue CvtPtr =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(Op->getValueType(0));
}
7350
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0); // Destination (big) vector.
  SDValue Ins = Op.getOperand(1); // Subvector being inserted.
  SDValue Idx = Op.getOperand(2); // Insertion position (constant).
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // For 16-bit elements at an even (register-aligned) offset, operate on
  // bitcast i32 vectors so each insert moves a whole 32-bit register.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                              MVT::i32, InsNumElts / 2);

    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      // A 2-element source becomes a single i32 scalar after the bitcast.
      if (InsNumElts == 2) {
        Elt = Ins;
      } else {
        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
                          DAG.getConstant(I, SL, MVT::i32));
      }
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
    }

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  }

  // Generic path: move the inserted elements across one at a time.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }
  return Vec;
}
7405
// Lower INSERT_VECTOR_ELT for small (<= 64-bit) vectors. Dynamic indices are
// lowered to a bitfield-insert style mask/shift sequence instead of a stack
// slot; static indices are left for ordinary selection (except a specially
// handled v4i16 case).
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // Split into two v2i16 halves, insert only into the affected half, then
    // reassemble so the untouched half is passed through unchanged.
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat =
        InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
                 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
  // BFM: element-wide mask shifted to the target element's bit position.
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
                               DAG.getSplatBuildVector(VecVT, SL, InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue RHS =
      DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI =
      DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);

  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
7485
// Lower EXTRACT_VECTOR_ELT. Large vectors (128/256/512 bits) are recursively
// split in half and the correct half selected by the index; small (<= 64-bit)
// vectors are lowered to a shift-and-truncate of the bitcast integer.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    // Build Lo/Hi halves via i64 piece extraction so the split works for any
    // element type of this total size.
    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    // Select the half containing the element and extract with the index
    // reduced modulo the half's length (NElem is a power of two).
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the desired element down to bit 0 of the bitcast integer.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
7587
7588static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7589 assert(Elt % 2 == 0);
7590 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7591}
7592
7593static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7594 assert(Elt % 2 == 0);
7595 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7596 !(Mask[Elt + 1] & 1);
7597}
7598
// Lower VECTOR_SHUFFLE by breaking it into 2-element pieces, preferring
// subvector extracts (and small sub-shuffles) over full scalarization so the
// pieces map onto register-sized copies.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                   vector_shuffle <1, 0> (extract_subvector rhs, 2)

  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (ShouldUseConsecutiveExtract &&
      // Consecutive pair: take a whole 2-element subvector from one source.
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
                                   SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
      // Odd-to-even pair: extract the aligned neighbor subvectors and form a
      // tiny 2-element shuffle on them.
      int Idx0 = SVN->getMaskElt(I);
      int Idx1 = SVN->getMaskElt(I + 1);

      SDValue SrcOp0 = SVN->getOperand(0);
      SDValue SrcOp1 = SrcOp0;
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
                                    DAG.getConstant(AlignedIdx0, SL, MVT::i32));
      SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
                                    DAG.getConstant(AlignedIdx1, SL, MVT::i32));

      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        // Two distinct sources: the second mask index selects from operand 1.
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        Result1 = DAG.getUNDEF(PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
                                          {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Shuf);
    } else {
      // Fallback: scalarize this pair via two element extracts.
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
                                 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
                                 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
7717
// Lower SCALAR_TO_VECTOR by building a vector whose lane 0 is the scalar and
// whose remaining lanes are undef, avoiding the default stack-based
// expansion.
SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue SVal = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();
  EVT SValVT = SVal.getValueType();
  SDValue UndefVal = DAG.getUNDEF(SValVT);
  SDLoc SL(Op);

  // Element 0 is the scalar; everything else is undef.
  VElts.push_back(SVal);
  for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
    VElts.push_back(UndefVal);

  return DAG.getBuildVector(ResultVT, SL, VElts);
}
7733
// Lower BUILD_VECTOR. Two-element 16-bit vectors are packed into a single
// i32 with shift/or (skipping work for undef halves); larger vectors are
// built as a vector of packed 2-element chunks and bitcast to the result
// type.
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

    SDValue Lo = Op.getOperand(0);
    SDValue Hi = Op.getOperand(1);

    // Avoid adding defined bits with the zero_extend.
    if (Hi.isUndef()) {
      Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
      SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
      return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
    }

    Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
    Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

    // Place the high element in bits [31:16].
    SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                                DAG.getConstant(16, SL, MVT::i32));
    if (Lo.isUndef())
      return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

    // The two halves occupy disjoint bit ranges, hence the Disjoint flag.
    SDValue Or =
        DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  }

  // Split into 2-element chunks.
  const unsigned NumParts = VT.getVectorNumElements() / 2;
  MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());

  for (unsigned P = 0; P < NumParts; ++P) {
    SDValue Vec = DAG.getBuildVector(
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
    Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
  }

  SDValue Blend =
      DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
7784
// NOTE(review): the first line of this definition (the function name,
// presumably bool SITargetLowering::isOffsetFoldingLegal) was dropped by the
// extraction; only the trailing parameter remains below.
7786 const GlobalAddressSDNode *GA) const {
7787 // OSes that use ELF REL relocations (instead of RELA) can only store a
7788 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7789 // which can create arbitrary 64-bit addends. (This is only a problem for
7790 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7791 // the high 32 bits of the addend.)
7792 //
7793 // This should be kept in sync with how HasRelocationAddend is initialized in
7794 // the constructor of ELFAMDGPUAsmBackend.
7795 if (!Subtarget->isAmdHsaOS())
7796 return false;
7797
7798 // We can fold offsets for anything that doesn't require a GOT relocation.
// NOTE(review): the remainder of this return expression (lines testing
// further address spaces / linkage) was dropped by the extraction.
7799 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7803}
7804
// Build a PC_ADD_REL_OFFSET node addressing \p GV. With MO_NONE the high
// operand is a zero constant (32-bit pc-relative fixup); otherwise the paired
// @...@lo/@...@hi relocation flags (GAFlags, GAFlags + 1) are used.
// NOTE(review): the line naming this function and its first parameters
// (presumably buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue
// *GV,) was dropped by the extraction.
static SDValue
7807 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7808 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7809 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7810 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7811 // lowered to the following code sequence:
7812 //
7813 // For constant address space:
7814 // s_getpc_b64 s[0:1]
7815 // s_add_u32 s0, s0, $symbol
7816 // s_addc_u32 s1, s1, 0
7817 //
7818 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7819 // a fixup or relocation is emitted to replace $symbol with a literal
7820 // constant, which is a pc-relative offset from the encoding of the $symbol
7821 // operand to the global variable.
7822 //
7823 // For global address space:
7824 // s_getpc_b64 s[0:1]
7825 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7826 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7827 //
7828 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7829 // fixups or relocations are emitted to replace $symbol@*@lo and
7830 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7831 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7832 // operand to the global variable.
7833 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7834 SDValue PtrHi;
7835 if (GAFlags == SIInstrInfo::MO_NONE)
7836 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7837 else
// The @hi relocation flag is defined to be the @lo flag plus one.
7838 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7839 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7840}
7841
// Lower a GlobalAddress node. Handles dynamic-LDS externs, LDS globals,
// PAL/Mesa absolute 32-bit addressing, pc-relative fixups/relocs, and
// finally a GOT load for everything else.
// NOTE(review): several interior lines of this function were dropped by the
// extraction (marked inline below); the visible code is byte-identical to
// what survived.
7842SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7843 SDValue Op,
7844 SelectionDAG &DAG) const {
7845 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7846 SDLoc DL(GSD);
7847 EVT PtrVT = Op.getValueType();
7848
7849 const GlobalValue *GV = GSD->getGlobal();
// NOTE(review): the enclosing condition (lines 7850-7854, presumably testing
// for an LDS address space and external linkage) was dropped here.
7855 GV->hasExternalLinkage()) {
7856 Type *Ty = GV->getValueType();
7857 // HIP uses an unsized array `extern __shared__ T s[]` or similar
7858 // zero-sized type in other languages to declare the dynamic shared
7859 // memory which size is not known at the compile time. They will be
7860 // allocated by the runtime and placed directly after the static
7861 // allocated ones. They all share the same offset.
7862 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7863 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7864 // Adjust alignment for that dynamic shared memory array.
// NOTE(review): line 7865 (presumably fetching the current Function &F) was
// dropped here.
7866 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7867 MFI->setUsesDynamicLDS(true);
7868 return SDValue(
7869 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7870 }
7871 }
7873 }
7874
// NOTE(review): the condition guarding this LDS lowering (line 7875) was
// dropped here.
7876 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7878 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7879 }
7880
7881 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
// PAL/Mesa: materialize the absolute 32-bit halves with S_MOV_B32 and pair
// them into a 64-bit pointer.
7882 SDValue AddrLo = DAG.getTargetGlobalAddress(
7883 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7884 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7885
7886 SDValue AddrHi = DAG.getTargetGlobalAddress(
7887 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7888 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7889
7890 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7891 }
7892
7893 if (shouldEmitFixup(GV))
7894 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7895
7896 if (shouldEmitPCReloc(GV))
7897 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7899
// Fall back to loading the address from the GOT.
7900 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7902 PointerType *PtrTy =
7904 const DataLayout &DataLayout = DAG.getDataLayout();
7905 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7906 MachinePointerInfo PtrInfo =
7908
7909 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7912}
7913
// Copy \p V into the m0 register via the SI_INIT_M0 pseudo, returning the
// chain result. NOTE(review): the line naming this function (presumably
// SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,) was
// dropped by the extraction.
7915 const SDLoc &DL, SDValue V) const {
7916 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7917 // the destination register.
7918 //
7919 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7920 // so we will end up with redundant moves to m0.
7921 //
7922 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7923
7924 // A Null SDValue creates a glue result.
7925 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7926 V, Chain);
7927 return SDValue(M0, 0);
7928}
7929
7930SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7931 MVT VT,
7932 unsigned Offset) const {
7933 SDLoc SL(Op);
7934 SDValue Param = lowerKernargMemParameter(
7935 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7936 // The local size values will have the hi 16-bits as zero.
7937 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7938 DAG.getValueType(VT));
7939}
7940
// Diagnose use of a non-HSA intrinsic on an HSA target and return undef of
// the requested type so lowering can continue. NOTE(review): the lines naming
// this function and constructing the DiagnosticInfoUnsupported (presumably
// lines 7941 and 7943) were dropped by the extraction.
7942 EVT VT) {
7944 "non-hsa intrinsic with hsa target",
7945 DL.getDebugLoc());
7946 DAG.getContext()->diagnose(BadIntrin);
7947 return DAG.getUNDEF(VT);
7948}
7949
// Diagnose use of an intrinsic that the current subtarget does not support
// and return undef of the requested type. NOTE(review): the lines naming this
// function and constructing the DiagnosticInfoUnsupported (presumably lines
// 7950 and 7952) were dropped by the extraction.
7951 EVT VT) {
7953 "intrinsic not supported on subtarget",
7954 DL.getDebugLoc());
7955 DAG.getContext()->diagnose(BadIntrin);
7956 return DAG.getUNDEF(VT);
7957}
7958
// Bitcast each element to f32 and assemble them into an f32 vector, padding
// with undef up to the chosen width (rounded to 16 lanes when more than 12
// elements are supplied). A single element is returned as-is.
// NOTE(review): the line naming this function (presumably static SDValue
// getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,) was dropped by the
// extraction.
7960 ArrayRef<SDValue> Elts) {
7961 assert(!Elts.empty());
7962 MVT Type;
7963 unsigned NumElts = Elts.size();
7964
7965 if (NumElts <= 12) {
7966 Type = MVT::getVectorVT(MVT::f32, NumElts);
7967 } else {
// Anything above 12 elements is widened to the 16-lane vector type.
7968 assert(Elts.size() <= 16);
7969 Type = MVT::v16f32;
7970 NumElts = 16;
7971 }
7972
7973 SmallVector<SDValue, 16> VecElts(NumElts);
7974 for (unsigned i = 0; i < Elts.size(); ++i) {
7975 SDValue Elt = Elts[i];
7976 if (Elt.getValueType() != MVT::f32)
7977 Elt = DAG.getBitcast(MVT::f32, Elt);
7978 VecElts[i] = Elt;
7979 }
// Pad the remaining lanes with undef.
7980 for (unsigned i = Elts.size(); i < NumElts; ++i)
7981 VecElts[i] = DAG.getUNDEF(MVT::f32);
7982
7983 if (NumElts == 1)
7984 return VecElts[0];
7985 return DAG.getBuildVector(Type, DL, VecElts);
7986}
7987
// Widen \p Src (scalar or vector) to \p CastVT by appending \p ExtraElts
// undef elements of the source's scalar type.
7988static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7989 SDValue Src, int ExtraElts) {
7990 EVT SrcVT = Src.getValueType();
7991
// NOTE(review): the declaration of Elts (presumably a SmallVector<SDValue>)
// was dropped by the extraction.
7994 if (SrcVT.isVector())
7995 DAG.ExtractVectorElements(Src, Elts);
7996 else
7997 Elts.push_back(Src);
7998
7999 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
8000 while (ExtraElts--)
8001 Elts.push_back(Undef);
8002
8003 return DAG.getBuildVector(CastVT, DL, Elts);
8004}
8005
8006// Re-construct the required return value for a image load intrinsic.
8007// This is more complicated due to the optional use TexFailCtrl which means the
8008// required return type is an aggregate
// NOTE(review): the line naming this function (presumably static SDValue
// constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,) was dropped by
// the extraction.
8010 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8011 bool Unpacked, bool IsD16, int DMaskPop,
8012 int NumVDataDwords, bool IsAtomicPacked16Bit,
8013 const SDLoc &DL) {
8014 // Determine the required return type. This is the same regardless of
8015 // IsTexFail flag
8016 EVT ReqRetVT = ResultTypes[0];
8017 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
// Packed D16 (and packed-16-bit atomics) hold two elements per dword.
8018 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8019 ? (ReqRetNumElts + 1) / 2
8020 : ReqRetNumElts;
8021
8022 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8023
8024 MVT DataDwordVT =
8025 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8026
8027 MVT MaskPopVT =
8028 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8029
8030 SDValue Data(Result, 0);
8031 SDValue TexFail;
8032
// Trim the raw instruction result down to the dwords selected by the dmask.
8033 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8034 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8035 if (MaskPopVT.isVector()) {
8036 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8037 SDValue(Result, 0), ZeroIdx);
8038 } else {
8039 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8040 SDValue(Result, 0), ZeroIdx);
8041 }
8042 }
8043
8044 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8045 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8046 NumDataDwords - MaskPopDwords);
8047
8048 if (IsD16)
8049 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8050
8051 EVT LegalReqRetVT = ReqRetVT;
8052 if (!ReqRetVT.isVector()) {
// Scalar result: go through the integer type and truncate to the width the
// caller asked for.
8053 if (!Data.getValueType().isInteger())
8054 Data = DAG.getNode(ISD::BITCAST, DL,
8055 Data.getValueType().changeTypeToInteger(), Data);
8056 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8057 } else {
8058 // We need to widen the return vector to a legal type
8059 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8060 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
// NOTE(review): line 8062 (presumably the EVT::getVectorVT call opening
// this expression) was dropped by the extraction.
8061 LegalReqRetVT =
8063 ReqRetVT.getVectorNumElements() + 1);
8064 }
8065 }
8066 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8067
8068 if (IsTexFail) {
// The TFE status dword sits immediately after the dmask-selected data.
8069 TexFail =
8070 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8071 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8072
8073 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8074 }
8075
8076 if (Result->getNumValues() == 1)
8077 return Data;
8078
8079 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8080}
8081
// Decode the TexFailCtrl immediate into its TFE (bit 0) and LWE (bit 1)
// operands. Sets IsTexFail if any bit is set. Returns false when unknown
// bits remain after stripping TFE/LWE, signalling an invalid value.
8082static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8083 SDValue *LWE, bool &IsTexFail) {
8084 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8086 uint64_t Value = TexFailCtrlConst->getZExtValue();
8087 if (Value) {
8088 IsTexFail = true;
8089 }
8090
8091 SDLoc DL(TexFailCtrlConst);
8092 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8093 Value &= ~(uint64_t)0x1;
8094 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8095 Value &= ~(uint64_t)0x2;
8096
// Any leftover bits mean the control value is malformed.
8097 return Value == 0;
8098}
8099
// Pack 16-bit image address/gradient operands two-per-dword into PackedAddrs.
// Odd trailing coordinates (and the odd middle coordinate of 3D gradients)
// are any-extended alone instead of paired. NOTE(review): the line naming
// this function (presumably static void packImage16bitOpsToDwords(
// SelectionDAG &DAG, SDValue Op,) was dropped by the extraction.
8101 MVT PackVectorVT,
8102 SmallVectorImpl<SDValue> &PackedAddrs,
8103 unsigned DimIdx, unsigned EndIdx,
8104 unsigned NumGradients) {
8105 SDLoc DL(Op);
8106 for (unsigned I = DimIdx; I < EndIdx; I++) {
8107 SDValue Addr = Op.getOperand(I);
8108
8109 // Gradients are packed with undef for each coordinate.
8110 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8111 // 1D: undef,dx/dh; undef,dx/dv
8112 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8113 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8114 if (((I + 1) >= EndIdx) ||
8115 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8116 I == DimIdx + NumGradients - 1))) {
// Lone operand: widen it to a full dword without pairing.
8117 if (Addr.getValueType() != MVT::i16)
8118 Addr = DAG.getBitcast(MVT::i16, Addr);
8119 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8120 } else {
// Pair this operand with the next one in a single packed dword.
8121 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8122 I++;
8123 }
8124 Addr = DAG.getBitcast(MVT::f32, Addr);
8125 PackedAddrs.push_back(Addr);
8126 }
8127}
8128
// Lower an image intrinsic (load/store/sample/gather/atomic) to a MIMG
// machine node: collects vdata/vaddr operands, packs 16-bit addresses and
// gradients (A16/G16), decides NSA vs. contiguous addressing, handles D16 and
// TexFailCtrl (TFE/LWE), selects the per-generation MIMG opcode, and finally
// reconstructs the requested return aggregate via constructRetValue.
// NOTE(review): several interior lines of this function were dropped by the
// extraction (marked inline below); the visible code is byte-identical to
// what survived.
8129SDValue SITargetLowering::lowerImage(SDValue Op,
8131 SelectionDAG &DAG, bool WithChain) const {
8132 SDLoc DL(Op);
8134 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8135 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8137 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8138 unsigned IntrOpcode = Intr->BaseOpcode;
8139 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8140 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8141 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8142
8143 SmallVector<EVT, 3> ResultTypes(Op->values());
8144 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8145 bool IsD16 = false;
8146 bool IsG16 = false;
8147 bool IsA16 = false;
8148 SDValue VData;
8149 int NumVDataDwords = 0;
8150 bool AdjustRetType = false;
8151 bool IsAtomicPacked16Bit = false;
8152
8153 // Offset of intrinsic arguments
8154 const unsigned ArgOffset = WithChain ? 2 : 1;
8155
8156 unsigned DMask;
8157 unsigned DMaskLanes = 0;
8158
8159 if (BaseOpcode->Atomic) {
8160 VData = Op.getOperand(2);
8161
8162 IsAtomicPacked16Bit =
8163 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8164 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8165
8166 bool Is64Bit = VData.getValueSizeInBits() == 64;
8167 if (BaseOpcode->AtomicX2) {
// Compare-swap style atomics carry two data operands that are merged into
// one vector vdata.
8168 SDValue VData2 = Op.getOperand(3);
8169 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8170 {VData, VData2});
8171 if (Is64Bit)
8172 VData = DAG.getBitcast(MVT::v4i32, VData);
8173
8174 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8175 DMask = Is64Bit ? 0xf : 0x3;
8176 NumVDataDwords = Is64Bit ? 4 : 2;
8177 } else {
8178 DMask = Is64Bit ? 0x3 : 0x1;
8179 NumVDataDwords = Is64Bit ? 2 : 1;
8180 }
8181 } else {
8182 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8183 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8184
8185 if (BaseOpcode->Store) {
8186 VData = Op.getOperand(2);
8187
8188 MVT StoreVT = VData.getSimpleValueType();
8189 if (StoreVT.getScalarType() == MVT::f16) {
8190 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8191 return Op; // D16 is unsupported for this instruction
8192
8193 IsD16 = true;
8194 VData = handleD16VData(VData, DAG, true);
8195 }
8196
8197 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8198 } else if (!BaseOpcode->NoReturn) {
8199 // Work out the num dwords based on the dmask popcount and underlying type
8200 // and whether packing is supported.
8201 MVT LoadVT = ResultTypes[0].getSimpleVT();
8202 if (LoadVT.getScalarType() == MVT::f16) {
8203 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8204 return Op; // D16 is unsupported for this instruction
8205
8206 IsD16 = true;
8207 }
8208
8209 // Confirm that the return type is large enough for the dmask specified
8210 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8211 (!LoadVT.isVector() && DMaskLanes > 1))
8212 return Op;
8213
8214 // The sq block of gfx8 and gfx9 do not estimate register use correctly
8215 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8216 // instructions.
8217 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8218 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8219 NumVDataDwords = (DMaskLanes + 1) / 2;
8220 else
8221 NumVDataDwords = DMaskLanes;
8222
8223 AdjustRetType = true;
8224 }
8225 }
8226
8227 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
// NOTE(review): the declaration of VAddrs (presumably a
// SmallVector<SDValue>) was dropped by the extraction.
8229
8230 // Check for 16 bit addresses or derivatives and pack if true.
8231 MVT VAddrVT =
8232 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8233 MVT VAddrScalarVT = VAddrVT.getScalarType();
8234 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8235 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8236
8237 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8238 VAddrScalarVT = VAddrVT.getScalarType();
8239 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8240 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8241
8242 // Push back extra arguments.
8243 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8244 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8245 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8246 // Special handling of bias when A16 is on. Bias is of type half but
8247 // occupies full 32-bit.
8248 SDValue Bias = DAG.getBuildVector(
8249 MVT::v2f16, DL,
8250 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8251 VAddrs.push_back(Bias);
8252 } else {
8253 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8254 "Bias needs to be converted to 16 bit in A16 mode");
8255 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8256 }
8257 }
8258
8259 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8260 // 16 bit gradients are supported, but are tied to the A16 control
8261 // so both gradients and addresses must be 16 bit
8262 LLVM_DEBUG(
8263 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8264 "require 16 bit args for both gradients and addresses");
8265 return Op;
8266 }
8267
8268 if (IsA16) {
8269 if (!ST->hasA16()) {
8270 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8271 "support 16 bit addresses\n");
8272 return Op;
8273 }
8274 }
8275
8276 // We've dealt with incorrect input so we know that if IsA16, IsG16
8277 // are set then we have to compress/pack operands (either address,
8278 // gradient or both)
8279 // In the case where a16 and gradients are tied (no G16 support) then we
8280 // have already verified that both IsA16 and IsG16 are true
8281 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8282 // Activate g16
// NOTE(review): line 8284 (presumably the AMDGPU::getMIMGG16MappingInfo
// call) was dropped by the extraction.
8283 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8285 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8286 }
8287
8288 // Add gradients (packed or unpacked)
8289 if (IsG16) {
8290 // Pack the gradients
8291 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8292 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8293 ArgOffset + Intr->GradientStart,
8294 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8295 } else {
8296 for (unsigned I = ArgOffset + Intr->GradientStart;
8297 I < ArgOffset + Intr->CoordStart; I++)
8298 VAddrs.push_back(Op.getOperand(I));
8299 }
8300
8301 // Add addresses (packed or unpacked)
8302 if (IsA16) {
8303 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8304 ArgOffset + Intr->CoordStart, VAddrEnd,
8305 0 /* No gradients */);
8306 } else {
8307 // Add uncompressed address
8308 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8309 VAddrs.push_back(Op.getOperand(I));
8310 }
8311
8312 // If the register allocator cannot place the address registers contiguously
8313 // without introducing moves, then using the non-sequential address encoding
8314 // is always preferable, since it saves VALU instructions and is usually a
8315 // wash in terms of code size or even better.
8316 //
8317 // However, we currently have no way of hinting to the register allocator that
8318 // MIMG addresses should be placed contiguously when it is possible to do so,
8319 // so force non-NSA for the common 2-address case as a heuristic.
8320 //
8321 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8322 // allocation when possible.
8323 //
8324 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8325 // set of the remaining addresses.
8326 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8327 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8328 const bool UseNSA = ST->hasNSAEncoding() &&
8329 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8330 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8331 const bool UsePartialNSA =
8332 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8333
8334 SDValue VAddr;
8335 if (UsePartialNSA) {
// Only the trailing addresses are merged into one contiguous vector; the
// first NSAMaxSize - 1 stay as separate NSA operands.
8336 VAddr = getBuildDwordsVector(DAG, DL,
8337 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8338 } else if (!UseNSA) {
8339 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8340 }
8341
8342 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8343 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8344 SDValue Unorm;
8345 if (!BaseOpcode->Sampler) {
8346 Unorm = True;
8347 } else {
8348 uint64_t UnormConst =
8349 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8350
8351 Unorm = UnormConst ? True : False;
8352 }
8353
8354 SDValue TFE;
8355 SDValue LWE;
8356 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8357 bool IsTexFail = false;
8358 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8359 return Op;
8360
8361 if (IsTexFail) {
8362 if (!DMaskLanes) {
8363 // Expecting to get an error flag since TFC is on - and dmask is 0
8364 // Force dmask to be at least 1 otherwise the instruction will fail
8365 DMask = 0x1;
8366 DMaskLanes = 1;
8367 NumVDataDwords = 1;
8368 }
8369 NumVDataDwords += 1;
8370 AdjustRetType = true;
8371 }
8372
8373 // Has something earlier tagged that the return type needs adjusting
8374 // This happens if the instruction is a load or has set TexFailCtrl flags
8375 if (AdjustRetType) {
8376 // NumVDataDwords reflects the true number of dwords required in the return
8377 // type
8378 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8379 // This is a no-op load. This can be eliminated
8380 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8381 if (isa<MemSDNode>(Op))
8382 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8383 return Undef;
8384 }
8385
8386 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8387 MVT::i32, NumVDataDwords)
8388 : MVT::i32;
8389
8390 ResultTypes[0] = NewVT;
8391 if (ResultTypes.size() == 3) {
8392 // Original result was aggregate type used for TexFailCtrl results
8393 // The actual instruction returns as a vector type which has now been
8394 // created. Remove the aggregate result.
8395 ResultTypes.erase(&ResultTypes[1]);
8396 }
8397 }
8398
8399 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8400 if (BaseOpcode->Atomic)
8401 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
// Reject cache-policy bits that are not valid for this generation.
8402 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8404 return Op;
8405
// NOTE(review): the declaration of Ops (presumably a SmallVector<SDValue>)
// was dropped by the extraction.
8407 if (BaseOpcode->Store || BaseOpcode->Atomic)
8408 Ops.push_back(VData); // vdata
8409 if (UsePartialNSA) {
8410 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8411 Ops.push_back(VAddr);
8412 } else if (UseNSA)
8413 append_range(Ops, VAddrs);
8414 else
8415 Ops.push_back(VAddr);
8416 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8417 EVT RsrcVT = Rsrc.getValueType();
8418 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8419 return Op;
8420 Ops.push_back(Rsrc);
8421 if (BaseOpcode->Sampler) {
8422 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8423 if (Samp.getValueType() != MVT::v4i32)
8424 return Op;
8425 Ops.push_back(Samp);
8426 }
8427 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8428 if (IsGFX10Plus)
8429 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8430 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8431 Ops.push_back(Unorm);
8432 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8433 Ops.push_back(IsA16 && // r128, a16 for gfx9
8434 ST->hasFeature(AMDGPU::FeatureR128A16)
8435 ? True
8436 : False);
8437 if (IsGFX10Plus)
8438 Ops.push_back(IsA16 ? True : False);
8439 if (!Subtarget->hasGFX90AInsts()) {
8440 Ops.push_back(TFE); // tfe
8441 } else if (TFE->getAsZExtVal()) {
8442 report_fatal_error("TFE is not supported on this GPU");
8443 }
8444 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8445 Ops.push_back(LWE); // lwe
8446 if (!IsGFX10Plus)
8447 Ops.push_back(DimInfo->DA ? True : False);
8448 if (BaseOpcode->HasD16)
8449 Ops.push_back(IsD16 ? True : False);
8450 if (isa<MemSDNode>(Op))
8451 Ops.push_back(Op.getOperand(0)); // chain
8452
8453 int NumVAddrDwords =
8454 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8455 int Opcode = -1;
8456
// Select the concrete MIMG opcode for the target generation, falling back
// through older encodings where applicable.
8457 if (IsGFX12Plus) {
8458 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8459 NumVDataDwords, NumVAddrDwords);
8460 } else if (IsGFX11Plus) {
8461 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8462 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8463 : AMDGPU::MIMGEncGfx11Default,
8464 NumVDataDwords, NumVAddrDwords);
8465 } else if (IsGFX10Plus) {
8466 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8467 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8468 : AMDGPU::MIMGEncGfx10Default,
8469 NumVDataDwords, NumVAddrDwords);
8470 } else {
8471 if (Subtarget->hasGFX90AInsts()) {
8472 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8473 NumVDataDwords, NumVAddrDwords);
8474 if (Opcode == -1)
// NOTE(review): line 8475 (presumably the report_fatal_error call taking
// the message below) was dropped by the extraction.
8476 "requested image instruction is not supported on this GPU");
8477 }
// NOTE(review): line 8479 (the remainder of this condition) was dropped by
// the extraction.
8478 if (Opcode == -1 &&
8480 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8481 NumVDataDwords, NumVAddrDwords);
8482 if (Opcode == -1)
8483 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8484 NumVDataDwords, NumVAddrDwords);
8485 }
8486 if (Opcode == -1)
8487 return Op;
8488
8489 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8490 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8491 MachineMemOperand *MemRef = MemOp->getMemOperand();
8492 DAG.setNodeMemRefs(NewNode, {MemRef});
8493 }
8494
8495 if (BaseOpcode->AtomicX2) {
// NOTE(review): line 8496 (presumably the declaration of Elt, a
// SmallVector<SDValue, 1>) was dropped by the extraction.
8497 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8498 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8499 }
8500 if (BaseOpcode->NoReturn)
8501 return SDValue(NewNode, 0);
8502 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8503 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8504 NumVDataDwords, IsAtomicPacked16Bit, DL);
8505}
8506
// Lower an s.buffer.load. Uniform offsets become SBUFFER_LOAD (with subword
// truncation and vec3->vec4 widening as needed); divergent offsets fall back
// to one or more MUBUF BUFFER_LOADs, split into 16-byte pieces for wide
// types. NOTE(review): several interior lines were dropped by the extraction
// (marked inline below).
8507SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8508 SDValue Offset, SDValue CachePolicy,
8509 SelectionDAG &DAG) const {
// NOTE(review): line 8510 (presumably fetching MachineFunction &MF) was
// dropped here.
8511
8512 const DataLayout &DataLayout = DAG.getDataLayout();
// NOTE(review): lines 8514-8519 (the alignment computation tail and the
// MachineMemOperand construction for MMO) were dropped here.
8513 Align Alignment =
8520 VT.getStoreSize(), Alignment);
8521
8522 if (!Offset->isDivergent()) {
8523 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8524
8525 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8526 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8527 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8528 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8529 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8530 SDValue BufferLoad =
8532 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8533 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8534 }
8535
8536 // Widen vec3 load to vec4.
8537 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8538 !Subtarget->hasScalarDwordx3Loads()) {
// NOTE(review): line 8540 (the EVT::getVectorVT call producing WidenedVT)
// was dropped here.
8539 EVT WidenedVT =
8541 auto WidenedOp = DAG.getMemIntrinsicNode(
8542 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8543 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8544 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8545 DAG.getVectorIdxConstant(0, DL));
8546 return Subvector;
8547 }
8548
// NOTE(review): line 8549 (the getMemIntrinsicNode call returned below) was
// dropped here.
8550 DAG.getVTList(VT), Ops, VT, MMO);
8551 }
8552
8553 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8554 // assume that the buffer is unswizzled.
8555 SDValue Ops[] = {
8556 DAG.getEntryNode(), // Chain
8557 Rsrc, // rsrc
8558 DAG.getConstant(0, DL, MVT::i32), // vindex
8559 {}, // voffset
8560 {}, // soffset
8561 {}, // offset
8562 CachePolicy, // cachepolicy
8563 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8564 };
8565 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8566 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8567 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8568 }
8569
// NOTE(review): line 8570 (presumably the declaration of Loads, a
// SmallVector<SDValue>) was dropped here.
8571 unsigned NumLoads = 1;
8572 MVT LoadVT = VT.getSimpleVT();
8573 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8574 assert((LoadVT.getScalarType() == MVT::i32 ||
8575 LoadVT.getScalarType() == MVT::f32));
8576
8577 if (NumElts == 8 || NumElts == 16) {
8578 NumLoads = NumElts / 4;
8579 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8580 }
8581
8582 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8583
8584 // Use the alignment to ensure that the required offsets will fit into the
8585 // immediate offsets.
8586 setBufferOffsets(Offset, DAG, &Ops[3],
8587 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8588
8589 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8590 for (unsigned i = 0; i < NumLoads; ++i) {
// Each piece loads 16 bytes; bump the immediate offset accordingly.
8591 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8592 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8593 LoadVT, MMO, DAG));
8594 }
8595
8596 if (NumElts == 8 || NumElts == 16)
8597 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8598
8599 return Loads[0];
8600}
8601
8602SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8603 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8604 if (!Subtarget->hasArchitectedSGPRs())
8605 return {};
8606 SDLoc SL(Op);
8607 MVT VT = MVT::i32;
8608 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8609 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8610 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8611}
8612
// Lower a workitem.id.{x,y,z} query: constant 0 when the dimension's max ID
// is 0, undef when the argument was elided via amdgpu-no-* attributes,
// otherwise the preloaded VGPR input with an AssertZext of its known width.
// NOTE(review): two interior lines were dropped by the extraction (marked
// inline below).
8613SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8614 unsigned Dim,
8615 const ArgDescriptor &Arg) const {
8616 SDLoc SL(Op);
// NOTE(review): line 8617 (presumably fetching MachineFunction &MF) was
// dropped here.
8618 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8619 if (MaxID == 0)
8620 return DAG.getConstant(0, SL, MVT::i32);
8621
8622 // It's undefined behavior if a function marked with the amdgpu-no-*
8623 // attributes uses the corresponding intrinsic.
8624 if (!Arg)
8625 return DAG.getUNDEF(Op->getValueType(0));
8626
8627 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8628 SDLoc(DAG.getEntryNode()), Arg);
8629
8630 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8631 // masking operations anyway.
8632 //
8633 // TODO: We could assert the top bit is 0 for the source copy.
8634 if (Arg.isMasked())
8635 return Val;
8636
8637 // Preserve the known bits after expansion to a copy.
// NOTE(review): line 8638 (presumably computing SmallVT from MaxID's bit
// width) was dropped here.
8639 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8640 DAG.getValueType(SmallVT));
8641}
8642
// Lower a chain-free (pure-value) target intrinsic node. Depending on the
// intrinsic this becomes: a read of a preloaded argument register
// (getPreloadedValue), a kernarg-memory load (lowerKernargMemParameter /
// lowerImplicitZextParam), a target-specific ISD node (DAG.getNode), a
// diagnostic plus UNDEF/error value on unsupported targets, or the original
// node when a TableGen pattern already handles it.
//
// NOTE(review): this text was scraped with the upstream line numbers embedded
// in each line, and some continuation lines (enumerator arguments, branch
// conditions, opcode assignments) were dropped by the extraction — verify any
// change against the upstream SIISelLowering.cpp before editing.
8643 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8644 SelectionDAG &DAG) const {
8646 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8647
8648 EVT VT = Op.getValueType();
8649 SDLoc DL(Op);
8650 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8651
8652 // TODO: Should this propagate fast-math-flags?
8653
8654 switch (IntrinsicID) {
8655 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8656 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8657 return emitNonHSAIntrinsicError(DAG, DL, VT);
8658 return getPreloadedValue(DAG, *MFI, VT,
8660 }
8661 case Intrinsic::amdgcn_dispatch_ptr:
8662 case Intrinsic::amdgcn_queue_ptr: {
8663 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8664 DiagnosticInfoUnsupported BadIntrin(
8665 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8666 DL.getDebugLoc());
8667 DAG.getContext()->diagnose(BadIntrin);
8668 return DAG.getUNDEF(VT);
8669 }
8670
8671 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8674 return getPreloadedValue(DAG, *MFI, VT, RegID);
8675 }
8676 case Intrinsic::amdgcn_implicitarg_ptr: {
8677 if (MFI->isEntryFunction())
8678 return getImplicitArgPtr(DAG, DL);
8679 return getPreloadedValue(DAG, *MFI, VT,
8681 }
8682 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8684 // This only makes sense to call in a kernel, so just lower to null.
8685 return DAG.getConstant(0, DL, VT);
8686 }
8687
8688 return getPreloadedValue(DAG, *MFI, VT,
8690 }
8691 case Intrinsic::amdgcn_dispatch_id: {
8692 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8693 }
8694 case Intrinsic::amdgcn_rcp:
8695 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8696 case Intrinsic::amdgcn_rsq:
8697 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8698 case Intrinsic::amdgcn_rsq_legacy:
8700 return emitRemovedIntrinsicError(DAG, DL, VT);
8701 return SDValue();
8702 case Intrinsic::amdgcn_rcp_legacy:
8704 return emitRemovedIntrinsicError(DAG, DL, VT);
8705 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8706 case Intrinsic::amdgcn_rsq_clamp: {
8708 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8709
// Manual expansion: rsq, then clamp the result into [Min, Max] via
// fminnum/fmaxnum. Max/Min are defined on lines elided by the scrape —
// presumably the largest finite values of the type; confirm upstream.
8710 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8713
8714 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8715 SDValue Tmp =
8716 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8717 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8718 DAG.getConstantFP(Min, DL, VT));
8719 }
8720 case Intrinsic::r600_read_ngroups_x:
8721 if (Subtarget->isAmdHsaOS())
8722 return emitNonHSAIntrinsicError(DAG, DL, VT);
8723
8724 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8726 false);
8727 case Intrinsic::r600_read_ngroups_y:
8728 if (Subtarget->isAmdHsaOS())
8729 return emitNonHSAIntrinsicError(DAG, DL, VT);
8730
8731 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8733 false);
8734 case Intrinsic::r600_read_ngroups_z:
8735 if (Subtarget->isAmdHsaOS())
8736 return emitNonHSAIntrinsicError(DAG, DL, VT);
8737
8738 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8740 false);
8741 case Intrinsic::r600_read_global_size_x:
8742 if (Subtarget->isAmdHsaOS())
8743 return emitNonHSAIntrinsicError(DAG, DL, VT);
8744
8745 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8747 Align(4), false);
8748 case Intrinsic::r600_read_global_size_y:
8749 if (Subtarget->isAmdHsaOS())
8750 return emitNonHSAIntrinsicError(DAG, DL, VT);
8751
8752 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8754 Align(4), false);
8755 case Intrinsic::r600_read_global_size_z:
8756 if (Subtarget->isAmdHsaOS())
8757 return emitNonHSAIntrinsicError(DAG, DL, VT);
8758
8759 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8761 Align(4), false);
// The r600 local-size intrinsics load an i16 from the dispatch packet and
// implicitly zero-extend it to the result type.
8762 case Intrinsic::r600_read_local_size_x:
8763 if (Subtarget->isAmdHsaOS())
8764 return emitNonHSAIntrinsicError(DAG, DL, VT);
8765
8766 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8768 case Intrinsic::r600_read_local_size_y:
8769 if (Subtarget->isAmdHsaOS())
8770 return emitNonHSAIntrinsicError(DAG, DL, VT);
8771
8772 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8774 case Intrinsic::r600_read_local_size_z:
8775 if (Subtarget->isAmdHsaOS())
8776 return emitNonHSAIntrinsicError(DAG, DL, VT);
8777
8778 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8780 case Intrinsic::amdgcn_workgroup_id_x:
8781 return getPreloadedValue(DAG, *MFI, VT,
8783 case Intrinsic::amdgcn_workgroup_id_y:
8784 return getPreloadedValue(DAG, *MFI, VT,
8786 case Intrinsic::amdgcn_workgroup_id_z:
8787 return getPreloadedValue(DAG, *MFI, VT,
8789 case Intrinsic::amdgcn_wave_id:
8790 return lowerWaveID(DAG, Op);
8791 case Intrinsic::amdgcn_lds_kernel_id: {
8792 if (MFI->isEntryFunction())
8793 return getLDSKernelId(DAG, DL);
8794 return getPreloadedValue(DAG, *MFI, VT,
8796 }
8797 case Intrinsic::amdgcn_workitem_id_x:
8798 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8799 case Intrinsic::amdgcn_workitem_id_y:
8800 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8801 case Intrinsic::amdgcn_workitem_id_z:
8802 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8803 case Intrinsic::amdgcn_wavefrontsize:
8805 SDLoc(Op), MVT::i32);
8806 case Intrinsic::amdgcn_s_buffer_load: {
8807 unsigned CPol = Op.getConstantOperandVal(3);
8808 // s_buffer_load, because of how it's optimized, can't be volatile
8809 // so reject ones with the volatile bit set.
8810 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8813 return Op;
8814 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8815 Op.getOperand(3), DAG);
8816 }
8817 case Intrinsic::amdgcn_fdiv_fast:
8818 return lowerFDIV_FAST(Op, DAG);
8819 case Intrinsic::amdgcn_sin:
8820 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8821
8822 case Intrinsic::amdgcn_cos:
8823 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8824
8825 case Intrinsic::amdgcn_mul_u24:
8826 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8827 Op.getOperand(2));
8828 case Intrinsic::amdgcn_mul_i24:
8829 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8830 Op.getOperand(2));
8831
8832 case Intrinsic::amdgcn_log_clamp: {
8834 return SDValue();
8835
8836 return emitRemovedIntrinsicError(DAG, DL, VT);
8837 }
8838 case Intrinsic::amdgcn_fract:
8839 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8840
8841 case Intrinsic::amdgcn_class:
8842 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8843 Op.getOperand(2));
8844 case Intrinsic::amdgcn_div_fmas:
8845 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8846 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8847
8848 case Intrinsic::amdgcn_div_fixup:
8849 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8850 Op.getOperand(2), Op.getOperand(3));
8851
8852 case Intrinsic::amdgcn_div_scale: {
8853 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8854
8855 // Translate to the operands expected by the machine instruction. The
8856 // first parameter must be the same as the first instruction.
8857 SDValue Numerator = Op.getOperand(1);
8858 SDValue Denominator = Op.getOperand(2);
8859
8860 // Note this order is opposite of the machine instruction's operations,
8861 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8862 // intrinsic has the numerator as the first operand to match a normal
8863 // division operation.
8864
8865 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8866
8867 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8868 Denominator, Numerator);
8869 }
8870 case Intrinsic::amdgcn_icmp: {
8871 // There is a Pat that handles this variant, so return it as-is.
8872 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8873 Op.getConstantOperandVal(2) == 0 &&
8874 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8875 return Op;
8876 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8877 }
8878 case Intrinsic::amdgcn_fcmp: {
8879 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8880 }
8881 case Intrinsic::amdgcn_ballot:
8882 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8883 case Intrinsic::amdgcn_fmed3:
8884 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8885 Op.getOperand(2), Op.getOperand(3));
8886 case Intrinsic::amdgcn_fdot2:
8887 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8888 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8889 case Intrinsic::amdgcn_fmul_legacy:
8890 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8891 Op.getOperand(2));
8892 case Intrinsic::amdgcn_sffbh:
8893 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8894 case Intrinsic::amdgcn_sbfe:
8895 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8896 Op.getOperand(2), Op.getOperand(3));
8897 case Intrinsic::amdgcn_ubfe:
8898 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8899 Op.getOperand(2), Op.getOperand(3));
8900 case Intrinsic::amdgcn_cvt_pkrtz:
8901 case Intrinsic::amdgcn_cvt_pknorm_i16:
8902 case Intrinsic::amdgcn_cvt_pknorm_u16:
8903 case Intrinsic::amdgcn_cvt_pk_i16:
8904 case Intrinsic::amdgcn_cvt_pk_u16: {
8905 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8906 EVT VT = Op.getValueType();
8907 unsigned Opcode;
8908
// Opcode assignments below were on lines the scrape dropped — each branch
// selects the AMDGPUISD node matching the cvt_pk* variant; confirm upstream.
8909 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8911 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8913 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8915 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8917 else
8919
8920 if (isTypeLegal(VT))
8921 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8922
// Result type is illegal: build the node as i32 and bitcast back.
8923 SDValue Node =
8924 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8925 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8926 }
8927 case Intrinsic::amdgcn_fmad_ftz:
8928 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8929 Op.getOperand(2), Op.getOperand(3));
8930
8931 case Intrinsic::amdgcn_if_break:
8932 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8933 Op->getOperand(1), Op->getOperand(2)),
8934 0);
8935
8936 case Intrinsic::amdgcn_groupstaticsize: {
8938 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8939 return Op;
8940
// Other OSes: emit a target global address resolved at link time, moved
// into an SGPR.
8941 const Module *M = MF.getFunction().getParent();
8942 const GlobalValue *GV =
8943 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8944 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8946 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8947 }
8948 case Intrinsic::amdgcn_is_shared:
8949 case Intrinsic::amdgcn_is_private: {
// Compare the high 32 bits of the flat pointer against the segment
// aperture base for the queried address space.
8950 SDLoc SL(Op);
8951 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8954 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8955 SDValue SrcVec =
8956 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8957
8958 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8959 DAG.getConstant(1, SL, MVT::i32));
8960 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8961 }
8962 case Intrinsic::amdgcn_perm:
8963 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8964 Op.getOperand(2), Op.getOperand(3));
8965 case Intrinsic::amdgcn_reloc_constant: {
8966 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8967 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8968 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8969 auto *RelocSymbol = cast<GlobalVariable>(
8970 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8971 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8973 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8974 }
// For swmmac intrinsics the index-key operand is re-emitted as i32 (via
// any-extend/truncate) so the selection patterns only need to match one
// index-key type. An already-i32 key is left for pattern selection.
8975 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8976 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8977 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8978 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8979 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8980 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8981 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8982 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8983 if (Op.getOperand(4).getValueType() == MVT::i32)
8984 return SDValue();
8985
8986 SDLoc SL(Op);
8987 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8988 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8989 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8990 Op.getOperand(3), IndexKeyi32);
8991 }
8992 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8993 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8994 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8995 if (Op.getOperand(6).getValueType() == MVT::i32)
8996 return SDValue();
8997
8998 SDLoc SL(Op);
8999 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9000 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9001 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9002 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9003 IndexKeyi32, Op.getOperand(7)});
9004 }
9005 case Intrinsic::amdgcn_addrspacecast_nonnull:
9006 return lowerADDRSPACECAST(Op, DAG);
9007 case Intrinsic::amdgcn_readlane:
9008 case Intrinsic::amdgcn_readfirstlane:
9009 case Intrinsic::amdgcn_writelane:
9010 case Intrinsic::amdgcn_permlane16:
9011 case Intrinsic::amdgcn_permlanex16:
9012 case Intrinsic::amdgcn_permlane64:
9013 case Intrinsic::amdgcn_set_inactive:
9014 case Intrinsic::amdgcn_set_inactive_chain_arg:
9015 case Intrinsic::amdgcn_mov_dpp8:
9016 case Intrinsic::amdgcn_update_dpp:
9017 return lowerLaneOp(*this, Op.getNode(), DAG);
9018 default:
// Image-dimension intrinsics are table-driven; anything else is left
// untouched for pattern selection.
9019 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9021 return lowerImage(Op, ImageDimIntr, DAG, false);
9022
9023 return Op;
9024 }
9025}
9026
9027// On targets not supporting constant in soffset field, turn zero to
9028// SGPR_NULL to avoid generating an extra s_mov with zero.
9030 const GCNSubtarget *Subtarget) {
9031 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9032 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9033 return SOffset;
9034}
9035
9036SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9037 SelectionDAG &DAG,
9038 unsigned NewOpcode) const {
9039 SDLoc DL(Op);
9040
9041 SDValue VData = Op.getOperand(2);
9042 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9043 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9044 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9045 SDValue Ops[] = {
9046 Op.getOperand(0), // Chain
9047 VData, // vdata
9048 Rsrc, // rsrc
9049 DAG.getConstant(0, DL, MVT::i32), // vindex
9050 VOffset, // voffset
9051 SOffset, // soffset
9052 Offset, // offset
9053 Op.getOperand(6), // cachepolicy
9054 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9055 };
9056
9057 auto *M = cast<MemSDNode>(Op);
9058
9059 EVT MemVT = VData.getValueType();
9060 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9061 M->getMemOperand());
9062}
9063
9064SDValue
9065SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9066 unsigned NewOpcode) const {
9067 SDLoc DL(Op);
9068
9069 SDValue VData = Op.getOperand(2);
9070 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9071 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9072 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9073 SDValue Ops[] = {
9074 Op.getOperand(0), // Chain
9075 VData, // vdata
9076 Rsrc, // rsrc
9077 Op.getOperand(4), // vindex
9078 VOffset, // voffset
9079 SOffset, // soffset
9080 Offset, // offset
9081 Op.getOperand(7), // cachepolicy
9082 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9083 };
9084
9085 auto *M = cast<MemSDNode>(Op);
9086
9087 EVT MemVT = VData.getValueType();
9088 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9089 M->getMemOperand());
9090}
9091
9092SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9093 SelectionDAG &DAG) const {
9094 unsigned IntrID = Op.getConstantOperandVal(1);
9095 SDLoc DL(Op);
9096
9097 switch (IntrID) {
9098 case Intrinsic::amdgcn_ds_ordered_add:
9099 case Intrinsic::amdgcn_ds_ordered_swap: {
9100 MemSDNode *M = cast<MemSDNode>(Op);
9101 SDValue Chain = M->getOperand(0);
9102 SDValue M0 = M->getOperand(2);
9103 SDValue Value = M->getOperand(3);
9104 unsigned IndexOperand = M->getConstantOperandVal(7);
9105 unsigned WaveRelease = M->getConstantOperandVal(8);
9106 unsigned WaveDone = M->getConstantOperandVal(9);
9107
9108 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9109 IndexOperand &= ~0x3f;
9110 unsigned CountDw = 0;
9111
9112 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9113 CountDw = (IndexOperand >> 24) & 0xf;
9114 IndexOperand &= ~(0xf << 24);
9115
9116 if (CountDw < 1 || CountDw > 4) {
9118 "ds_ordered_count: dword count must be between 1 and 4");
9119 }
9120 }
9121
9122 if (IndexOperand)
9123 report_fatal_error("ds_ordered_count: bad index operand");
9124
9125 if (WaveDone && !WaveRelease)
9126 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9127
9128 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9129 unsigned ShaderType =
9131 unsigned Offset0 = OrderedCountIndex << 2;
9132 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9133
9134 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9135 Offset1 |= (CountDw - 1) << 6;
9136
9137 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9138 Offset1 |= ShaderType << 2;
9139
9140 unsigned Offset = Offset0 | (Offset1 << 8);
9141
9142 SDValue Ops[] = {
9143 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9144 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9145 };
9147 M->getVTList(), Ops, M->getMemoryVT(),
9148 M->getMemOperand());
9149 }
9150 case Intrinsic::amdgcn_raw_buffer_load:
9151 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9152 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9153 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9154 case Intrinsic::amdgcn_raw_buffer_load_format:
9155 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9156 const bool IsFormat =
9157 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9158 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9159
9160 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9161 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9162 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9163 SDValue Ops[] = {
9164 Op.getOperand(0), // Chain
9165 Rsrc, // rsrc
9166 DAG.getConstant(0, DL, MVT::i32), // vindex
9167 VOffset, // voffset
9168 SOffset, // soffset
9169 Offset, // offset
9170 Op.getOperand(5), // cachepolicy, swizzled buffer
9171 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9172 };
9173
9174 auto *M = cast<MemSDNode>(Op);
9175 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9176 }
9177 case Intrinsic::amdgcn_struct_buffer_load:
9178 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9179 case Intrinsic::amdgcn_struct_buffer_load_format:
9180 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9181 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9182 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9183 const bool IsFormat =
9184 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9185 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9186
9187 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9188 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9189 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9190 SDValue Ops[] = {
9191 Op.getOperand(0), // Chain
9192 Rsrc, // rsrc
9193 Op.getOperand(3), // vindex
9194 VOffset, // voffset
9195 SOffset, // soffset
9196 Offset, // offset
9197 Op.getOperand(6), // cachepolicy, swizzled buffer
9198 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9199 };
9200
9201 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9202 }
9203 case Intrinsic::amdgcn_raw_tbuffer_load:
9204 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9205 MemSDNode *M = cast<MemSDNode>(Op);
9206 EVT LoadVT = Op.getValueType();
9207 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9208 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9209 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9210
9211 SDValue Ops[] = {
9212 Op.getOperand(0), // Chain
9213 Rsrc, // rsrc
9214 DAG.getConstant(0, DL, MVT::i32), // vindex
9215 VOffset, // voffset
9216 SOffset, // soffset
9217 Offset, // offset
9218 Op.getOperand(5), // format
9219 Op.getOperand(6), // cachepolicy, swizzled buffer
9220 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9221 };
9222
9223 if (LoadVT.getScalarType() == MVT::f16)
9224 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9225 Ops);
9226 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9227 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9228 DAG);
9229 }
9230 case Intrinsic::amdgcn_struct_tbuffer_load:
9231 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9232 MemSDNode *M = cast<MemSDNode>(Op);
9233 EVT LoadVT = Op.getValueType();
9234 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9235 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9236 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9237
9238 SDValue Ops[] = {
9239 Op.getOperand(0), // Chain
9240 Rsrc, // rsrc
9241 Op.getOperand(3), // vindex
9242 VOffset, // voffset
9243 SOffset, // soffset
9244 Offset, // offset
9245 Op.getOperand(6), // format
9246 Op.getOperand(7), // cachepolicy, swizzled buffer
9247 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9248 };
9249
9250 if (LoadVT.getScalarType() == MVT::f16)
9251 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9252 Ops);
9253 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9254 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9255 DAG);
9256 }
9257 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9258 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9259 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9260 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9262 return lowerStructBufferAtomicIntrin(Op, DAG,
9264 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9265 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9266 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9267 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9268 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9269 return lowerStructBufferAtomicIntrin(Op, DAG,
9271 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9272 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9273 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9274 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9275 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9276 return lowerStructBufferAtomicIntrin(Op, DAG,
9278 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9279 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9280 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9281 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9282 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9283 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9284 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9285 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9286 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9287 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9288 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9289 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9290 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9291 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9292 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9293 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9294 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9295 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9296 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9298 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9299 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9300 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9301 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9302 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9303 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9304 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9305 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9306 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9307 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9308 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9309 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9310 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9311 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9312 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9313 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9314 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9315 return lowerRawBufferAtomicIntrin(Op, DAG,
9317 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9318 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9319 return lowerStructBufferAtomicIntrin(Op, DAG,
9321 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9322 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9323 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9324 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9325 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9326 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9327 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9328 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9329 return lowerStructBufferAtomicIntrin(Op, DAG,
9331 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9332 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9333 return lowerStructBufferAtomicIntrin(Op, DAG,
9335 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9336 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9337 return lowerStructBufferAtomicIntrin(Op, DAG,
9339 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9340 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9341 return lowerStructBufferAtomicIntrin(Op, DAG,
9343 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9344 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9345 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9346 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9348 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9349 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9350 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9351 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9352 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9353 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9354 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9355 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9356 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9357 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9358 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9359 return lowerStructBufferAtomicIntrin(Op, DAG,
9361
9362 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9363 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9364 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9365 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9366 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9367 SDValue Ops[] = {
9368 Op.getOperand(0), // Chain
9369 Op.getOperand(2), // src
9370 Op.getOperand(3), // cmp
9371 Rsrc, // rsrc
9372 DAG.getConstant(0, DL, MVT::i32), // vindex
9373 VOffset, // voffset
9374 SOffset, // soffset
9375 Offset, // offset
9376 Op.getOperand(7), // cachepolicy
9377 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9378 };
9379 EVT VT = Op.getValueType();
9380 auto *M = cast<MemSDNode>(Op);
9381
9383 Op->getVTList(), Ops, VT,
9384 M->getMemOperand());
9385 }
9386 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9388 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9389 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9390 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9391 SDValue Ops[] = {
9392 Op.getOperand(0), // Chain
9393 Op.getOperand(2), // src
9394 Op.getOperand(3), // cmp
9395 Rsrc, // rsrc
9396 Op.getOperand(5), // vindex
9397 VOffset, // voffset
9398 SOffset, // soffset
9399 Offset, // offset
9400 Op.getOperand(8), // cachepolicy
9401 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9402 };
9403 EVT VT = Op.getValueType();
9404 auto *M = cast<MemSDNode>(Op);
9405
9407 Op->getVTList(), Ops, VT,
9408 M->getMemOperand());
9409 }
9410 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9411 MemSDNode *M = cast<MemSDNode>(Op);
9412 SDValue NodePtr = M->getOperand(2);
9413 SDValue RayExtent = M->getOperand(3);
9414 SDValue RayOrigin = M->getOperand(4);
9415 SDValue RayDir = M->getOperand(5);
9416 SDValue RayInvDir = M->getOperand(6);
9417 SDValue TDescr = M->getOperand(7);
9418
9419 assert(NodePtr.getValueType() == MVT::i32 ||
9420 NodePtr.getValueType() == MVT::i64);
9421 assert(RayDir.getValueType() == MVT::v3f16 ||
9422 RayDir.getValueType() == MVT::v3f32);
9423
9424 if (!Subtarget->hasGFX10_AEncoding()) {
9425 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9426 return SDValue();
9427 }
9428
9429 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9430 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9431 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9432 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9433 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9434 const unsigned NumVDataDwords = 4;
9435 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9436 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9437 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9438 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9439 IsGFX12Plus;
9440 const unsigned BaseOpcodes[2][2] = {
9441 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9442 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9443 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9444 int Opcode;
9445 if (UseNSA) {
9446 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9447 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9448 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9449 : AMDGPU::MIMGEncGfx10NSA,
9450 NumVDataDwords, NumVAddrDwords);
9451 } else {
9452 assert(!IsGFX12Plus);
9453 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9454 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9455 : AMDGPU::MIMGEncGfx10Default,
9456 NumVDataDwords, NumVAddrDwords);
9457 }
9458 assert(Opcode != -1);
9459
9461
9462 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9464 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9465 if (Lanes[0].getValueSizeInBits() == 32) {
9466 for (unsigned I = 0; I < 3; ++I)
9467 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9468 } else {
9469 if (IsAligned) {
9470 Ops.push_back(DAG.getBitcast(
9471 MVT::i32,
9472 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9473 Ops.push_back(Lanes[2]);
9474 } else {
9475 SDValue Elt0 = Ops.pop_back_val();
9476 Ops.push_back(DAG.getBitcast(
9477 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9478 Ops.push_back(DAG.getBitcast(
9479 MVT::i32,
9480 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9481 }
9482 }
9483 };
9484
9485 if (UseNSA && IsGFX11Plus) {
9486 Ops.push_back(NodePtr);
9487 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9488 Ops.push_back(RayOrigin);
9489 if (IsA16) {
9490 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9491 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9492 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9493 for (unsigned I = 0; I < 3; ++I) {
9494 MergedLanes.push_back(DAG.getBitcast(
9495 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9496 {DirLanes[I], InvDirLanes[I]})));
9497 }
9498 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9499 } else {
9500 Ops.push_back(RayDir);
9501 Ops.push_back(RayInvDir);
9502 }
9503 } else {
9504 if (Is64)
9505 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9506 2);
9507 else
9508 Ops.push_back(NodePtr);
9509
9510 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9511 packLanes(RayOrigin, true);
9512 packLanes(RayDir, true);
9513 packLanes(RayInvDir, false);
9514 }
9515
9516 if (!UseNSA) {
9517 // Build a single vector containing all the operands so far prepared.
9518 if (NumVAddrDwords > 12) {
9519 SDValue Undef = DAG.getUNDEF(MVT::i32);
9520 Ops.append(16 - Ops.size(), Undef);
9521 }
9522 assert(Ops.size() >= 8 && Ops.size() <= 12);
9523 SDValue MergedOps =
9524 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9525 Ops.clear();
9526 Ops.push_back(MergedOps);
9527 }
9528
9529 Ops.push_back(TDescr);
9530 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9531 Ops.push_back(M->getChain());
9532
9533 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9534 MachineMemOperand *MemRef = M->getMemOperand();
9535 DAG.setNodeMemRefs(NewNode, {MemRef});
9536 return SDValue(NewNode, 0);
9537 }
9538 case Intrinsic::amdgcn_global_atomic_fmin_num:
9539 case Intrinsic::amdgcn_global_atomic_fmax_num:
9540 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9541 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9542 MemSDNode *M = cast<MemSDNode>(Op);
9543 SDValue Ops[] = {
9544 M->getOperand(0), // Chain
9545 M->getOperand(2), // Ptr
9546 M->getOperand(3) // Value
9547 };
9548 unsigned Opcode = 0;
9549 switch (IntrID) {
9550 case Intrinsic::amdgcn_global_atomic_fmin_num:
9551 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9552 Opcode = ISD::ATOMIC_LOAD_FMIN;
9553 break;
9554 }
9555 case Intrinsic::amdgcn_global_atomic_fmax_num:
9556 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9557 Opcode = ISD::ATOMIC_LOAD_FMAX;
9558 break;
9559 }
9560 default:
9561 llvm_unreachable("unhandled atomic opcode");
9562 }
9563 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9564 Ops, M->getMemOperand());
9565 }
9566 case Intrinsic::amdgcn_s_get_barrier_state:
9567 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9568 SDValue Chain = Op->getOperand(0);
9570 unsigned Opc;
9571
9572 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9573 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9574 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9575 BarID = (BarID >> 4) & 0x3F;
9576 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9577 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9578 Ops.push_back(K);
9579 Ops.push_back(Chain);
9580 } else {
9581 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9582 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9583 SDValue M0Val;
9584 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9585 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9586 M0Val = SDValue(
9587 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9588 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9589 0);
9590 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9591 } else
9592 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9593 }
9594
9595 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9596 return SDValue(NewMI, 0);
9597 }
9598 default:
9599
9600 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9602 return lowerImage(Op, ImageDimIntr, DAG, true);
9603
9604 return SDValue();
9605 }
9606}
9607
9608// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9609// dwordx4 if on SI and handle TFE loads.
9610SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9611 SDVTList VTList,
9612 ArrayRef<SDValue> Ops, EVT MemVT,
9613 MachineMemOperand *MMO,
9614 SelectionDAG &DAG) const {
9615 LLVMContext &C = *DAG.getContext();
9617 EVT VT = VTList.VTs[0];
9618
 // Result list is {value, chain} or, when TFE is requested, {value, status,
 // chain}; the extra VT is the TFE status.
9619 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9620 bool IsTFE = VTList.NumVTs == 3;
9621 if (IsTFE) {
 // Widen the result to whole i32 dwords plus one extra dword that receives
 // the TFE status, recurse to emit the widened load, then split the value
 // dwords and the status dword back apart and bitcast to the original VT.
9622 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9623 unsigned NumOpDWords = NumValueDWords + 1;
9624 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9625 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9626 MachineMemOperand *OpDWordsMMO =
9627 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9628 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9629 OpDWordsVT, OpDWordsMMO, DAG);
9631 DAG.getVectorIdxConstant(NumValueDWords, DL));
9632 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
 // A single value dword is extracted as a scalar; otherwise pull out the
 // leading subvector of value dwords.
9633 SDValue ValueDWords =
9634 NumValueDWords == 1
9635 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9637 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9638 ZeroIdx);
9639 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9640 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9641 }
9642
 // Subtargets without dwordx3 loads/stores: widen v3i32/v3f32 to a 4-element
 // load and extract the original 3-element subvector from the result.
9643 if (!Subtarget->hasDwordx3LoadStores() &&
9644 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9645 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9646 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9647 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9648 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9649 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9650 WidenedMemVT, WidenedMMO);
9652 DAG.getVectorIdxConstant(0, DL));
9653 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9654 }
9655
 // No widening needed: emit the memory intrinsic unchanged.
9656 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9657}
9658
// Prepare a D16 (16-bit channel) store payload for the subtarget's expected
// memory-operand layout: zero-extend each element to a dword on targets with
// unpacked D16 VMEM, repack into i32 lanes to work around the gfx8.1 d16
// image-store register-count bug, widen v3 payloads to v4, and otherwise
// return the (already legal) packed data unchanged.
9659SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9660 bool ImageStore) const {
9661 EVT StoreVT = VData.getValueType();
9662
9663 // No change for f16 and legal vector D16 types.
9664 if (!StoreVT.isVector())
9665 return VData;
9666
9667 SDLoc DL(VData);
9668 unsigned NumElements = StoreVT.getVectorNumElements();
9669
9670 if (Subtarget->hasUnpackedD16VMem()) {
9671 // We need to unpack the packed data to store.
9672 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9673 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9674
 // One i32 lane per 16-bit element, zero-extended.
9675 EVT EquivStoreVT =
9676 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9677 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9678 return DAG.UnrollVectorOp(ZExt.getNode());
9679 }
9680
9681 // The sq block of gfx8.1 does not estimate register use correctly for d16
9682 // image store instructions. The data operand is computed as if it were not a
9683 // d16 image instruction.
9684 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9685 // Bitcast to i16
9686 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9687 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9688
9689 // Decompose into scalars
9691 DAG.ExtractVectorElements(IntVData, Elts);
9692
9693 // Group pairs of i16 into v2i16 and bitcast to i32
9694 SmallVector<SDValue, 4> PackedElts;
9695 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9696 SDValue Pair =
9697 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9698 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9699 PackedElts.push_back(IntPair);
9700 }
 // Odd element count: pad the trailing element with UNDEF to fill the pair.
9701 if ((NumElements % 2) == 1) {
9702 // Handle v3i16
9703 unsigned I = Elts.size() / 2;
9704 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9705 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9706 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9707 PackedElts.push_back(IntPair);
9708 }
9709
9710 // Pad using UNDEF
9711 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9712
9713 // Build final vector
9714 EVT VecVT =
9715 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9716 return DAG.getBuildVector(VecVT, DL, PackedElts);
9717 }
9718
 // Three 16-bit elements: zero-extend through an integer of the widened
 // (4-element) store size and bitcast to the 4-element vector type.
9719 if (NumElements == 3) {
9720 EVT IntStoreVT =
9722 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9723
9724 EVT WidenedStoreVT = EVT::getVectorVT(
9725 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9726 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9727 WidenedStoreVT.getStoreSizeInBits());
9728 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9729 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9730 }
9731
9732 assert(isTypeLegal(StoreVT));
9733 return VData;
9734}
9735
// Lower an ISD::INTRINSIC_VOID node by dispatching on the intrinsic ID.
// Cases either select a target machine node directly, rewrite the node into
// a target-specific memory-intrinsic node, or return the incoming chain to
// delete the intrinsic entirely; the default case falls back to image
// intrinsic lowering, or returns Op unchanged.
9736SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9737 SelectionDAG &DAG) const {
9738 SDLoc DL(Op);
9739 SDValue Chain = Op.getOperand(0);
9740 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9742
9743 switch (IntrinsicID) {
9744 case Intrinsic::amdgcn_exp_compr: {
9745 if (!Subtarget->hasCompressedExport()) {
9746 DiagnosticInfoUnsupported BadIntrin(
9748 "intrinsic not supported on subtarget", DL.getDebugLoc());
9749 DAG.getContext()->diagnose(BadIntrin);
9750 }
9751 SDValue Src0 = Op.getOperand(4);
9752 SDValue Src1 = Op.getOperand(5);
9753 // Hack around illegal type on SI by directly selecting it.
9754 if (isTypeLegal(Src0.getValueType()))
9755 return SDValue();
9756
9757 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9758 SDValue Undef = DAG.getUNDEF(MVT::f32);
9759 const SDValue Ops[] = {
9760 Op.getOperand(2), // tgt
9761 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9762 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9763 Undef, // src2
9764 Undef, // src3
9765 Op.getOperand(7), // vm
9766 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9767 Op.getOperand(3), // en
9768 Op.getOperand(0) // Chain
9769 };
9770
9771 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9772 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9773 }
9774 case Intrinsic::amdgcn_s_barrier:
9775 case Intrinsic::amdgcn_s_barrier_signal:
9776 case Intrinsic::amdgcn_s_barrier_wait: {
9779 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9780 if (WGSize <= ST.getWavefrontSize()) {
9781 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9782 // s_barrier/s_barrier_wait to wave_barrier.
9783 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9784 return Op.getOperand(0);
9785 else
9786 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9787 MVT::Other, Op.getOperand(0)),
9788 0);
9789 }
9790 }
9791
9792 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9793 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9794 SDValue K =
9796 SDValue BarSignal =
9797 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9798 MVT::Other, K, Op.getOperand(0)),
9799 0);
9800 SDValue BarWait =
9801 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9802 BarSignal.getValue(0)),
9803 0);
9804 return BarWait;
9805 }
9806
 // Otherwise keep the barrier intrinsic for normal selection.
9807 return SDValue();
9808 };
9809
 // (t)buffer store cases below share a common shape: pack D16 data if
 // needed, split the combined offset into voffset + immediate, select the
 // soffset operand, then emit a target memory-intrinsic node carrying the
 // original MemSDNode's memory operand.
9810 case Intrinsic::amdgcn_struct_tbuffer_store:
9811 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9812 SDValue VData = Op.getOperand(2);
9813 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9814 if (IsD16)
9815 VData = handleD16VData(VData, DAG);
9816 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9817 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9818 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9819 SDValue Ops[] = {
9820 Chain,
9821 VData, // vdata
9822 Rsrc, // rsrc
9823 Op.getOperand(4), // vindex
9824 VOffset, // voffset
9825 SOffset, // soffset
9826 Offset, // offset
9827 Op.getOperand(7), // format
9828 Op.getOperand(8), // cachepolicy, swizzled buffer
9829 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9830 };
9831 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9833 MemSDNode *M = cast<MemSDNode>(Op);
9834 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9835 M->getMemoryVT(), M->getMemOperand());
9836 }
9837
9838 case Intrinsic::amdgcn_raw_tbuffer_store:
9839 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9840 SDValue VData = Op.getOperand(2);
9841 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9842 if (IsD16)
9843 VData = handleD16VData(VData, DAG);
9844 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9845 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9846 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9847 SDValue Ops[] = {
9848 Chain,
9849 VData, // vdata
9850 Rsrc, // rsrc
9851 DAG.getConstant(0, DL, MVT::i32), // vindex
9852 VOffset, // voffset
9853 SOffset, // soffset
9854 Offset, // offset
9855 Op.getOperand(6), // format
9856 Op.getOperand(7), // cachepolicy, swizzled buffer
9857 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9858 };
9859 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9861 MemSDNode *M = cast<MemSDNode>(Op);
9862 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9863 M->getMemoryVT(), M->getMemOperand());
9864 }
9865
9866 case Intrinsic::amdgcn_raw_buffer_store:
9867 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9868 case Intrinsic::amdgcn_raw_buffer_store_format:
9869 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9870 const bool IsFormat =
9871 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9872 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9873
9874 SDValue VData = Op.getOperand(2);
9875 EVT VDataVT = VData.getValueType();
9876 EVT EltType = VDataVT.getScalarType();
9877 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9878 if (IsD16) {
9879 VData = handleD16VData(VData, DAG);
9880 VDataVT = VData.getValueType();
9881 }
9882
 // Illegal payload types are stored through an equivalent legal memory
 // type via bitcast.
9883 if (!isTypeLegal(VDataVT)) {
9884 VData =
9885 DAG.getNode(ISD::BITCAST, DL,
9886 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9887 }
9888
9889 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9890 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9891 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9892 SDValue Ops[] = {
9893 Chain,
9894 VData,
9895 Rsrc,
9896 DAG.getConstant(0, DL, MVT::i32), // vindex
9897 VOffset, // voffset
9898 SOffset, // soffset
9899 Offset, // offset
9900 Op.getOperand(6), // cachepolicy, swizzled buffer
9901 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9902 };
9903 unsigned Opc =
9905 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9906 MemSDNode *M = cast<MemSDNode>(Op);
9907
9908 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9909 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9910 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9911
9912 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9913 M->getMemoryVT(), M->getMemOperand());
9914 }
9915
9916 case Intrinsic::amdgcn_struct_buffer_store:
9917 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9918 case Intrinsic::amdgcn_struct_buffer_store_format:
9919 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9920 const bool IsFormat =
9921 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9922 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9923
9924 SDValue VData = Op.getOperand(2);
9925 EVT VDataVT = VData.getValueType();
9926 EVT EltType = VDataVT.getScalarType();
9927 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9928
9929 if (IsD16) {
9930 VData = handleD16VData(VData, DAG);
9931 VDataVT = VData.getValueType();
9932 }
9933
9934 if (!isTypeLegal(VDataVT)) {
9935 VData =
9936 DAG.getNode(ISD::BITCAST, DL,
9937 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9938 }
9939
9940 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9941 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9942 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9943 SDValue Ops[] = {
9944 Chain,
9945 VData,
9946 Rsrc,
9947 Op.getOperand(4), // vindex
9948 VOffset, // voffset
9949 SOffset, // soffset
9950 Offset, // offset
9951 Op.getOperand(7), // cachepolicy, swizzled buffer
9952 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9953 };
9954 unsigned Opc =
9956 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9957 MemSDNode *M = cast<MemSDNode>(Op);
9958
9959 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9960 EVT VDataType = VData.getValueType().getScalarType();
9961 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9962 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9963
9964 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9965 M->getMemoryVT(), M->getMemOperand());
9966 }
 // Buffer-to-LDS load: select the MUBUF *_LDS_* machine instruction
 // directly; the LDS destination base is passed through M0 (copyToM0 below)
 // and the addressing variant is chosen from the vindex/voffset operands.
9967 case Intrinsic::amdgcn_raw_buffer_load_lds:
9968 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9969 case Intrinsic::amdgcn_struct_buffer_load_lds:
9970 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9971 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9972 unsigned Opc;
9973 bool HasVIndex =
9974 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9975 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9976 unsigned OpOffset = HasVIndex ? 1 : 0;
9977 SDValue VOffset = Op.getOperand(5 + OpOffset);
9978 bool HasVOffset = !isNullConstant(VOffset);
9979 unsigned Size = Op->getConstantOperandVal(4);
9980
 // Unsupported transfer sizes fall back to default handling.
9981 switch (Size) {
9982 default:
9983 return SDValue();
9984 case 1:
9985 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9986 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9987 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9988 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9989 break;
9990 case 2:
9991 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9992 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9993 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9994 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9995 break;
9996 case 4:
9997 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9998 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9999 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10000 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10001 break;
10002 case 12:
10003 if (!Subtarget->hasLDSLoadB96_B128())
10004 return SDValue();
10005 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10006 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10007 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10008 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10009 break;
10010 case 16:
10011 if (!Subtarget->hasLDSLoadB96_B128())
10012 return SDValue();
10013 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10014 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10015 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10016 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10017 break;
10018 }
10019
10020 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10021
10023
 // BOTHEN variants take vindex+voffset packed into a v2i32.
10024 if (HasVIndex && HasVOffset)
10025 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10026 {Op.getOperand(5), // VIndex
10027 VOffset}));
10028 else if (HasVIndex)
10029 Ops.push_back(Op.getOperand(5));
10030 else if (HasVOffset)
10031 Ops.push_back(VOffset);
10032
10033 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10034 Ops.push_back(Rsrc);
10035 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10036 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10037 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10038 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10040 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10041 DL, MVT::i8)); // cpol
10043 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10044 ? 1
10045 : 0,
10046 DL, MVT::i8)); // swz
10047 Ops.push_back(M0Val.getValue(0)); // Chain
10048 Ops.push_back(M0Val.getValue(1)); // Glue
10049
 // The instruction both reads the buffer and writes LDS, so attach one
 // load memref and one store memref.
10050 auto *M = cast<MemSDNode>(Op);
10051 MachineMemOperand *LoadMMO = M->getMemOperand();
10052 // Don't set the offset value here because the pointer points to the base of
10053 // the buffer.
10054 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10055
10056 MachinePointerInfo StorePtrI = LoadPtrI;
10057 LoadPtrI.V = PoisonValue::get(
10061
10062 auto F = LoadMMO->getFlags() &
10064 LoadMMO =
10066 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10067
10069 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10070 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10071
10072 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10073 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10074
10075 return SDValue(Load, 0);
10076 }
 // Global-to-LDS load: pick GLOBAL_LOAD_LDS_* by transfer size and, if the
 // address is uniform, switch to the SADDR form and split off a 32-bit
 // VGPR offset when one is visible in the address computation.
10077 case Intrinsic::amdgcn_global_load_lds: {
10078 unsigned Opc;
10079 unsigned Size = Op->getConstantOperandVal(4);
10080 switch (Size) {
10081 default:
10082 return SDValue();
10083 case 1:
10084 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10085 break;
10086 case 2:
10087 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10088 break;
10089 case 4:
10090 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10091 break;
10092 case 12:
10093 if (!Subtarget->hasLDSLoadB96_B128())
10094 return SDValue();
10095 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10096 break;
10097 case 16:
10098 if (!Subtarget->hasLDSLoadB96_B128())
10099 return SDValue();
10100 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10101 break;
10102 }
10103
10104 auto *M = cast<MemSDNode>(Op);
10105 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10106
10108
10109 SDValue Addr = Op.getOperand(2); // Global ptr
10110 SDValue VOffset;
10111 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10112 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10113 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10114 SDValue LHS = Addr.getOperand(0);
10115 SDValue RHS = Addr.getOperand(1);
10116
10117 if (LHS->isDivergent())
10118 std::swap(LHS, RHS);
10119
10120 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10121 RHS.getOperand(0).getValueType() == MVT::i32) {
10122 // add (i64 sgpr), (zero_extend (i32 vgpr))
10123 Addr = LHS;
10124 VOffset = RHS.getOperand(0);
10125 }
10126 }
10127
10128 Ops.push_back(Addr);
10129 if (!Addr->isDivergent()) {
10130 Opc = AMDGPU::getGlobalSaddrOp(Opc);
 // SADDR form still needs a VGPR offset operand; default it to 0.
10131 if (!VOffset)
10132 VOffset =
10133 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10134 DAG.getTargetConstant(0, DL, MVT::i32)),
10135 0);
10136 Ops.push_back(VOffset);
10137 }
10138
10139 Ops.push_back(Op.getOperand(5)); // Offset
10140 Ops.push_back(Op.getOperand(6)); // CPol
10141 Ops.push_back(M0Val.getValue(0)); // Chain
10142 Ops.push_back(M0Val.getValue(1)); // Glue
10143
10144 MachineMemOperand *LoadMMO = M->getMemOperand();
10145 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10146 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10147 MachinePointerInfo StorePtrI = LoadPtrI;
10148 LoadPtrI.V = PoisonValue::get(
10152 auto F = LoadMMO->getFlags() &
10154 LoadMMO =
10156 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10158 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10159 LoadMMO->getAAInfo());
10160
10161 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10162 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10163
10164 return SDValue(Load, 0);
10165 }
10166 case Intrinsic::amdgcn_end_cf:
10167 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10168 Op->getOperand(2), Chain),
10169 0);
10170 case Intrinsic::amdgcn_s_barrier_init:
10171 case Intrinsic::amdgcn_s_barrier_signal_var: {
10172 // these two intrinsics have two operands: barrier pointer and member count
10173 SDValue Chain = Op->getOperand(0);
10175 SDValue BarOp = Op->getOperand(2);
10176 SDValue CntOp = Op->getOperand(3);
10177 SDValue M0Val;
10178 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10179 ? AMDGPU::S_BARRIER_INIT_M0
10180 : AMDGPU::S_BARRIER_SIGNAL_M0;
10181 // extract the BarrierID from bits 4-9 of BarOp
10182 SDValue BarID;
10183 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10184 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10185 BarID =
10186 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10187 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10188 0);
10189 // Member count should be put into M0[ShAmt:+6]
10190 // Barrier ID should be put into M0[5:0]
 // NOTE(review): the S_AND_B32 result below is immediately overwritten --
 // the SHL on the next statement shifts the unmasked CntOp, so the 0x3F
 // mask on the member count is dead. Confirm whether the shift was meant
 // to use the masked value.
10191 M0Val =
10192 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10193 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10194 0);
10195 constexpr unsigned ShAmt = 16;
10196 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10197 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10198
10199 M0Val = SDValue(
10200 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10201
10202 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10203
10204 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10205 return SDValue(NewMI, 0);
10206 }
10207 case Intrinsic::amdgcn_s_barrier_join: {
10208 // these three intrinsics have one operand: barrier pointer
10209 SDValue Chain = Op->getOperand(0);
10211 SDValue BarOp = Op->getOperand(2);
10212 unsigned Opc;
10213
 // Constant barrier pointer: fold the barrier ID into the IMM form;
 // otherwise compute the ID and pass it through M0.
10214 if (isa<ConstantSDNode>(BarOp)) {
10215 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10216 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10217
10218 // extract the BarrierID from bits 4-9 of the immediate
10219 unsigned BarID = (BarVal >> 4) & 0x3F;
10220 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10221 Ops.push_back(K);
10222 Ops.push_back(Chain);
10223 } else {
10224 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10225
10226 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10227 SDValue M0Val;
10228 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10229 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10230 M0Val =
10231 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10232 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10233 0);
10234 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10235 }
10236
10237 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10238 return SDValue(NewMI, 0);
10239 }
10240 case Intrinsic::amdgcn_s_prefetch_data: {
10241 // For non-global address space preserve the chain and remove the call.
10242 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10243 return Op.getOperand(0);
10244 return Op;
10245 }
10246 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10247 SDValue Ops[] = {
10248 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10249 Op.getOperand(3), // offset
10250 Op.getOperand(4), // length
10251 };
10252
10253 MemSDNode *M = cast<MemSDNode>(Op);
10255 Op->getVTList(), Ops, M->getMemoryVT(),
10256 M->getMemOperand());
10257 }
10258 default: {
 // Image dimension intrinsics get dedicated lowering; anything else is
 // left for normal selection.
10259 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10261 return lowerImage(Op, ImageDimIntr, DAG, true);
10262
10263 return Op;
10264 }
10265 }
10266}
10267
10268// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10269// offset (the offset that is included in bounds checking and swizzling, to be
10270// split between the instruction's voffset and immoffset fields) and soffset
10271// (the offset that is excluded from bounds checking and swizzling, to go in
10272// the instruction's soffset field). This function takes the first kind of
10273// offset and figures out how to split it between voffset and immoffset.
10274std::pair<SDValue, SDValue>
10275SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10276 SDLoc DL(Offset);
10277 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10278 SDValue N0 = Offset;
10279 ConstantSDNode *C1 = nullptr;
10280
10281 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10282 N0 = SDValue();
10283 else if (DAG.isBaseWithConstantOffset(N0)) {
10284 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10285 N0 = N0.getOperand(0);
10286 }
10287
10288 if (C1) {
10289 unsigned ImmOffset = C1->getZExtValue();
10290 // If the immediate value is too big for the immoffset field, put only bits
10291 // that would normally fit in the immoffset field. The remaining value that
10292 // is copied/added for the voffset field is a large power of 2, and it
10293 // stands more chance of being CSEd with the copy/add for another similar
10294 // load/store.
10295 // However, do not do that rounding down if that is a negative
10296 // number, as it appears to be illegal to have a negative offset in the
10297 // vgpr, even if adding the immediate offset makes it positive.
10298 unsigned Overflow = ImmOffset & ~MaxImm;
10299 ImmOffset -= Overflow;
10300 if ((int32_t)Overflow < 0) {
10301 Overflow += ImmOffset;
10302 ImmOffset = 0;
10303 }
10304 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10305 if (Overflow) {
10306 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10307 if (!N0)
10308 N0 = OverflowVal;
10309 else {
10310 SDValue Ops[] = {N0, OverflowVal};
10311 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10312 }
10313 }
10314 }
10315 if (!N0)
10316 N0 = DAG.getConstant(0, DL, MVT::i32);
10317 if (!C1)
10318 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10319 return {N0, SDValue(C1, 0)};
10320}
10321
10322// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10323// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10324// pointed to by Offsets.
10325void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10326 SelectionDAG &DAG, SDValue *Offsets,
10327 Align Alignment) const {
10329 SDLoc DL(CombinedOffset);
 // Entirely-constant offset: try to split it into an soffset constant and
 // an instruction immediate.
10330 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10331 uint32_t Imm = C->getZExtValue();
10332 uint32_t SOffset, ImmOffset;
10333 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10334 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10335 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10336 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10337 return;
10338 }
10339 }
 // base + constant: keep the variable base in voffset and split the
 // non-negative constant between soffset and the immediate field.
10340 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10341 SDValue N0 = CombinedOffset.getOperand(0);
10342 SDValue N1 = CombinedOffset.getOperand(1);
10343 uint32_t SOffset, ImmOffset;
10344 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10345 if (Offset >= 0 &&
10346 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10347 Offsets[0] = N0;
10348 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10349 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10350 return;
10351 }
10352 }
10353
 // Fallback: the whole combined offset goes in voffset; soffset is
 // SGPR_NULL on subtargets with a restricted soffset field, 0 otherwise.
10354 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10355 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10356 : DAG.getConstant(0, DL, MVT::i32);
10357
10358 Offsets[0] = CombinedOffset;
10359 Offsets[1] = SOffsetZero;
10360 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10361}
10362
10363SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10364 SelectionDAG &DAG) const {
10365 if (!MaybePointer.getValueType().isScalarInteger())
10366 return MaybePointer;
10367
10368 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10369 return Rsrc;
10370}
10371
10372// Wrap a global or flat pointer into a buffer intrinsic using the flags
10373// specified in the intrinsic.
10374SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10375 SelectionDAG &DAG) const {
10376 SDLoc Loc(Op);
10377
10378 SDValue Pointer = Op->getOperand(1);
10379 SDValue Stride = Op->getOperand(2);
10380 SDValue NumRecords = Op->getOperand(3);
10381 SDValue Flags = Op->getOperand(4);
10382
10383 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10384 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10385 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10386 std::optional<uint32_t> ConstStride = std::nullopt;
10387 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10388 ConstStride = ConstNode->getZExtValue();
10389
10390 SDValue NewHighHalf = Masked;
10391 if (!ConstStride || *ConstStride != 0) {
10392 SDValue ShiftedStride;
10393 if (ConstStride) {
10394 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10395 } else {
10396 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10397 ShiftedStride =
10398 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10399 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10400 }
10401 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10402 }
10403
10404 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10405 NewHighHalf, NumRecords, Flags);
10406 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10407 return RsrcPtr;
10408}
10409
// Handle 8 bit and 16 bit buffer loads
//
// Widens a sub-dword buffer load to a 32-bit load, then truncates and
// bitcasts the result back to the requested type. With TFE a second status
// dword is also produced, so the result is {value, status, chain}.
// NOTE(review): some interior lines of this function are not visible in this
// chunk (opcode selection ternaries, Status extract).
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  // Integer equivalent of the requested load type (e.g. f16 -> i16).
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
    // The TFE form loads two dwords: the data and the status word.
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
                                 DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    // Narrow the loaded dword to the memory width, then restore the original
    // (possibly floating-point) type.
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  // Truncate the 32-bit result and bitcast back to the requested type.
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
10447
// Handle 8 bit and 16 bit buffer stores
//
// The store payload lives in Ops[1]; it is reinterpreted as an integer (for
// FP types) and widened to 32 bits before emitting the byte/short buffer
// store node. NOTE(review): one interior line (the BUFFER_STORE_SHORT arm of
// the opcode ternary) is not visible in this chunk.
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  // FP payloads are first reinterpreted as integers so the extension below
  // is well defined.
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  // Only the low bits are stored, so any-extend to 32 bits is sufficient.
  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
  // The buffer-store node consumes the first 9 operand slots.
  ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}
10464
                                 SDValue Op, const SDLoc &SL, EVT VT) {
  // If the requested type is narrower than the value, truncation is all
  // that is needed regardless of the load's extension kind.
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  // Otherwise extend using the same kind of extension the original load
  // performed.
  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
10483
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
//
// NOTE(review): a few interior lines (CONSTANT_ADDRESS_32BIT check, an
// assert, the EXTLOAD arm and TRUNCATE fallback) are not visible in this
// chunk.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Requires dword alignment and a uniform (non-divergent) load.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  // Re-issue the access as a plain 32-bit load. Range metadata describes the
  // narrow value, so it is dropped from the widened load.
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  // Integer type matching the original memory width (FP widths included).
  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
         "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Reproduce the original extension semantics inside the 32-bit result.
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  // Return the converted value together with the new load's chain.
  return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
}
10553
                                     const SIMachineFunctionInfo &Info) {
  // TODO: Should check if the address can definitely not access stack.
  // Entry functions can only reach scratch through flat instructions when
  // flat-scratch has been initialized; for all other functions we must
  // conservatively assume flat may touch private (scratch) memory.
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  return true;
}
10561
// Custom lowering for loads: decides, per memory/address-space constraints,
// whether a load is legal as-is (returns SDValue()), or must be widened,
// split, scalarized, or expanded.
// NOTE(review): several interior lines (MFI declaration, flat-scratch
// addressing checks, address-space list continuations, alignment-check
// headers) are not visible in this chunk.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  // Sub-dword non-extending loads: widen to 32 bits and truncate back.
  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    // i1 is loaded as a byte; anything else sub-dword as a short.
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    // Vector of i1: peel each bit out of the widened load by shifting.
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Hardware-bug workaround: misaligned multi-dword flat accesses are split.
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)

  unsigned NumElements = MemVT.getVectorNumElements();

  // Uniform, sufficiently aligned constant/global loads can use SMEM.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
    if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the load only if this (mis)alignment is actually fast.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
                    Load->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
10703
10704SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10705 EVT VT = Op.getValueType();
10706 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10707 VT.getSizeInBits() == 512)
10708 return splitTernaryVectorOp(Op, DAG);
10709
10710 assert(VT.getSizeInBits() == 64);
10711
10712 SDLoc DL(Op);
10713 SDValue Cond = Op.getOperand(0);
10714
10715 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10716 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10717
10718 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10719 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10720
10721 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10722 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10723
10724 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10725
10726 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10727 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10728
10729 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10730
10731 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10732 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10733}
10734
10735// Catch division cases where we can use shortcuts with rcp and rsq
10736// instructions.
10737SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10738 SelectionDAG &DAG) const {
10739 SDLoc SL(Op);
10740 SDValue LHS = Op.getOperand(0);
10741 SDValue RHS = Op.getOperand(1);
10742 EVT VT = Op.getValueType();
10743 const SDNodeFlags Flags = Op->getFlags();
10744
10745 bool AllowInaccurateRcp =
10746 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10747
10748 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10749 // Without !fpmath accuracy information, we can't do more because we don't
10750 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10751 // f16 is always accurate enough
10752 if (!AllowInaccurateRcp && VT != MVT::f16)
10753 return SDValue();
10754
10755 if (CLHS->isExactlyValue(1.0)) {
10756 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10757 // the CI documentation has a worst case error of 1 ulp.
10758 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10759 // use it as long as we aren't trying to use denormals.
10760 //
10761 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10762
10763 // 1.0 / sqrt(x) -> rsq(x)
10764
10765 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10766 // error seems really high at 2^29 ULP.
10767 // 1.0 / x -> rcp(x)
10768 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10769 }
10770
10771 // Same as for 1.0, but expand the sign out of the constant.
10772 if (CLHS->isExactlyValue(-1.0)) {
10773 // -1.0 / x -> rcp (fneg x)
10774 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10775 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10776 }
10777 }
10778
10779 // For f16 require afn or arcp.
10780 // For f32 require afn.
10781 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10782 return SDValue();
10783
10784 // Turn into multiply by the reciprocal.
10785 // x / y -> x * (1.0 / y)
10786 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10787 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10788}
10789
10790SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10791 SelectionDAG &DAG) const {
10792 SDLoc SL(Op);
10793 SDValue X = Op.getOperand(0);
10794 SDValue Y = Op.getOperand(1);
10795 EVT VT = Op.getValueType();
10796 const SDNodeFlags Flags = Op->getFlags();
10797
10798 bool AllowInaccurateDiv =
10799 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10800 if (!AllowInaccurateDiv)
10801 return SDValue();
10802
10803 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10804 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10805
10806 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10807 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10808
10809 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10810 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10811 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10812 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10813 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10814 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10815}
10816
10817static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10818 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10819 SDNodeFlags Flags) {
10820 if (GlueChain->getNumValues() <= 1) {
10821 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10822 }
10823
10824 assert(GlueChain->getNumValues() == 3);
10825
10826 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10827 switch (Opcode) {
10828 default:
10829 llvm_unreachable("no chain equivalent for opcode");
10830 case ISD::FMUL:
10831 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10832 break;
10833 }
10834
10835 return DAG.getNode(Opcode, SL, VTList,
10836 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10837 Flags);
10838}
10839
10840static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10841 EVT VT, SDValue A, SDValue B, SDValue C,
10842 SDValue GlueChain, SDNodeFlags Flags) {
10843 if (GlueChain->getNumValues() <= 1) {
10844 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10845 }
10846
10847 assert(GlueChain->getNumValues() == 3);
10848
10849 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10850 switch (Opcode) {
10851 default:
10852 llvm_unreachable("no chain equivalent for opcode");
10853 case ISD::FMA:
10854 Opcode = AMDGPUISD::FMA_W_CHAIN;
10855 break;
10856 }
10857
10858 return DAG.getNode(Opcode, SL, VTList,
10859 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10860 Flags);
10861}
10862
// Lower f16 fdiv by computing the quotient in f32 (rcp + two refinement
// steps), then rounding back and fixing up with v_div_fixup_f16.
// NOTE(review): one interior line (the FMAD-vs-FMA ternary) is not visible
// in this chunk.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  unsigned FMADOpCode =

  // Promote both operands to f32 and form the initial quotient n * rcp(d).
  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Rcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
  SDValue Quot =
      DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
  // Two error/correct rounds: err = n - d*q; q += err * rcp.
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                            Op->getFlags());
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                    Op->getFlags());
  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
  // Mask the final correction term to sign+exponent bits (0xff800000) before
  // adding it back, as in the pseudo-code above.
  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
                        DAG.getConstant(0xff800000, SL, MVT::i32));
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
  // Round to f16 and apply the divide fixup against the original operands.
  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
                             DAG.getTargetConstant(0, SL, MVT::i32));
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
                     Op->getFlags());
}
10911
10912// Faster 2.5 ULP division that does not support denormals.
10913SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10914 SDNodeFlags Flags = Op->getFlags();
10915 SDLoc SL(Op);
10916 SDValue LHS = Op.getOperand(1);
10917 SDValue RHS = Op.getOperand(2);
10918
10919 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10920
10921 const APFloat K0Val(0x1p+96f);
10922 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10923
10924 const APFloat K1Val(0x1p-32f);
10925 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10926
10927 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10928
10929 EVT SetCCVT =
10930 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10931
10932 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10933
10934 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10935
10936 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10937
10938 // rcp does not support denormals.
10939 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10940
10941 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10942
10943 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10944}
10945
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
                                     const SIMachineFunctionInfo *Info,
                                     const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  // Bits [1:0] take the requested f32 denorm mode; bits [3:2] keep the
  // function's default f64/f16 denorm mode unchanged.
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
10956
// Full-precision f32 division: div_scale + rcp + Newton-Raphson refinement +
// div_fmas + div_fixup. If the function does not preserve f32 denormals, the
// denorm mode is temporarily enabled around the refinement sequence (via
// S_DENORM_MODE or S_SETREG), threading chain+glue through the FP ops to
// pin their ordering.
// NOTE(review): the declaration of `Info` (SIMachineFunctionInfo from MF) is
// on a line not visible in this chunk.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  // Scale numerator/denominator out of the denormal range; div_fixup undoes
  // the scaling at the end.
  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
  SDValue NegDivScale0 =
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  // Hardware-register bitfield addressing MODE[5:4] (the f32 denorm bits).
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      // With a dynamic mode, read and remember the current MODE bits so they
      // can be restored afterwards.
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
    }

    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
                                 EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    // Attach the chain+glue of the mode switch to NegDivScale0 so the glued
    // FP ops below are ordered after it.
    SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
                      SDValue(EnableDenorm, 1)};

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  // Newton-Raphson refinement of the reciprocal and the quotient.
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
                           Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // Restore the previous denorm mode (either the saved dynamic value or
    // the function's static flush mode).
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);

      DisableDenorm =
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
                      DisableDenormValue, Fma4.getValue(2))
              .getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
11092
11093SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11094 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11095 return FastLowered;
11096
11097 SDLoc SL(Op);
11098 SDValue X = Op.getOperand(0);
11099 SDValue Y = Op.getOperand(1);
11100
11101 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11102
11103 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11104
11105 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11106
11107 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11108
11109 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11110
11111 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11112
11113 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11114
11115 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11116
11117 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11118
11119 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11120 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11121
11122 SDValue Fma4 =
11123 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11124
11125 SDValue Scale;
11126
11127 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11128 // Workaround a hardware bug on SI where the condition output from div_scale
11129 // is not usable.
11130
11131 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11132
11133 // Figure out if the scale to use for div_fmas.
11134 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11135 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11136 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11137 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11138
11139 SDValue NumHi =
11140 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11141 SDValue DenHi =
11142 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11143
11144 SDValue Scale0Hi =
11145 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11146 SDValue Scale1Hi =
11147 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11148
11149 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11150 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11151 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11152 } else {
11153 Scale = DivScale1.getValue(1);
11154 }
11155
11156 SDValue Fmas =
11157 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11158
11159 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11160}
11161
11162SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11163 EVT VT = Op.getValueType();
11164
11165 if (VT == MVT::f32)
11166 return LowerFDIV32(Op, DAG);
11167
11168 if (VT == MVT::f64)
11169 return LowerFDIV64(Op, DAG);
11170
11171 if (VT == MVT::f16)
11172 return LowerFDIV16(Op, DAG);
11173
11174 llvm_unreachable("Unexpected type for fdiv");
11175}
11176
// Lower ffrexp via the amdgcn frexp_mant / frexp_exp intrinsics, returning
// {mantissa, exponent}. NOTE(review): two interior lines (the mantissa
// intrinsic's opcode/type line and the infinity constant) are not visible in
// this chunk.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  EVT VT = Val.getValueType();
  EVT ResultExpVT = Op->getValueType(1);
  // The hardware exponent result is i16 for f16 inputs, i32 otherwise.
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    // On subtargets with the fract bug, non-finite inputs must be handled
    // manually: pass the input through and report a zero exponent.
    SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
    SDValue Inf =

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
    Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
    Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
  }

  // Widen/narrow the exponent to the type the caller expects.
  SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
  return DAG.getMergeValues({Mant, CastExp}, dl);
}
11206
// Custom lowering for stores: i1 stores become 32-bit truncating stores;
// vector stores are split/scalarized/expanded according to the address
// space and subtarget limits. NOTE(review): several interior lines (MFI
// declaration, flat-scratch checks, address-space condition and
// alignment-check headers) are not visible in this chunk.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  // Store i1 by extending to i32 and truncating back on the memory side.
  if (VT == MVT::i1) {
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Hardware-bug workaround: misaligned multi-dword flat accesses are split.
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)

  unsigned NumElements = VT.getVectorNumElements();
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

                                        VT, *Store->getMemOperand()))
      return expandUnalignedStore(Store, DAG);

    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private stores are limited by the private_element_size setting.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the store only if this (mis)alignment is actually fast.
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
                    Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
11286
11287// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11288SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11289 SDLoc SL(Op);
11290 assert(!Subtarget->has16BitInsts());
11291 SDNodeFlags Flags = Op->getFlags();
11292 SDValue Ext =
11293 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11294
11295 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11296 SDValue Sqrt =
11297 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11298
11299 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11300 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11301}
11302
// Expand f32 sqrt. With approximate-function flags the raw hardware intrinsic
// is used directly; otherwise small inputs (< 2^-96) are pre-scaled by 2^32,
// the sqrt is computed (with a ulp-correction step when denormal handling is
// needed), and the result is rescaled by 2^-16.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  // Inputs below this threshold are scaled up to keep intermediate values in
  // range, and the final result is compensated below.
  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  // SqrtX is the (possibly scaled) value actually fed to the sqrt expansion.
  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    // Start from the hardware sqrt, then test the neighboring representable
    // values (one ulp down and one ulp up, formed by +/-1 on the bit pattern)
    // and select whichever neighbor the residual fma tests prove closer.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    // Next value toward -inf: subtract 1 from the integer representation.
    SDValue SqrtSNextDownInt =
        DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                    DAG.getAllOnesConstant(DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    // Residual of the next-down candidate: x - nextdown(s) * s.
    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    // Next value toward +inf: add 1 to the integer representation.
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    // Residual of the next-up candidate: x - nextup(s) * s.
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    // Residual <= 0 means the estimate is too high: step down one ulp.
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    // Residual > 0 for the next-up candidate means the estimate is too low:
    // step up one ulp.
    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    // Newton-style refinement from an rsq estimate:
    //   s = x * rsq(x), h = 0.5 * rsq(x), then one correction of h and s
    //   followed by a final residual fma.
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    // Error term e = 0.5 - h * s.
    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    // Final residual d = x - s * s folded back in: s = d * h + s.
    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  // Undo the 2^32 input scaling: sqrt halves the exponent, so multiply the
  // result by 2^-16 when the input was scaled.
  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  // For +/-0 and +inf, sqrt(x) == x; pass the input through unchanged.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
11392
11393SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11394 // For double type, the SQRT and RSQ instructions don't have required
11395 // precision, we apply Goldschmidt's algorithm to improve the result:
11396 //
11397 // y0 = rsq(x)
11398 // g0 = x * y0
11399 // h0 = 0.5 * y0
11400 //
11401 // r0 = 0.5 - h0 * g0
11402 // g1 = g0 * r0 + g0
11403 // h1 = h0 * r0 + h0
11404 //
11405 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11406 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11407 // h2 = h1 * r1 + h1
11408 //
11409 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11410 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11411 //
11412 // sqrt(x) = g3
11413
11414 SDNodeFlags Flags = Op->getFlags();
11415
11416 SDLoc DL(Op);
11417
11418 SDValue X = Op.getOperand(0);
11419 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11420
11421 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11422
11423 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11424
11425 // Scale up input if it is too small.
11426 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11427 SDValue ScaleUp =
11428 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11429 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11430
11431 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11432
11433 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11434
11435 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11436 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11437
11438 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11439 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11440
11441 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11442
11443 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11444
11445 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11446 SDValue SqrtD0 =
11447 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11448
11449 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11450
11451 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11452 SDValue SqrtD1 =
11453 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11454
11455 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11456
11457 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11458 SDValue ScaleDown =
11459 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11460 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11461
11462 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11463 // with finite only or nsz because rsq(+/-0) = +/-inf
11464
11465 // TODO: Check for DAZ and expand to subnormals
11466 SDValue IsZeroOrInf =
11467 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11468 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11469
11470 // If x is +INF, +0, or -0, use its original value
11471 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11472 Flags);
11473}
11474
11475SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11476 SDLoc DL(Op);
11477 EVT VT = Op.getValueType();
11478 SDValue Arg = Op.getOperand(0);
11479 SDValue TrigVal;
11480
11481 // Propagate fast-math flags so that the multiply we introduce can be folded
11482 // if Arg is already the result of a multiply by constant.
11483 auto Flags = Op->getFlags();
11484
11485 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11486
11487 if (Subtarget->hasTrigReducedRange()) {
11488 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11489 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11490 } else {
11491 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11492 }
11493
11494 switch (Op.getOpcode()) {
11495 case ISD::FCOS:
11496 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11497 case ISD::FSIN:
11498 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11499 default:
11500 llvm_unreachable("Wrong trig opcode");
11501 }
11502}
11503
// Custom lowering for atomic compare-and-swap: pack the {new, old} values
// into a two-element vector operand as the hardware cmpswap expects.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  // Vector of two result-sized elements holding {new, old}.
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
11532
11533//===----------------------------------------------------------------------===//
11534// Custom DAG optimizations
11535//===----------------------------------------------------------------------===//
11536
11537SDValue
11538SITargetLowering::performUCharToFloatCombine(SDNode *N,
11539 DAGCombinerInfo &DCI) const {
11540 EVT VT = N->getValueType(0);
11541 EVT ScalarVT = VT.getScalarType();
11542 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11543 return SDValue();
11544
11545 SelectionDAG &DAG = DCI.DAG;
11546 SDLoc DL(N);
11547
11548 SDValue Src = N->getOperand(0);
11549 EVT SrcVT = Src.getValueType();
11550
11551 // TODO: We could try to match extracting the higher bytes, which would be
11552 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11553 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11554 // about in practice.
11555 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11556 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11557 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11558 DCI.AddToWorklist(Cvt.getNode());
11559
11560 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11561 if (ScalarVT != MVT::f32) {
11562 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11563 DAG.getTargetConstant(0, DL, MVT::i32));
11564 }
11565 return Cvt;
11566 }
11567 }
11568
11569 return SDValue();
11570}
11571
11572SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11573 DAGCombinerInfo &DCI) const {
11574 SDValue MagnitudeOp = N->getOperand(0);
11575 SDValue SignOp = N->getOperand(1);
11576 SelectionDAG &DAG = DCI.DAG;
11577 SDLoc DL(N);
11578
11579 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11580 // lower half with a copy.
11581 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11582 if (MagnitudeOp.getValueType() == MVT::f64) {
11583 SDValue MagAsVector =
11584 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11585 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11586 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11587 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11588 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11589
11590 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11591
11592 SDValue Vector =
11593 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11594
11595 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11596 }
11597
11598 if (SignOp.getValueType() != MVT::f64)
11599 return SDValue();
11600
11601 // Reduce width of sign operand, we only need the highest bit.
11602 //
11603 // fcopysign f64:x, f64:y ->
11604 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11605 // TODO: In some cases it might make sense to go all the way to f16.
11606 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11607 SDValue SignAsF32 =
11608 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11609 DAG.getConstant(1, DL, MVT::i32));
11610
11611 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11612 SignAsF32);
11613}
11614
11615// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11616// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11617// bits
11618
11619// This is a variant of
11620// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11621//
11622// The normal DAG combiner will do this, but only if the add has one use since
11623// that would increase the number of instructions.
11624//
11625// This prevents us from seeing a constant offset that can be folded into a
11626// memory instruction's addressing mode. If we know the resulting add offset of
11627// a pointer can be folded into an addressing offset, we can replace the pointer
11628// operand with the add of new constant offset. This eliminates one of the uses,
11629// and may allow the remaining use to also be simplified.
11630//
// See the comment above: reassociate (shl (add/or x, c1), c2) into
// add (shl x, c2), (c1 << c2) when the resulting constant offset fits the
// addressing mode for \p AddrSpace / \p MemVT.
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  // Both the shift amount and the inner add/or operand must be constants.
  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // An OR only behaves like an ADD when the operands share no set bits.
  if (N0->getOpcode() == ISD::OR &&
      !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the
  // addressing mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, VT);
  // nuw is preserved only when the original shl had it and the inner op
  // cannot wrap either (OR of disjoint bits never wraps).
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
      (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
11681
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  case ISD::STORE:
    // Stores: operand 0 is the chain, operand 1 the stored value, so the
    // pointer is operand 2.
    return 2;
  default:
    // Everything else places the pointer right after the chain.
    return 1;
  }
}
11695
11696SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11697 DAGCombinerInfo &DCI) const {
11698 SelectionDAG &DAG = DCI.DAG;
11699 SDLoc SL(N);
11700
11701 unsigned PtrIdx = getBasePtrIndex(N);
11702 SDValue Ptr = N->getOperand(PtrIdx);
11703
11704 // TODO: We could also do this for multiplies.
11705 if (Ptr.getOpcode() == ISD::SHL) {
11706 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11707 N->getMemoryVT(), DCI);
11708 if (NewPtr) {
11709 SmallVector<SDValue, 8> NewOps(N->ops());
11710
11711 NewOps[PtrIdx] = NewPtr;
11712 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11713 }
11714 }
11715
11716 return SDValue();
11717}
11718
11719static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11720 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11721 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11722 (Opc == ISD::XOR && Val == 0);
11723}
11724
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  // Split the 64-bit constant into its two 32-bit halves.
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);

  // Split when either half trivially folds, or when splitting avoids
  // materializing a non-inline 64-bit immediate.
  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
11749
  // Only i1 values are candidates.
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::SETCC:
    return true;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    // A logical combination of bool SGPRs is itself a bool SGPR.
    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
  }
  return false;
}
11766
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  // A byte that is neither fully zero nor fully ones disqualifies C.
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
11785
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or ~0 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
  assert(V.getValueSizeInBits() == 32);

  // Only binary ops with a constant second operand are recognized.
  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    // Kept bytes select the source byte (0-3), cleared bytes select zero.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    // Kept bytes select the source byte, saturated bytes select 0xff.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    // Only whole-byte shifts preserve byte boundaries.
    if (C % 8)
      return ~0;

    // Shift the identity selector 0x03020100 left, filling with zero (0x0c).
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    // Shift the identity selector right, filling with zero (0x0c).
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
11833
// DAG combine for ISD::AND: splits 64-bit constant ANDs, forms BFE/PERM/
// FP_CLASS patterns, and turns (and x, (sext i1)) into a select.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // i64 AND with a constant: try splitting into two 32-bit ops.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE =
              DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
                          DAG.getConstant(Offset, SL, MVT::i32),
                          DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
        !isTypeLegal(X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      // Require the ordered compare to be (fcmp ord x, x).
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        static_assert(
             0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
                           DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  // Canonicalize so the fp_class sits on the RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
                           DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
12018
12019// A key component of v_perm is a mapping between byte position of the src
12020// operands, and the byte position of the dest. To provide such, we need: 1. the
12021// node that provides x byte of the dest of the OR, and 2. the byte of the node
12022// used to provide that x byte. calculateByteProvider finds which node provides
12023// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
12024// and finds an ultimate src and byte position For example: The supported
12025// LoadCombine pattern for vector loads is as follows
12026// t1
12027// or
12028// / \
12029// t2 t3
12030// zext shl
12031// | | \
12032// t4 t5 16
12033// or anyext
12034// / \ |
12035// t6 t7 t8
12036// srl shl or
12037// / | / \ / \
12038// t9 t10 t11 t12 t13 t14
12039// trunc* 8 trunc* 8 and and
12040// | | / | | \
12041// t15 t16 t17 t18 t19 t20
12042// trunc* 255 srl -256
12043// | / \
12044// t15 t15 16
12045//
12046// *In this example, the truncs are from i32->i16
12047//
12048// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12049// respectively. calculateSrcByte would find (given node) -> ultimate src &
12050// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12051// After finding the mapping, we can combine the tree into vperm t15, t16,
12052// 0x05000407
12053
// Find the source and byte position from a node.
// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// dest of the or byte. \p Depth tracks how many recursive iterations we have
// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs
  if (Depth >= 6)
    return std::nullopt;

  // A value narrower than one byte cannot provide a byte.
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  // Vectors are treated as opaque sources.
  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE: {
    // A truncate preserves the low bytes; look through it.
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    SDValue NarrowOp = Op->getOperand(0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowVT = VTSign->getVT();
    }
    // The narrow type must be a whole number of bytes.
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    // Bytes beyond the narrow width are extension bits, not source bytes.
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SRA:
  case ISD::SRL: {
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    // Only byte-aligned shifts preserve byte boundaries.
    if (BitShift % 8 != 0)
      return std::nullopt;

    // A right shift by k bytes means this byte comes k bytes higher in the
    // source.
    SrcIndex += BitShift / 8;

    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  default: {
    // Anything else is an opaque ultimate source.
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
  }
  }
  llvm_unreachable("fully handled switch");
}
12117
12118// For a byte position in the result of an Or, traverse the tree and find the
12119// node (and the byte of the node) which ultimately provides this {Or,
12120// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12121// the byte position of the Op that corresponds with the originally requested
12122// byte of the Or \p Depth tracks how many recursive iterations we have
12123// performed. \p StartingIndex is the originally requested byte of the Or
12124static const std::optional<ByteProvider<SDValue>>
12125calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12126 unsigned StartingIndex = 0) {
12127 // Finding Src tree of RHS of or typically requires at least 1 additional
12128 // depth
12129 if (Depth > 6)
12130 return std::nullopt;
12131
 // Only byte-aligned, in-range byte positions of byte-sized scalars can be
 // tracked; bail out otherwise.
12132 unsigned BitWidth = Op.getScalarValueSizeInBits();
12133 if (BitWidth % 8 != 0)
12134 return std::nullopt;
12135 if (Index > BitWidth / 8 - 1)
12136 return std::nullopt;
12137
12138 bool IsVec = Op.getValueType().isVector();
12139 switch (Op.getOpcode()) {
12140 case ISD::OR: {
12141 if (IsVec)
12142 return std::nullopt;
12143
 // Resolve the requested byte through both operands of the Or.
12144 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12145 StartingIndex);
12146 if (!RHS)
12147 return std::nullopt;
12148 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12149 StartingIndex);
12150 if (!LHS)
12151 return std::nullopt;
12152 // A well formed Or will have two ByteProviders for each byte, one of which
12153 // is constant zero
12154 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12155 return std::nullopt;
 // Whichever side is constant zero, the other side provides the byte.
12156 if (!LHS || LHS->isConstantZero())
12157 return RHS;
12158 if (!RHS || RHS->isConstantZero())
12159 return LHS;
12160 return std::nullopt;
12161 }
12162
12163 case ISD::AND: {
12164 if (IsVec)
12165 return std::nullopt;
12166
12167 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12168 if (!BitMaskOp)
12169 return std::nullopt;
12170
12171 uint32_t BitMask = BitMaskOp->getZExtValue();
12172 // Bits we expect for our StartingIndex
12173 uint32_t IndexMask = 0xFF << (Index * 8);
12174
 // The mask must either fully preserve or fully clear the requested byte.
12175 if ((IndexMask & BitMask) != IndexMask) {
12176 // If the result of the and partially provides the byte, then it
12177 // is not well formatted
12178 if (IndexMask & BitMask)
12179 return std::nullopt;
12181 }
12182
 // Byte fully preserved by the mask: it comes straight from operand 0.
12183 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12184 }
12185
12186 case ISD::FSHR: {
12187 if (IsVec)
12188 return std::nullopt;
12189
12190 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12191 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12192 if (!ShiftOp || Op.getValueType().isVector())
12193 return std::nullopt;
12194
12195 uint64_t BitsProvided = Op.getValueSizeInBits();
12196 if (BitsProvided % 8 != 0)
12197 return std::nullopt;
12198
 // Funnel-shift amounts are taken modulo the bit width; only whole-byte
 // shifts can be tracked.
12199 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12200 if (BitShift % 8)
12201 return std::nullopt;
12202
 // Model fshr as a byte rotate over the X:Y concatenation, then map the
 // rotated index back to whichever of the two operands holds the byte.
12203 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12204 uint64_t ByteShift = BitShift / 8;
12205
12206 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12207 uint64_t BytesProvided = BitsProvided / 8;
12208 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12209 NewIndex %= BytesProvided;
12210 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12211 }
12212
12213 case ISD::SRA:
12214 case ISD::SRL: {
12215 if (IsVec)
12216 return std::nullopt;
12217
12218 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12219 if (!ShiftOp)
12220 return std::nullopt;
12221
12222 uint64_t BitShift = ShiftOp->getZExtValue();
12223 if (BitShift % 8)
12224 return std::nullopt;
12225
12226 auto BitsProvided = Op.getScalarValueSizeInBits();
12227 if (BitsProvided % 8 != 0)
12228 return std::nullopt;
12229
12230 uint64_t BytesProvided = BitsProvided / 8;
12231 uint64_t ByteShift = BitShift / 8;
12232 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12233 // If the byte we are trying to provide (as tracked by index) falls in this
12234 // range, then the SRL provides the byte. The byte of interest of the src of
12235 // the SRL is Index + ByteShift
12236 return BytesProvided - ByteShift > Index
12237 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12238 Index + ByteShift)
12240 }
12241
12242 case ISD::SHL: {
12243 if (IsVec)
12244 return std::nullopt;
12245
12246 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12247 if (!ShiftOp)
12248 return std::nullopt;
12249
12250 uint64_t BitShift = ShiftOp->getZExtValue();
12251 if (BitShift % 8 != 0)
12252 return std::nullopt;
12253 uint64_t ByteShift = BitShift / 8;
12254
12255 // If we are shifting by an amount greater than (or equal to)
12256 // the index we are trying to provide, then it provides 0s. If not,
12257 // then this bytes are not definitively 0s, and the corresponding byte
12258 // of interest is Index - ByteShift of the src
12259 return Index < ByteShift
12261 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12262 Depth + 1, StartingIndex);
12263 }
 // Extension nodes: bytes inside the narrow value come from the source;
 // bytes beyond it are only known (to be zero) for ZERO_EXTEND.
12264 case ISD::ANY_EXTEND:
12265 case ISD::SIGN_EXTEND:
12266 case ISD::ZERO_EXTEND:
12268 case ISD::AssertZext:
12269 case ISD::AssertSext: {
12270 if (IsVec)
12271 return std::nullopt;
12272
12273 SDValue NarrowOp = Op->getOperand(0);
12274 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
 // For in-register extends / asserts the effective narrow width is carried
 // by the VT operand, not by the operand's value type.
12275 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12276 Op->getOpcode() == ISD::AssertZext ||
12277 Op->getOpcode() == ISD::AssertSext) {
12278 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12279 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12280 }
12281 if (NarrowBitWidth % 8 != 0)
12282 return std::nullopt;
12283 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12284
12285 if (Index >= NarrowByteWidth)
12286 return Op.getOpcode() == ISD::ZERO_EXTEND
12287 ? std::optional<ByteProvider<SDValue>>(
12289 : std::nullopt;
12290 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12291 }
12292
12293 case ISD::TRUNCATE: {
12294 if (IsVec)
12295 return std::nullopt;
12296
 // BitWidth is the truncated width here, so in-range indices always map
 // directly onto the same byte of the wider source.
12297 uint64_t NarrowByteWidth = BitWidth / 8;
12298
12299 if (NarrowByteWidth >= Index) {
12300 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12301 StartingIndex);
12302 }
12303
12304 return std::nullopt;
12305 }
12306
 // Register copies are opaque leaves; defer to calculateSrcByte.
12307 case ISD::CopyFromReg: {
12308 if (BitWidth / 8 > Index)
12309 return calculateSrcByte(Op, StartingIndex, Index);
12310
12311 return std::nullopt;
12312 }
12313
12314 case ISD::LOAD: {
12315 auto *L = cast<LoadSDNode>(Op.getNode());
12316
12317 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12318 if (NarrowBitWidth % 8 != 0)
12319 return std::nullopt;
12320 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12321
12322 // If the width of the load does not reach byte we are trying to provide for
12323 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12324 // question
12325 if (Index >= NarrowByteWidth) {
12326 return L->getExtensionType() == ISD::ZEXTLOAD
12327 ? std::optional<ByteProvider<SDValue>>(
12329 : std::nullopt;
12330 }
12331
12332 if (NarrowByteWidth > Index) {
12333 return calculateSrcByte(Op, StartingIndex, Index);
12334 }
12335
12336 return std::nullopt;
12337 }
12338
 // Byte swap simply mirrors the byte index.
12339 case ISD::BSWAP: {
12340 if (IsVec)
12341 return std::nullopt;
12342
12343 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12344 Depth + 1, StartingIndex);
12345 }
12346
 // Extracted vector element: remap the byte index into the source vector
 // when the elements are narrower than 32 bits.
12348 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12349 if (!IdxOp)
12350 return std::nullopt;
12351 auto VecIdx = IdxOp->getZExtValue();
12352 auto ScalarSize = Op.getScalarValueSizeInBits();
12353 if (ScalarSize < 32)
12354 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12355 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12356 StartingIndex, Index);
12357 }
12358
12359 case AMDGPUISD::PERM: {
12360 if (IsVec)
12361 return std::nullopt;
12362
12363 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12364 if (!PermMask)
12365 return std::nullopt;
12366
 // Each mask byte selects a source byte: 0-3 from operand 1, 4-7 from
 // operand 0, 0x0c is constant zero; anything else is unsupported here.
12367 auto IdxMask =
12368 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12369 if (IdxMask > 0x07 && IdxMask != 0x0c)
12370 return std::nullopt;
12371
12372 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12373 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12374
12375 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12378 }
12379
12380 default: {
12381 return std::nullopt;
12382 }
12383 }
12384
12385 llvm_unreachable("fully handled switch");
12386}
12387
12388// Returns true if the Operand is a scalar and is 16 bits
12389static bool isExtendedFrom16Bits(SDValue &Operand) {
12390
12391 switch (Operand.getOpcode()) {
12392 case ISD::ANY_EXTEND:
12393 case ISD::SIGN_EXTEND:
12394 case ISD::ZERO_EXTEND: {
12395 auto OpVT = Operand.getOperand(0).getValueType();
12396 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12397 }
12398 case ISD::LOAD: {
12399 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12400 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12401 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12402 ExtType == ISD::EXTLOAD) {
12403 auto MemVT = L->getMemoryVT();
12404 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12405 }
12406 return L->getMemoryVT().getSizeInBits() == 16;
12407 }
12408 default:
12409 return false;
12410 }
12411}
12412
// Returns true if the two byte-select fields in \p Mask pick consecutive
// bytes (low field first) and the lower byte sits at an even offset, i.e.
// the selected pair is addressable as one aligned 16-bit quantity.
static bool addresses16Bits(int Mask) {
  const int LoByte = Mask & 0xff;
  const int HiByte = (Mask >> 8) & 0xff;

  assert(LoByte < 8 && HiByte < 8);

  // The bytes must be adjacent and in order of increasing address.
  if (HiByte != LoByte + 1)
    return false;

  // The first byte must start on a 16-bit boundary. A counter example is
  // taking 2 consecutive bytes starting at the 8th bit: we would still need
  // code to extract the 16-bit operand, so an i8 v_perm is better there.
  return (LoByte & 1) == 0;
}
12430
12431// Do not lower into v_perm if the operands are actually 16 bit
12432// and the selected bits (based on PermMask) correspond with two
12433// easily addressable 16 bit operands.
12435 SDValue &OtherOp) {
 // Split the 8-nibble perm mask into the selectors for the low and high
 // 16-bit halves of the result.
12436 int Low16 = PermMask & 0xffff;
12437 int Hi16 = (PermMask & 0xffff0000) >> 16;
12438
12439 auto TempOp = peekThroughBitcasts(Op);
12440 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12441
 // NOTE(review): this tests TempOtherOp's width while classifying Op (the
 // isExtendedFrom16Bits call does use TempOp) — looks like it should be
 // TempOp.getValueSizeInBits(); confirm against upstream intent.
12442 auto OpIs16Bit =
12443 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12444 if (!OpIs16Bit)
12445 return true;
12446
12447 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12448 isExtendedFrom16Bits(TempOtherOp);
12449 if (!OtherOpIs16Bit)
12450 return true;
12451
12452 // Do we cleanly address both
 // Both operands are 16-bit: only report non-16-bit accesses if either half
 // of the mask fails to select an aligned, consecutive byte pair.
12453 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12454}
12455
 // Extracts the DWordOffset-th 32-bit word of Src as an i32, handling
 // scalar, 32-bit-element, wide-element and sub-32-bit-element sources.
12457 unsigned DWordOffset) {
12458 SDValue Ret;
12459
12460 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12461 // ByteProvider must be at least 8 bits
12462 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12463
 // Values of 32 bits or less are just reinterpreted (extended/truncated)
 // as a single i32.
12464 if (TypeSize <= 32)
12465 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12466
12467 if (Src.getValueType().isVector()) {
12468 auto ScalarTySize = Src.getScalarValueSizeInBits();
12469 auto ScalarTy = Src.getValueType().getScalarType();
 // 32-bit elements: the requested dword is exactly one element.
12470 if (ScalarTySize == 32) {
12471 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12472 DAG.getConstant(DWordOffset, SL, MVT::i32));
12473 }
 // Elements wider than 32 bits: extract the containing element, then
 // shift the wanted dword down to the low 32 bits.
12474 if (ScalarTySize > 32) {
12475 Ret = DAG.getNode(
12476 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12477 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12478 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12479 if (ShiftVal)
12480 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12481 DAG.getConstant(ShiftVal, SL, MVT::i32));
12482 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12483 }
12484
 // Elements narrower than 32 bits: gather the elements that make up the
 // requested dword (possibly fewer at the tail) and rebuild them as a
 // small vector before reinterpreting as i32.
12485 assert(ScalarTySize < 32);
12486 auto NumElements = TypeSize / ScalarTySize;
12487 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12488 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12489 auto NumElementsIn32 = 32 / ScalarTySize;
12490 auto NumAvailElements = DWordOffset < Trunc32Elements
12491 ? NumElementsIn32
12492 : NumElements - NormalizedTrunc;
12493
12495 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12496 NumAvailElements);
12497
12498 Ret = DAG.getBuildVector(
12499 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12500 VecSrcs);
12501 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12502 }
12503
12504 /// Scalar Type
 // Wide scalar: shift the wanted dword into the low 32 bits, then truncate.
12505 auto ShiftVal = 32 * DWordOffset;
12506 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12507 DAG.getConstant(ShiftVal, SL, MVT::i32));
12508 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12509}
12510
 // Tries to rewrite a byte-assembling i32 OR tree as a single
 // AMDGPUISD::PERM of (at most) two 32-bit source dwords.
12512 SelectionDAG &DAG = DCI.DAG;
12513 [[maybe_unused]] EVT VT = N->getValueType(0);
12515
12516 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12517 assert(VT == MVT::i32);
12518 for (int i = 0; i < 4; i++) {
12519 // Find the ByteProvider that provides the ith byte of the result of OR
12520 std::optional<ByteProvider<SDValue>> P =
12521 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12522 // TODO support constantZero
12523 if (!P || P->isConstantZero())
12524 return SDValue();
12525
12526 PermNodes.push_back(*P);
12527 }
12528 if (PermNodes.size() != 4)
12529 return SDValue();
12530
 // Track up to two distinct (node, dword-offset) sources; each entry pairs
 // the index of a representative PermNode with its dword offset.
12531 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12532 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12533 uint64_t PermMask = 0x00000000;
12534 for (size_t i = 0; i < PermNodes.size(); i++) {
12535 auto PermOp = PermNodes[i];
12536 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12537 // by sizeof(Src2) = 4
12538 int SrcByteAdjust = 4;
12539
12540 // If the Src uses a byte from a different DWORD, then it corresponds
12541 // with a different source
12542 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12543 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
 // More than two distinct sources cannot be expressed by one v_perm.
12544 if (SecondSrc)
12545 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12546 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12547 return SDValue();
12548
12549 // Set the index of the second distinct Src node
12550 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12551 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12552 SrcByteAdjust = 0;
12553 }
12554 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12556 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12557 }
12558 SDLoc DL(N);
12559 SDValue Op = *PermNodes[FirstSrc.first].Src;
12560 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12561 assert(Op.getValueSizeInBits() == 32);
12562
12563 // Check that we are not just extracting the bytes in order from an op
12564 if (!SecondSrc) {
12565 int Low16 = PermMask & 0xffff;
12566 int Hi16 = (PermMask & 0xffff0000) >> 16;
12567
 // Identity selections of either half of a single source.
12568 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12569 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12570
12571 // The perm op would really just produce Op. So combine into Op
12572 if (WellFormedLow && WellFormedHi)
12573 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12574 }
12575
12576 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12577
12578 if (SecondSrc) {
12579 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12580 assert(OtherOp.getValueSizeInBits() == 32);
12581 }
12582
 // Only emit a v_perm when the selection cannot be done as two cheap
 // 16-bit accesses instead.
12583 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12584
12585 assert(Op.getValueType().isByteSized() &&
12586 OtherOp.getValueType().isByteSized());
12587
12588 // If the ultimate src is less than 32 bits, then we will only be
12589 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12590 // CalculateByteProvider would not have returned Op as source if we
12591 // used a byte that is outside its ValueType. Thus, we are free to
12592 // ANY_EXTEND as the extended bits are dont-cares.
12593 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12594 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12595
12596 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12597 DAG.getConstant(PermMask, DL, MVT::i32));
12598 }
12599 return SDValue();
12600}
12601
12602SDValue SITargetLowering::performOrCombine(SDNode *N,
12603 DAGCombinerInfo &DCI) const {
12604 SelectionDAG &DAG = DCI.DAG;
12605 SDValue LHS = N->getOperand(0);
12606 SDValue RHS = N->getOperand(1);
12607
12608 EVT VT = N->getValueType(0);
12609 if (VT == MVT::i1) {
12610 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12611 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12612 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12613 SDValue Src = LHS.getOperand(0);
 // Both fp_class nodes must test the same value for the masks to merge.
12614 if (Src != RHS.getOperand(0))
12615 return SDValue();
12616
12617 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12618 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12619 if (!CLHS || !CRHS)
12620 return SDValue();
12621
12622 // Only 10 bits are used.
12623 static const uint32_t MaxMask = 0x3ff;
12624
12625 uint32_t NewMask =
12626 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12627 SDLoc DL(N);
12628 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12629 DAG.getConstant(NewMask, DL, MVT::i32));
12630 }
12631
12632 return SDValue();
12633 }
12634
12635 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12636 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12637 LHS.getOpcode() == AMDGPUISD::PERM &&
12638 isa<ConstantSDNode>(LHS.getOperand(2))) {
12639 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12640 if (!Sel)
12641 return SDValue();
12642
12643 Sel |= LHS.getConstantOperandVal(2);
12644 SDLoc DL(N);
12645 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12646 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12647 }
12648
12649 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
 // Only worthwhile for divergent i32 values on subtargets that actually
 // have v_perm_b32.
12651 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12652 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12653
12654 // If all the uses of an or need to extract the individual elements, do not
12655 // attempt to lower into v_perm
12656 auto usesCombinedOperand = [](SDNode *OrUse) {
12657 // If we have any non-vectorized use, then it is a candidate for v_perm
12658 if (OrUse->getOpcode() != ISD::BITCAST ||
12659 !OrUse->getValueType(0).isVector())
12660 return true;
12661
12662 // If we have any non-vectorized use, then it is a candidate for v_perm
12663 for (auto *VUser : OrUse->users()) {
12664 if (!VUser->getValueType(0).isVector())
12665 return true;
12666
12667 // If the use of a vector is a store, then combining via a v_perm
12668 // is beneficial.
12669 // TODO -- whitelist more uses
12670 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12671 if (VUser->getOpcode() == VectorwiseOp)
12672 return true;
12673 }
12674 return false;
12675 };
12676
12677 if (!any_of(N->users(), usesCombinedOperand))
12678 return SDValue();
12679
12680 uint32_t LHSMask = getPermuteMask(LHS);
12681 uint32_t RHSMask = getPermuteMask(RHS);
12682
12683 if (LHSMask != ~0u && RHSMask != ~0u) {
12684 // Canonicalize the expression in an attempt to have fewer unique masks
12685 // and therefore fewer registers used to hold the masks.
12686 if (LHSMask > RHSMask) {
12687 std::swap(LHSMask, RHSMask);
12688 std::swap(LHS, RHS);
12689 }
12690
12691 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12692 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12693 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12694 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12695
12696 // Check if we need to combine values from two sources within a byte.
12697 if (!(LHSUsedLanes & RHSUsedLanes) &&
12698 // If we select high and lower word keep it for SDWA.
12699 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12700 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12701 // Kill zero bytes selected by other mask. Zero value is 0xc.
12702 LHSMask &= ~RHSUsedLanes;
12703 RHSMask &= ~LHSUsedLanes;
12704 // Add 4 to each active LHS lane
12705 LHSMask |= LHSUsedLanes & 0x04040404;
12706 // Combine masks
12707 uint32_t Sel = LHSMask | RHSMask;
12708 SDLoc DL(N);
12709
12710 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12711 RHS.getOperand(0),
12712 DAG.getConstant(Sel, DL, MVT::i32));
12713 }
12714 }
 // At least one side has no simple permute mask; try the general
 // byte-provider based matcher.
12715 if (LHSMask == ~0u || RHSMask == ~0u) {
12716 if (SDValue Perm = matchPERM(N, DCI))
12717 return Perm;
12718 }
12719 }
12720
12721 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12722 return SDValue();
12723
12724 // TODO: This could be a generic combine with a predicate for extracting the
12725 // high half of an integer being free.
12726
12727 // (or i64:x, (zero_extend i32:y)) ->
12728 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12729 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12730 RHS.getOpcode() != ISD::ZERO_EXTEND)
12731 std::swap(LHS, RHS);
12732
12733 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12734 SDValue ExtSrc = RHS.getOperand(0);
12735 EVT SrcVT = ExtSrc.getValueType();
12736 if (SrcVT == MVT::i32) {
12737 SDLoc SL(N);
12738 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12739 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12740
12741 DCI.AddToWorklist(LowOr.getNode());
12742 DCI.AddToWorklist(HiBits.getNode());
12743
12744 SDValue Vec =
12745 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12746 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12747 }
12748 }
12749
 // Fall back to splitting (or i64:x, c) into two 32-bit ors.
12750 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12751 if (CRHS) {
12752 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12753 N->getOperand(0), CRHS))
12754 return Split;
12755 }
12756
12757 return SDValue();
12758}
12759
12760SDValue SITargetLowering::performXorCombine(SDNode *N,
12761 DAGCombinerInfo &DCI) const {
12762 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12763 return RV;
12764
12765 SDValue LHS = N->getOperand(0);
12766 SDValue RHS = N->getOperand(1);
12767
12768 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12769 SelectionDAG &DAG = DCI.DAG;
12770
12771 EVT VT = N->getValueType(0);
12772 if (CRHS && VT == MVT::i64) {
12773 if (SDValue Split =
12774 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12775 return Split;
12776 }
12777
12778 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12779 // fneg-like xors into 64-bit select.
12780 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12781 // This looks like an fneg, try to fold as a source modifier.
12782 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12783 shouldFoldFNegIntoSrc(N, LHS)) {
12784 // xor (select c, a, b), 0x80000000 ->
12785 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12786 SDLoc DL(N);
12787 SDValue CastLHS =
12788 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12789 SDValue CastRHS =
12790 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12791 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12792 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12793 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12794 LHS->getOperand(0), FNegLHS, FNegRHS);
12795 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12796 }
12797 }
12798
12799 return SDValue();
12800}
12801
12802SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12803 DAGCombinerInfo &DCI) const {
12804 if (!Subtarget->has16BitInsts() ||
12805 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12806 return SDValue();
12807
12808 EVT VT = N->getValueType(0);
12809 if (VT != MVT::i32)
12810 return SDValue();
12811
12812 SDValue Src = N->getOperand(0);
12813 if (Src.getValueType() != MVT::i16)
12814 return SDValue();
12815
12816 return SDValue();
12817}
12818
12819SDValue
12820SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12821 DAGCombinerInfo &DCI) const {
12822 SDValue Src = N->getOperand(0);
 // The VT operand of sign_extend_inreg gives the effective narrow width.
12823 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12824
12825 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12826 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12827 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12828 VTSign->getVT() == MVT::i8) ||
12829 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12830 VTSign->getVT() == MVT::i16))) {
12831 assert(Subtarget->hasScalarSubwordLoads() &&
12832 "s_buffer_load_{u8, i8} are supported "
12833 "in GFX12 (or newer) architectures.");
12834 EVT VT = Src.getValueType();
 // Select the signed counterpart of the unsigned scalar buffer load.
12835 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12838 SDLoc DL(N);
12839 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12840 SDValue Ops[] = {
12841 Src.getOperand(0), // source register
12842 Src.getOperand(1), // offset
12843 Src.getOperand(2) // cachePolicy
12844 };
12845 auto *M = cast<MemSDNode>(Src);
12846 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12847 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12848 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12849 return LoadVal;
12850 }
 // Same idea for the VMEM buffer loads: fold the sext into the load opcode
 // when this is its only user.
12851 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12852 VTSign->getVT() == MVT::i8) ||
12853 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12854 VTSign->getVT() == MVT::i16)) &&
12855 Src.hasOneUse()) {
12856 auto *M = cast<MemSDNode>(Src);
12857 SDValue Ops[] = {Src.getOperand(0), // Chain
12858 Src.getOperand(1), // rsrc
12859 Src.getOperand(2), // vindex
12860 Src.getOperand(3), // voffset
12861 Src.getOperand(4), // soffset
12862 Src.getOperand(5), // offset
12863 Src.getOperand(6), Src.getOperand(7)};
12864 // replace with BUFFER_LOAD_BYTE/SHORT
12865 SDVTList ResList =
12866 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12867 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12870 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12871 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12872 return DCI.DAG.getMergeValues(
12873 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12874 }
12875 return SDValue();
12876}
12877
12878SDValue SITargetLowering::performClassCombine(SDNode *N,
12879 DAGCombinerInfo &DCI) const {
12880 SelectionDAG &DAG = DCI.DAG;
12881 SDValue Mask = N->getOperand(1);
12882
12883 // fp_class x, 0 -> false
12884 if (isNullConstant(Mask))
12885 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12886
12887 if (N->getOperand(0).isUndef())
12888 return DAG.getUNDEF(MVT::i1);
12889
12890 return SDValue();
12891}
12892
12893SDValue SITargetLowering::performRcpCombine(SDNode *N,
12894 DAGCombinerInfo &DCI) const {
12895 EVT VT = N->getValueType(0);
12896 SDValue N0 = N->getOperand(0);
12897
 // rcp(undef) folds to a quiet NaN.
12898 if (N0.isUndef()) {
12899 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12900 SDLoc(N), VT);
12901 }
12902
 // rcp of an int-to-fp conversion can use the IFLAG variant, since the
 // converted source is never a NaN or infinity.
12903 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12904 N0.getOpcode() == ISD::SINT_TO_FP)) {
12905 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12906 N->getFlags());
12907 }
12908
12909 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
 // rcp(sqrt(x)) -> rsq(x) for f16 when both nodes allow contraction.
12910 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12911 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12912 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12913 N->getFlags());
12914 }
12915 }
12917}
12918
 // Returns true when Op is known to already be in canonical FP form (no
 // unquieted sNaNs, denormals consistent with the function's denormal mode).
12920 unsigned MaxDepth) const {
12921 unsigned Opcode = Op.getOpcode();
12922 if (Opcode == ISD::FCANONICALIZE)
12923 return true;
12924
 // FP constants are canonical unless they are signaling NaNs, or denormals
 // in a non-IEEE denormal mode.
12925 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12926 const auto &F = CFP->getValueAPF();
12927 if (F.isNaN() && F.isSignaling())
12928 return false;
12929 if (!F.isDenormal())
12930 return true;
12931
12932 DenormalMode Mode =
12933 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12934 return Mode == DenormalMode::getIEEE();
12935 }
12936
12937 // If source is a result of another standard FP operation it is already in
12938 // canonical form.
12939 if (MaxDepth == 0)
12940 return false;
12941
12942 switch (Opcode) {
12943 // These will flush denorms if required.
12944 case ISD::FADD:
12945 case ISD::FSUB:
12946 case ISD::FMUL:
12947 case ISD::FCEIL:
12948 case ISD::FFLOOR:
12949 case ISD::FMA:
12950 case ISD::FMAD:
12951 case ISD::FSQRT:
12952 case ISD::FDIV:
12953 case ISD::FREM:
12954 case ISD::FP_ROUND:
12955 case ISD::FP_EXTEND:
12956 case ISD::FP16_TO_FP:
12957 case ISD::FP_TO_FP16:
12958 case ISD::BF16_TO_FP:
12959 case ISD::FP_TO_BF16:
12960 case ISD::FLDEXP:
12963 case AMDGPUISD::RCP:
12964 case AMDGPUISD::RSQ:
12968 case AMDGPUISD::LOG:
12969 case AMDGPUISD::EXP:
12973 case AMDGPUISD::FRACT:
12980 case AMDGPUISD::SIN_HW:
12981 case AMDGPUISD::COS_HW:
12982 return true;
12983
12984 // It can/will be lowered or combined as a bit operation.
12985 // Need to check their input recursively to handle.
12986 case ISD::FNEG:
12987 case ISD::FABS:
12988 case ISD::FCOPYSIGN:
12989 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12990
12991 case ISD::AND:
12992 if (Op.getValueType() == MVT::i32) {
12993 // Be careful as we only know it is a bitcast floating point type. It
12994 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12995 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12996 // is valid to optimize for all types.
12997 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12998 if (RHS->getZExtValue() == 0xffff0000) {
12999 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13000 }
13001 }
13002 }
13003 break;
13004
 // f16 sin/cos are expanded specially; other widths produce canonical
 // results.
13005 case ISD::FSIN:
13006 case ISD::FCOS:
13007 case ISD::FSINCOS:
13008 return Op.getValueType().getScalarType() != MVT::f16;
13009
13010 case ISD::FMINNUM:
13011 case ISD::FMAXNUM:
13012 case ISD::FMINNUM_IEEE:
13013 case ISD::FMAXNUM_IEEE:
13014 case ISD::FMINIMUM:
13015 case ISD::FMAXIMUM:
13016 case AMDGPUISD::CLAMP:
13017 case AMDGPUISD::FMED3:
13018 case AMDGPUISD::FMAX3:
13019 case AMDGPUISD::FMIN3:
13021 case AMDGPUISD::FMINIMUM3: {
13022 // FIXME: Shouldn't treat the generic operations different based these.
13023 // However, we aren't really required to flush the result from
13024 // minnum/maxnum..
13025
13026 // snans will be quieted, so we only need to worry about denormals.
13027 if (Subtarget->supportsMinMaxDenormModes() ||
13028 // FIXME: denormalsEnabledForType is broken for dynamic
13029 denormalsEnabledForType(DAG, Op.getValueType()))
13030 return true;
13031
13032 // Flushing may be required.
13033 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
13034 // targets need to check their input recursively.
13035
13036 // FIXME: Does this apply with clamp? It's implemented with max.
13037 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13038 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13039 return false;
13040 }
13041
13042 return true;
13043 }
 // A select is canonical when both of its value operands are.
13044 case ISD::SELECT: {
13045 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13046 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13047 }
 // A build_vector is canonical when every element is.
13048 case ISD::BUILD_VECTOR: {
13049 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13050 SDValue SrcOp = Op.getOperand(i);
13051 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13052 return false;
13053 }
13054
13055 return true;
13056 }
13059 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13060 }
13062 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13063 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13064 }
13065 case ISD::UNDEF:
13066 // Could be anything.
13067 return false;
13068
13069 case ISD::BITCAST:
13070 // TODO: This is incorrect as it loses track of the operand's type. We may
13071 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13072 // same bits that are canonicalized in one type need not be in the other.
13073 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13074 case ISD::TRUNCATE: {
13075 // Hack round the mess we make when legalizing extract_vector_elt
13076 if (Op.getValueType() == MVT::i16) {
13077 SDValue TruncSrc = Op.getOperand(0);
13078 if (TruncSrc.getValueType() == MVT::i32 &&
13079 TruncSrc.getOpcode() == ISD::BITCAST &&
13080 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13081 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13082 }
13083 }
13084 return false;
13085 }
 // Intrinsics that are known to produce canonical results.
13087 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13088 // TODO: Handle more intrinsics
13089 switch (IntrinsicID) {
13090 case Intrinsic::amdgcn_cvt_pkrtz:
13091 case Intrinsic::amdgcn_cubeid:
13092 case Intrinsic::amdgcn_frexp_mant:
13093 case Intrinsic::amdgcn_fdot2:
13094 case Intrinsic::amdgcn_rcp:
13095 case Intrinsic::amdgcn_rsq:
13096 case Intrinsic::amdgcn_rsq_clamp:
13097 case Intrinsic::amdgcn_rcp_legacy:
13098 case Intrinsic::amdgcn_rsq_legacy:
13099 case Intrinsic::amdgcn_trig_preop:
13100 case Intrinsic::amdgcn_log:
13101 case Intrinsic::amdgcn_exp2:
13102 case Intrinsic::amdgcn_sqrt:
13103 return true;
13104 default:
13105 break;
13106 }
13107
13108 break;
13109 }
13110 default:
13111 break;
13112 }
13113
13114 // FIXME: denormalsEnabledForType is broken for dynamic
13115 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13116 DAG.isKnownNeverSNaN(Op);
13117}
13118
13120 unsigned MaxDepth) const {
13121 const MachineRegisterInfo &MRI = MF.getRegInfo();
13122 MachineInstr *MI = MRI.getVRegDef(Reg);
13123 unsigned Opcode = MI->getOpcode();
13124
13125 if (Opcode == AMDGPU::G_FCANONICALIZE)
13126 return true;
13127
13128 std::optional<FPValueAndVReg> FCR;
13129 // Constant splat (can be padded with undef) or scalar constant.
13130 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13131 if (FCR->Value.isSignaling())
13132 return false;
13133 if (!FCR->Value.isDenormal())
13134 return true;
13135
13136 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13137 return Mode == DenormalMode::getIEEE();
13138 }
13139
13140 if (MaxDepth == 0)
13141 return false;
13142
13143 switch (Opcode) {
13144 case AMDGPU::G_FADD:
13145 case AMDGPU::G_FSUB:
13146 case AMDGPU::G_FMUL:
13147 case AMDGPU::G_FCEIL:
13148 case AMDGPU::G_FFLOOR:
13149 case AMDGPU::G_FRINT:
13150 case AMDGPU::G_FNEARBYINT:
13151 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13152 case AMDGPU::G_INTRINSIC_TRUNC:
13153 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13154 case AMDGPU::G_FMA:
13155 case AMDGPU::G_FMAD:
13156 case AMDGPU::G_FSQRT:
13157 case AMDGPU::G_FDIV:
13158 case AMDGPU::G_FREM:
13159 case AMDGPU::G_FPOW:
13160 case AMDGPU::G_FPEXT:
13161 case AMDGPU::G_FLOG:
13162 case AMDGPU::G_FLOG2:
13163 case AMDGPU::G_FLOG10:
13164 case AMDGPU::G_FPTRUNC:
13165 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13166 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13167 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13168 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13169 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13170 return true;
13171 case AMDGPU::G_FNEG:
13172 case AMDGPU::G_FABS:
13173 case AMDGPU::G_FCOPYSIGN:
13174 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13175 case AMDGPU::G_FMINNUM:
13176 case AMDGPU::G_FMAXNUM:
13177 case AMDGPU::G_FMINNUM_IEEE:
13178 case AMDGPU::G_FMAXNUM_IEEE:
13179 case AMDGPU::G_FMINIMUM:
13180 case AMDGPU::G_FMAXIMUM: {
13181 if (Subtarget->supportsMinMaxDenormModes() ||
13182 // FIXME: denormalsEnabledForType is broken for dynamic
13183 denormalsEnabledForType(MRI.getType(Reg), MF))
13184 return true;
13185
13186 [[fallthrough]];
13187 }
13188 case AMDGPU::G_BUILD_VECTOR:
13189 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13190 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13191 return false;
13192 return true;
13193 case AMDGPU::G_INTRINSIC:
13194 case AMDGPU::G_INTRINSIC_CONVERGENT:
13195 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13196 case Intrinsic::amdgcn_fmul_legacy:
13197 case Intrinsic::amdgcn_fmad_ftz:
13198 case Intrinsic::amdgcn_sqrt:
13199 case Intrinsic::amdgcn_fmed3:
13200 case Intrinsic::amdgcn_sin:
13201 case Intrinsic::amdgcn_cos:
13202 case Intrinsic::amdgcn_log:
13203 case Intrinsic::amdgcn_exp2:
13204 case Intrinsic::amdgcn_log_clamp:
13205 case Intrinsic::amdgcn_rcp:
13206 case Intrinsic::amdgcn_rcp_legacy:
13207 case Intrinsic::amdgcn_rsq:
13208 case Intrinsic::amdgcn_rsq_clamp:
13209 case Intrinsic::amdgcn_rsq_legacy:
13210 case Intrinsic::amdgcn_div_scale:
13211 case Intrinsic::amdgcn_div_fmas:
13212 case Intrinsic::amdgcn_div_fixup:
13213 case Intrinsic::amdgcn_fract:
13214 case Intrinsic::amdgcn_cvt_pkrtz:
13215 case Intrinsic::amdgcn_cubeid:
13216 case Intrinsic::amdgcn_cubema:
13217 case Intrinsic::amdgcn_cubesc:
13218 case Intrinsic::amdgcn_cubetc:
13219 case Intrinsic::amdgcn_frexp_mant:
13220 case Intrinsic::amdgcn_fdot2:
13221 case Intrinsic::amdgcn_trig_preop:
13222 return true;
13223 default:
13224 break;
13225 }
13226
13227 [[fallthrough]];
13228 default:
13229 return false;
13230 }
13231
13232 llvm_unreachable("invalid operation");
13233}
13234
13235// Constant fold canonicalize.
13236SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13237 const SDLoc &SL, EVT VT,
13238 const APFloat &C) const {
13239 // Flush denormals to 0 if not enabled.
13240 if (C.isDenormal()) {
13241 DenormalMode Mode =
13242 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13243 if (Mode == DenormalMode::getPreserveSign()) {
13244 return DAG.getConstantFP(
13245 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13246 }
13247
13248 if (Mode != DenormalMode::getIEEE())
13249 return SDValue();
13250 }
13251
13252 if (C.isNaN()) {
13253 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13254 if (C.isSignaling()) {
13255 // Quiet a signaling NaN.
13256 // FIXME: Is this supposed to preserve payload bits?
13257 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13258 }
13259
13260 // Make sure it is the canonical NaN bitpattern.
13261 //
13262 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13263 // immediate?
13264 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13265 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13266 }
13267
13268 // Already canonical.
13269 return DAG.getConstantFP(C, SL, VT);
13270}
13271
13273 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13274}
13275
13276SDValue
13277SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13278 DAGCombinerInfo &DCI) const {
13279 SelectionDAG &DAG = DCI.DAG;
13280 SDValue N0 = N->getOperand(0);
13281 EVT VT = N->getValueType(0);
13282
13283 // fcanonicalize undef -> qnan
13284 if (N0.isUndef()) {
13286 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13287 }
13288
13289 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13290 EVT VT = N->getValueType(0);
13291 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13292 }
13293
13294 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13295 // (fcanonicalize k)
13296 //
13297 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13298
13299 // TODO: This could be better with wider vectors that will be split to v2f16,
13300 // and to consider uses since there aren't that many packed operations.
13301 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13302 isTypeLegal(MVT::v2f16)) {
13303 SDLoc SL(N);
13304 SDValue NewElts[2];
13305 SDValue Lo = N0.getOperand(0);
13306 SDValue Hi = N0.getOperand(1);
13307 EVT EltVT = Lo.getValueType();
13308
13310 for (unsigned I = 0; I != 2; ++I) {
13311 SDValue Op = N0.getOperand(I);
13312 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13313 NewElts[I] =
13314 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13315 } else if (Op.isUndef()) {
13316 // Handled below based on what the other operand is.
13317 NewElts[I] = Op;
13318 } else {
13319 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13320 }
13321 }
13322
13323 // If one half is undef, and one is constant, prefer a splat vector rather
13324 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13325 // cheaper to use and may be free with a packed operation.
13326 if (NewElts[0].isUndef()) {
13327 if (isa<ConstantFPSDNode>(NewElts[1]))
13328 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13329 ? NewElts[1]
13330 : DAG.getConstantFP(0.0f, SL, EltVT);
13331 }
13332
13333 if (NewElts[1].isUndef()) {
13334 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13335 ? NewElts[0]
13336 : DAG.getConstantFP(0.0f, SL, EltVT);
13337 }
13338
13339 return DAG.getBuildVector(VT, SL, NewElts);
13340 }
13341 }
13342
13343 return SDValue();
13344}
13345
13346static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13347 switch (Opc) {
13348 case ISD::FMAXNUM:
13349 case ISD::FMAXNUM_IEEE:
13350 return AMDGPUISD::FMAX3;
13351 case ISD::FMAXIMUM:
13352 return AMDGPUISD::FMAXIMUM3;
13353 case ISD::SMAX:
13354 return AMDGPUISD::SMAX3;
13355 case ISD::UMAX:
13356 return AMDGPUISD::UMAX3;
13357 case ISD::FMINNUM:
13358 case ISD::FMINNUM_IEEE:
13359 return AMDGPUISD::FMIN3;
13360 case ISD::FMINIMUM:
13361 return AMDGPUISD::FMINIMUM3;
13362 case ISD::SMIN:
13363 return AMDGPUISD::SMIN3;
13364 case ISD::UMIN:
13365 return AMDGPUISD::UMIN3;
13366 default:
13367 llvm_unreachable("Not a min/max opcode");
13368 }
13369}
13370
13371SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13372 const SDLoc &SL, SDValue Src,
13373 SDValue MinVal,
13374 SDValue MaxVal,
13375 bool Signed) const {
13376
13377 // med3 comes from
13378 // min(max(x, K0), K1), K0 < K1
13379 // max(min(x, K0), K1), K1 < K0
13380 //
13381 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13382 // min/max op.
13383 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13384 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13385
13386 if (!MinK || !MaxK)
13387 return SDValue();
13388
13389 if (Signed) {
13390 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13391 return SDValue();
13392 } else {
13393 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13394 return SDValue();
13395 }
13396
13397 EVT VT = MinK->getValueType(0);
13398 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13399 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13400 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13401
13402 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13403 // not available, but this is unlikely to be profitable as constants
13404 // will often need to be materialized & extended, especially on
13405 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13406 return SDValue();
13407}
13408
13410 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13411 return C;
13412
13413 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13414 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13415 return C;
13416 }
13417
13418 return nullptr;
13419}
13420
13421SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13422 const SDLoc &SL, SDValue Op0,
13423 SDValue Op1) const {
13425 if (!K1)
13426 return SDValue();
13427
13429 if (!K0)
13430 return SDValue();
13431
13432 // Ordered >= (although NaN inputs should have folded away by now).
13433 if (K0->getValueAPF() > K1->getValueAPF())
13434 return SDValue();
13435
13436 const MachineFunction &MF = DAG.getMachineFunction();
13438
13439 // TODO: Check IEEE bit enabled?
13440 EVT VT = Op0.getValueType();
13441 if (Info->getMode().DX10Clamp) {
13442 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13443 // hardware fmed3 behavior converting to a min.
13444 // FIXME: Should this be allowing -0.0?
13445 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13446 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13447 }
13448
13449 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13450 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13451 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13452 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13453 // then give the other result, which is different from med3 with a NaN
13454 // input.
13455 SDValue Var = Op0.getOperand(0);
13456 if (!DAG.isKnownNeverSNaN(Var))
13457 return SDValue();
13458
13460
13461 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13462 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13463 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13464 SDValue(K0, 0), SDValue(K1, 0));
13465 }
13466 }
13467
13468 return SDValue();
13469}
13470
13471/// \return true if the subtarget supports minimum3 and maximum3 with the given
13472/// base min/max opcode \p Opc for type \p VT.
13473static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13474 EVT VT) {
13475 switch (Opc) {
13476 case ISD::FMINNUM:
13477 case ISD::FMAXNUM:
13478 case ISD::FMINNUM_IEEE:
13479 case ISD::FMAXNUM_IEEE:
13482 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13483 case ISD::FMINIMUM:
13484 case ISD::FMAXIMUM:
13485 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13486 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13487 case ISD::SMAX:
13488 case ISD::SMIN:
13489 case ISD::UMAX:
13490 case ISD::UMIN:
13491 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13492 default:
13493 return false;
13494 }
13495
13496 llvm_unreachable("not a min/max opcode");
13497}
13498
13499SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13500 DAGCombinerInfo &DCI) const {
13501 SelectionDAG &DAG = DCI.DAG;
13502
13503 EVT VT = N->getValueType(0);
13504 unsigned Opc = N->getOpcode();
13505 SDValue Op0 = N->getOperand(0);
13506 SDValue Op1 = N->getOperand(1);
13507
13508 // Only do this if the inner op has one use since this will just increases
13509 // register pressure for no benefit.
13510
13511 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13512 // max(max(a, b), c) -> max3(a, b, c)
13513 // min(min(a, b), c) -> min3(a, b, c)
13514 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13515 SDLoc DL(N);
13516 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13517 Op0.getOperand(0), Op0.getOperand(1), Op1);
13518 }
13519
13520 // Try commuted.
13521 // max(a, max(b, c)) -> max3(a, b, c)
13522 // min(a, min(b, c)) -> min3(a, b, c)
13523 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13524 SDLoc DL(N);
13525 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13526 Op0, Op1.getOperand(0), Op1.getOperand(1));
13527 }
13528 }
13529
13530 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13531 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13532 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13533 if (SDValue Med3 = performIntMed3ImmCombine(
13534 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13535 return Med3;
13536 }
13537 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13538 if (SDValue Med3 = performIntMed3ImmCombine(
13539 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13540 return Med3;
13541 }
13542
13543 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13544 if (SDValue Med3 = performIntMed3ImmCombine(
13545 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13546 return Med3;
13547 }
13548 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13549 if (SDValue Med3 = performIntMed3ImmCombine(
13550 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13551 return Med3;
13552 }
13553
13554 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13555 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13556 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13557 (Opc == AMDGPUISD::FMIN_LEGACY &&
13558 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13559 (VT == MVT::f32 || VT == MVT::f64 ||
13560 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13561 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13562 Op0.hasOneUse()) {
13563 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13564 return Res;
13565 }
13566
13567 return SDValue();
13568}
13569
13571 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13572 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13573 // FIXME: Should this be allowing -0.0?
13574 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13575 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13576 }
13577 }
13578
13579 return false;
13580}
13581
13582// FIXME: Should only worry about snans for version with chain.
13583SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13584 DAGCombinerInfo &DCI) const {
13585 EVT VT = N->getValueType(0);
13586 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13587 // NaNs. With a NaN input, the order of the operands may change the result.
13588
13589 SelectionDAG &DAG = DCI.DAG;
13590 SDLoc SL(N);
13591
13592 SDValue Src0 = N->getOperand(0);
13593 SDValue Src1 = N->getOperand(1);
13594 SDValue Src2 = N->getOperand(2);
13595
13596 if (isClampZeroToOne(Src0, Src1)) {
13597 // const_a, const_b, x -> clamp is safe in all cases including signaling
13598 // nans.
13599 // FIXME: Should this be allowing -0.0?
13600 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13601 }
13602
13603 const MachineFunction &MF = DAG.getMachineFunction();
13605
13606 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13607 // handling no dx10-clamp?
13608 if (Info->getMode().DX10Clamp) {
13609 // If NaNs is clamped to 0, we are free to reorder the inputs.
13610
13611 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13612 std::swap(Src0, Src1);
13613
13614 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13615 std::swap(Src1, Src2);
13616
13617 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13618 std::swap(Src0, Src1);
13619
13620 if (isClampZeroToOne(Src1, Src2))
13621 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13622 }
13623
13624 return SDValue();
13625}
13626
13627SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13628 DAGCombinerInfo &DCI) const {
13629 SDValue Src0 = N->getOperand(0);
13630 SDValue Src1 = N->getOperand(1);
13631 if (Src0.isUndef() && Src1.isUndef())
13632 return DCI.DAG.getUNDEF(N->getValueType(0));
13633 return SDValue();
13634}
13635
13636// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13637// expanded into a set of cmp/select instructions.
13639 unsigned NumElem,
13640 bool IsDivergentIdx,
13641 const GCNSubtarget *Subtarget) {
13643 return false;
13644
13645 unsigned VecSize = EltSize * NumElem;
13646
13647 // Sub-dword vectors of size 2 dword or less have better implementation.
13648 if (VecSize <= 64 && EltSize < 32)
13649 return false;
13650
13651 // Always expand the rest of sub-dword instructions, otherwise it will be
13652 // lowered via memory.
13653 if (EltSize < 32)
13654 return true;
13655
13656 // Always do this if var-idx is divergent, otherwise it will become a loop.
13657 if (IsDivergentIdx)
13658 return true;
13659
13660 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13661 unsigned NumInsts = NumElem /* Number of compares */ +
13662 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13663
13664 // On some architectures (GFX9) movrel is not available and it's better
13665 // to expand.
13666 if (Subtarget->useVGPRIndexMode())
13667 return NumInsts <= 16;
13668
13669 // If movrel is available, use it instead of expanding for vector of 8
13670 // elements.
13671 if (Subtarget->hasMovrel())
13672 return NumInsts <= 15;
13673
13674 return true;
13675}
13676
13678 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13679 if (isa<ConstantSDNode>(Idx))
13680 return false;
13681
13682 SDValue Vec = N->getOperand(0);
13683 EVT VecVT = Vec.getValueType();
13684 EVT EltVT = VecVT.getVectorElementType();
13685 unsigned EltSize = EltVT.getSizeInBits();
13686 unsigned NumElem = VecVT.getVectorNumElements();
13687
13689 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13690}
13691
13692SDValue
13693SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13694 DAGCombinerInfo &DCI) const {
13695 SDValue Vec = N->getOperand(0);
13696 SelectionDAG &DAG = DCI.DAG;
13697
13698 EVT VecVT = Vec.getValueType();
13699 EVT VecEltVT = VecVT.getVectorElementType();
13700 EVT ResVT = N->getValueType(0);
13701
13702 unsigned VecSize = VecVT.getSizeInBits();
13703 unsigned VecEltSize = VecEltVT.getSizeInBits();
13704
13705 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13707 SDLoc SL(N);
13708 SDValue Idx = N->getOperand(1);
13709 SDValue Elt =
13710 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13711 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13712 }
13713
13714 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13715 // =>
13716 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13717 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13718 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13719 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13720 SDLoc SL(N);
13721 SDValue Idx = N->getOperand(1);
13722 unsigned Opc = Vec.getOpcode();
13723
13724 switch (Opc) {
13725 default:
13726 break;
13727 // TODO: Support other binary operations.
13728 case ISD::FADD:
13729 case ISD::FSUB:
13730 case ISD::FMUL:
13731 case ISD::ADD:
13732 case ISD::UMIN:
13733 case ISD::UMAX:
13734 case ISD::SMIN:
13735 case ISD::SMAX:
13736 case ISD::FMAXNUM:
13737 case ISD::FMINNUM:
13738 case ISD::FMAXNUM_IEEE:
13739 case ISD::FMINNUM_IEEE:
13740 case ISD::FMAXIMUM:
13741 case ISD::FMINIMUM: {
13742 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13743 Vec.getOperand(0), Idx);
13744 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13745 Vec.getOperand(1), Idx);
13746
13747 DCI.AddToWorklist(Elt0.getNode());
13748 DCI.AddToWorklist(Elt1.getNode());
13749 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13750 }
13751 }
13752 }
13753
13754 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13756 SDLoc SL(N);
13757 SDValue Idx = N->getOperand(1);
13758 SDValue V;
13759 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13760 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13761 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13762 if (I == 0)
13763 V = Elt;
13764 else
13765 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13766 }
13767 return V;
13768 }
13769
13770 if (!DCI.isBeforeLegalize())
13771 return SDValue();
13772
13773 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13774 // elements. This exposes more load reduction opportunities by replacing
13775 // multiple small extract_vector_elements with a single 32-bit extract.
13776 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13777 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13778 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13779 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13780
13781 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13782 unsigned EltIdx = BitIndex / 32;
13783 unsigned LeftoverBitIdx = BitIndex % 32;
13784 SDLoc SL(N);
13785
13786 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13787 DCI.AddToWorklist(Cast.getNode());
13788
13789 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13790 DAG.getConstant(EltIdx, SL, MVT::i32));
13791 DCI.AddToWorklist(Elt.getNode());
13792 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13793 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13794 DCI.AddToWorklist(Srl.getNode());
13795
13796 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13797 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13798 DCI.AddToWorklist(Trunc.getNode());
13799
13800 if (VecEltVT == ResVT) {
13801 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13802 }
13803
13804 assert(ResVT.isScalarInteger());
13805 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13806 }
13807
13808 return SDValue();
13809}
13810
13811SDValue
13812SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13813 DAGCombinerInfo &DCI) const {
13814 SDValue Vec = N->getOperand(0);
13815 SDValue Idx = N->getOperand(2);
13816 EVT VecVT = Vec.getValueType();
13817 EVT EltVT = VecVT.getVectorElementType();
13818
13819 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13820 // => BUILD_VECTOR n x select (e, const-idx)
13822 return SDValue();
13823
13824 SelectionDAG &DAG = DCI.DAG;
13825 SDLoc SL(N);
13826 SDValue Ins = N->getOperand(1);
13827 EVT IdxVT = Idx.getValueType();
13828
13830 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13831 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13832 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13833 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13834 Ops.push_back(V);
13835 }
13836
13837 return DAG.getBuildVector(VecVT, SL, Ops);
13838}
13839
13840/// Return the source of an fp_extend from f16 to f32, or a converted FP
13841/// constant.
13843 if (Src.getOpcode() == ISD::FP_EXTEND &&
13844 Src.getOperand(0).getValueType() == MVT::f16) {
13845 return Src.getOperand(0);
13846 }
13847
13848 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13849 APFloat Val = CFP->getValueAPF();
13850 bool LosesInfo = true;
13852 if (!LosesInfo)
13853 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13854 }
13855
13856 return SDValue();
13857}
13858
13859SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13860 DAGCombinerInfo &DCI) const {
13861 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13862 "combine only useful on gfx8");
13863
13864 SDValue TruncSrc = N->getOperand(0);
13865 EVT VT = N->getValueType(0);
13866 if (VT != MVT::f16)
13867 return SDValue();
13868
13869 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13870 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13871 return SDValue();
13872
13873 SelectionDAG &DAG = DCI.DAG;
13874 SDLoc SL(N);
13875
13876 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13877 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13878 // casting back.
13879
13880 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13881 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13882 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13883 if (!A)
13884 return SDValue();
13885
13886 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13887 if (!B)
13888 return SDValue();
13889
13890 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13891 if (!C)
13892 return SDValue();
13893
13894 // This changes signaling nan behavior. If an input is a signaling nan, it
13895 // would have been quieted by the fpext originally. We don't care because
13896 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13897 // we would be worse off than just doing the promotion.
13898 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13899 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13900 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13901 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13902}
13903
13904unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13905 const SDNode *N0,
13906 const SDNode *N1) const {
13907 EVT VT = N0->getValueType(0);
13908
13909 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13910 // support denormals ever.
13911 if (((VT == MVT::f32 &&
13913 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13916 return ISD::FMAD;
13917
13918 const TargetOptions &Options = DAG.getTarget().Options;
13919 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13920 (N0->getFlags().hasAllowContract() &&
13921 N1->getFlags().hasAllowContract())) &&
13923 return ISD::FMA;
13924 }
13925
13926 return 0;
13927}
13928
13929// For a reassociatable opcode perform:
13930// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13931SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13932 SelectionDAG &DAG) const {
13933 EVT VT = N->getValueType(0);
13934 if (VT != MVT::i32 && VT != MVT::i64)
13935 return SDValue();
13936
13937 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13938 return SDValue();
13939
13940 unsigned Opc = N->getOpcode();
13941 SDValue Op0 = N->getOperand(0);
13942 SDValue Op1 = N->getOperand(1);
13943
13944 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13945 return SDValue();
13946
13947 if (Op0->isDivergent())
13948 std::swap(Op0, Op1);
13949
13950 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13951 return SDValue();
13952
13953 SDValue Op2 = Op1.getOperand(1);
13954 Op1 = Op1.getOperand(0);
13955 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13956 return SDValue();
13957
13958 if (Op1->isDivergent())
13959 std::swap(Op1, Op2);
13960
13961 SDLoc SL(N);
13962 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13963 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13964}
13965
13966static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13967 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13969 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13970 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13971 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13972}
13973
13974// Fold
13975// y = lshr i64 x, 32
13976// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
13977// with Const.hi == -1
13978// To
13979// res = mad_u64_u32 y.lo ,Const.lo, x.lo
13981 SDValue MulLHS, SDValue MulRHS,
13982 SDValue AddRHS) {
13983 if (MulRHS.getOpcode() == ISD::SRL)
13984 std::swap(MulLHS, MulRHS);
13985
13986 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
13987 return SDValue();
13988
13989 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
13990 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
13991 MulLHS.getOperand(0) != AddRHS)
13992 return SDValue();
13993
13994 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
13995 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13996 return SDValue();
13997
13998 SDValue ConstMul =
13999 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
14000 return getMad64_32(DAG, SL, MVT::i64,
14001 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
14002 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
14003}
14004
14005// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14006// multiplies, if any.
14007//
14008// Full 64-bit multiplies that feed into an addition are lowered here instead
14009// of using the generic expansion. The generic expansion ends up with
14010// a tree of ADD nodes that prevents us from using the "add" part of the
14011// MAD instruction. The expansion produced here results in a chain of ADDs
14012// instead of a tree.
14013SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14014                                            DAGCombinerInfo &DCI) const {
14015  assert(N->getOpcode() == ISD::ADD);
14016
14017  SelectionDAG &DAG = DCI.DAG;
14018  EVT VT = N->getValueType(0);
14019  SDLoc SL(N);
14020  SDValue LHS = N->getOperand(0);
14021  SDValue RHS = N->getOperand(1);
14022
14023  if (VT.isVector())
14024    return SDValue();
14025
14026  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14027  // result in scalar registers for uniform values.
14028  if (!N->isDivergent() && Subtarget->hasSMulHi())
14029    return SDValue();
14030
  // Only scalar types wider than 32 bits and at most 64 bits are handled.
14031  unsigned NumBits = VT.getScalarSizeInBits();
14032  if (NumBits <= 32 || NumBits > 64)
14033    return SDValue();
14034
  // Canonicalize: put the multiply in LHS (the caller guarantees one of the
  // two add operands is a MUL).
14035  if (LHS.getOpcode() != ISD::MUL) {
14036    assert(RHS.getOpcode() == ISD::MUL);
14037    std::swap(LHS, RHS);
14038  }
14039
14040  // Avoid the fold if it would unduly increase the number of multiplies due to
14041  // multiple uses, except on hardware with full-rate multiply-add (which is
14042  // part of full-rate 64-bit ops).
14043  if (!Subtarget->hasFullRate64Ops()) {
14044    unsigned NumUsers = 0;
14045    for (SDNode *User : LHS->users()) {
14046      // There is a use that does not feed into addition, so the multiply can't
14047      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14048      if (User->getOpcode() != ISD::ADD)
14049        return SDValue();
14050
14051      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14052      // MUL + 3xADD + 3xADDC over 3xMAD.
14053      ++NumUsers;
14054      if (NumUsers >= 3)
14055        return SDValue();
14056    }
14057  }
14058
14059  SDValue MulLHS = LHS.getOperand(0);
14060  SDValue MulRHS = LHS.getOperand(1);
14061  SDValue AddRHS = RHS;
14062
  // First try the special (x >> 32) * Const + x pattern handled above.
14063  if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14064    return FoldedMAD;
14065
14066  // Always check whether operands are small unsigned values, since that
14067  // knowledge is useful in more cases. Check for small signed values only if
14068  // doing so can unlock a shorter code sequence.
14069  bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14070  bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14071
14072  bool MulSignedLo = false;
14073  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14074    MulSignedLo =
14075        numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14076  }
14077
14078  // The operands and final result all have the same number of bits. If
14079  // operands need to be extended, they can be extended with garbage. The
14080  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14081  // truncated away in the end.
14082  if (VT != MVT::i64) {
14083    MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14084    MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14085    AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14086  }
14087
14088  // The basic code generated is conceptually straightforward. Pseudo code:
14089  //
14090  //   accum = mad_64_32 lhs.lo, rhs.lo, accum
14091  //   accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14092  //   accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14093  //
14094  // The second and third lines are optional, depending on whether the factors
14095  // are {sign,zero}-extended or not.
14096  //
14097  // The actual DAG is noisier than the pseudo code, but only due to
14098  // instructions that disassemble values into low and high parts, and
14099  // assemble the final result.
14100  SDValue One = DAG.getConstant(1, SL, MVT::i32);
14101
14102  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14103  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14104  SDValue Accum =
14105      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14106
  // If the low halves alone do not capture the full factors, add in the
  // cross products with the high halves (the optional pseudo-code lines).
14107  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14108    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14109
14110    if (!MulLHSUnsigned32) {
14111      auto MulLHSHi =
14112          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14113      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14114      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14115    }
14116
14117    if (!MulRHSUnsigned32) {
14118      auto MulRHSHi =
14119          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14120      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14121      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14122    }
14123
14124    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14125    Accum = DAG.getBitcast(MVT::i64, Accum);
14126  }
14127
  // Narrow back down if the original type was between 33 and 63 bits.
14128  if (VT != MVT::i64)
14129    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14130  return Accum;
14131}
14132
14133SDValue
14134SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14135 DAGCombinerInfo &DCI) const {
14136 SDValue RHS = N->getOperand(1);
14137 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14138 if (!CRHS)
14139 return SDValue();
14140
14141 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14142 // common.
14143 uint64_t Val = CRHS->getZExtValue();
14144 if (countr_zero(Val) >= 32) {
14145 SelectionDAG &DAG = DCI.DAG;
14146 SDLoc SL(N);
14147 SDValue LHS = N->getOperand(0);
14148
14149 // Avoid carry machinery if we know the low half of the add does not
14150 // contribute to the final result.
14151 //
14152 // add i64:x, K if computeTrailingZeros(K) >= 32
14153 // => build_pair (add x.hi, K.hi), x.lo
14154
14155 // Breaking the 64-bit add here with this strange constant is unlikely
14156 // to interfere with addressing mode patterns.
14157
14158 SDValue Hi = getHiHalf64(LHS, DAG);
14159 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14160 SDValue AddHi =
14161 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14162
14163 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14164 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14165 }
14166
14167 return SDValue();
14168}
14169
14170// Collect the ultimate src of each of the mul node's operands, and confirm
14171// each operand is 8 bytes.
14172static std::optional<ByteProvider<SDValue>>
14173handleMulOperand(const SDValue &MulOperand) {
14174 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14175 if (!Byte0 || Byte0->isConstantZero()) {
14176 return std::nullopt;
14177 }
14178 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14179 if (Byte1 && !Byte1->isConstantZero()) {
14180 return std::nullopt;
14181 }
14182 return Byte0;
14183}
14184
// Merge two v_perm byte-select masks. A byte position holding 0x0c selects a
// constant zero; any other value selects real data. The two masks must not
// both select data in the same byte position (asserted below); the merged
// mask takes the data selection from whichever input has one, and stays 0x0c
// only where both inputs agree on zero.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned CMask = 0x0c0c0c0c;

  unsigned FirstCs = First & CMask;
  unsigned SecondCs = Second & CMask;

  // Every byte lane must be constant-zero (0x0c) in at least one input,
  // otherwise the data selections would collide.
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return ((First & ~CMask) | (Second & ~CMask)) | (FirstCs & SecondCs);
}
14198
// One dword-wide contribution to a dot4 operand: a source value plus the
// v_perm byte-select mask describing which of its bytes feed the dot.
// NOTE(review): this excerpt is missing the struct's other members (original
// lines 14200/14202 were dropped by extraction). Code below accesses fields
// named SrcOp (an SDValue) and DWordOffset in addition to PermMask --
// confirm the full definition against the original source.
14199struct DotSrc {
14201  int64_t PermMask;
14203};
14204
// Record one (Src0, Src1) byte pair for position `Step` of the dot4 chain,
// folding each into an existing DotSrc entry (same source value and dword)
// when possible, otherwise appending new entries to Src0s / Src1s.
// NOTE(review): the first lines of this function's signature (original lines
// 14205-14207, presumably
// "static void placeSources(ByteProvider<SDValue> &Src0,
//                           ByteProvider<SDValue> &Src1,
//                           SmallVectorImpl<DotSrc> &Src0s,")
// are missing from this excerpt; confirm against the full source.
14208                         SmallVectorImpl<DotSrc> &Src1s, int Step) {
14209
14210  assert(Src0.Src.has_value() && Src1.Src.has_value());
14211  // Src0s and Src1s are empty, just place arbitrarily.
14212  if (Step == 0) {
    // The mask selects the source byte into the top lane and marks the
    // three remaining lanes constant-zero (0x0c).
14213    Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14214                     Src0.SrcOffset / 4});
14215    Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14216                     Src1.SrcOffset / 4});
14217    return;
14218  }
14219
  // Try both pairings: (Src0->first, Src1->second) then the swap, since the
  // multiply is commutative.
14220  for (int BPI = 0; BPI < 2; BPI++) {
14221    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14222    if (BPI == 1) {
14223      BPP = {Src1, Src0};
14224    }
14225    unsigned ZeroMask = 0x0c0c0c0c;
14226    unsigned FMask = 0xFF << (8 * (3 - Step));
14227
14228    unsigned FirstMask =
14229        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14230    unsigned SecondMask =
14231        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14232    // Attempt to find Src vector which contains our SDValue, if so, add our
14233    // perm mask to the existing one. If we are unable to find a match for the
14234    // first SDValue, attempt to find match for the second.
14235    int FirstGroup = -1;
14236    for (int I = 0; I < 2; I++) {
14237      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14238      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14239        return IterElt.SrcOp == *BPP.first.Src &&
14240               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14241      };
14242
14243      auto *Match = llvm::find_if(Srcs, MatchesFirst);
14244      if (Match != Srcs.end()) {
14245        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14246        FirstGroup = I;
14247        break;
14248      }
14249    }
    // The second element must go in the opposite group from the first so the
    // pairing stays one-from-each-operand.
14250    if (FirstGroup != -1) {
14251      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14252      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14253        return IterElt.SrcOp == *BPP.second.Src &&
14254               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14255      };
14256      auto *Match = llvm::find_if(Srcs, MatchesSecond);
14257      if (Match != Srcs.end()) {
14258        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14259      } else
14260        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14261      return;
14262    }
14263  }
14264
14265  // If we have made it here, then we could not find a match in Src0s or Src1s
14266  // for either Src0 or Src1, so just place them arbitrarily.
14267
14268  unsigned ZeroMask = 0x0c0c0c0c;
14269  unsigned FMask = 0xFF << (8 * (3 - Step));
14270
14271  Src0s.push_back(
14272      {*Src0.Src,
14273       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14274       Src0.SrcOffset / 4});
14275  Src1s.push_back(
14276      {*Src1.Src,
14277       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14278       Src1.SrcOffset / 4});
14279}
14280
// Materialize one 32-bit dot4 operand from the collected DotSrc entries,
// combining multiple sources with AMDGPUISD::PERM and OR.
// NOTE(review): the first line of this function's signature (original line
// 14281, presumably "static SDValue resolveSources(SelectionDAG &DAG,
// SDLoc SL,") is missing from this excerpt; confirm against the full source.
14282                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14283                              bool IsAny) {
14284
14285  // If we just have one source, just permute it accordingly.
14286  if (Srcs.size() == 1) {
14287    auto *Elt = Srcs.begin();
14288    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14289
14290    // v_perm will produce the original value
14291    if (Elt->PermMask == 0x3020100)
14292      return EltOp;
14293
14294    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14295                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14296  }
14297
14298  auto *FirstElt = Srcs.begin();
14299  auto *SecondElt = std::next(FirstElt);
14300
// NOTE(review): original line 14301 is missing from this excerpt; the loop
// below pushes into a local vector -- presumably
// "SmallVector<SDValue, 3> Perms;". Confirm against the full source.
14302
14303  // If we have multiple sources in the chain, combine them via perms (using
14304  // calculated perm mask) and Ors.
14305  while (true) {
14306    auto FirstMask = FirstElt->PermMask;
14307    auto SecondMask = SecondElt->PermMask;
14308
    // Retag the first mask's data selections so they pick from operand 1 of
    // the PERM (byte indices 4..7) instead of operand 0, leaving 0x0c lanes
    // untouched.
14309    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14310    unsigned FirstPlusFour = FirstMask | 0x04040404;
14311    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
14312    // original 0x0C.
14313    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14314
14315    auto PermMask = addPermMasks(FirstMask, SecondMask);
14316    auto FirstVal =
14317        getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14318    auto SecondVal =
14319        getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14320
14321    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14322                                SecondVal,
14323                                DAG.getConstant(PermMask, SL, MVT::i32)));
14324
    // Advance to the next pair of sources.
14325    FirstElt = std::next(SecondElt);
14326    if (FirstElt == Srcs.end())
14327      break;
14328
14329    SecondElt = std::next(FirstElt);
14330    // If we only have a FirstElt, then just combine that into the cumulative
14331    // source node.
14332    if (SecondElt == Srcs.end()) {
14333      auto EltOp =
14334          getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14335
14336      Perms.push_back(
14337          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14338                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14339      break;
14340    }
14341  }
14342
  // At most two PERM results can come out of the loop above; OR them if two.
14343  assert(Perms.size() == 1 || Perms.size() == 2);
14344  return Perms.size() == 2
14345             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14346             : Perms[0];
14347}
14348
14349static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14350 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14351 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14352 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14353 EntryMask += ZeroMask;
14354 }
14355}
14356
14357static bool isMul(const SDValue Op) {
14358 auto Opcode = Op.getOpcode();
14359
14360 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14361 Opcode == AMDGPUISD::MUL_I24);
14362}
14363
// Decide whether this multiply should use the signed (sdot4) or unsigned
// (udot4) semantics based on what computeKnownBits says about each factor's
// sign bit. Returns std::nullopt when the two factors' signedness conflicts.
14364static std::optional<bool>
// NOTE(review): original line 14365 (the function name and first parameters,
// presumably "checkDot4MulSignedness(const SDValue &N,
// ByteProvider<SDValue> &Src0,") is missing from this excerpt; confirm
// against the full source.
14366                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14367                        const SDValue &S1Op, const SelectionDAG &DAG) {
14368  // If we both ops are i8s (pre legalize-dag), then the signedness semantics
14369  // of the dot4 is irrelevant.
14370  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14371    return false;
14372
  // A known leading zero means the value is non-negative (unsigned-safe);
  // a known leading one means it is negative (signed).
14373  auto Known0 = DAG.computeKnownBits(S0Op, 0);
14374  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14375  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14376  auto Known1 = DAG.computeKnownBits(S1Op, 0);
14377  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14378  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14379
14380  assert(!(S0IsUnsigned && S0IsSigned));
14381  assert(!(S1IsUnsigned && S1IsSigned));
14382
14383  // There are 9 possible permutations of
14384  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14385
14386  // In two permutations, the sign bits are known to be the same for both Ops,
14387  // so simply return Signed / Unsigned corresponding to the MSB
14388
14389  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14390    return S0IsSigned;
14391
14392  // In another two permutations, the sign bits are known to be opposite. In
14393  // this case return std::nullopt to indicate a bad match.
14394
14395  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14396    return std::nullopt;
14397
14398  // In the remaining five permutations, we don't know the value of the sign
14399  // bit for at least one Op. Since we have a valid ByteProvider, we know that
14400  // the upper bits must be extension bits. Thus, the only ways for the sign
14401  // bit to be unknown is if it was sign extended from unknown value, or if it
14402  // was any extended. In either case, it is correct to use the signed
14403  // version of the signedness semantics of dot4
14404
14405  // In two of such permutations, we known the sign bit is set for
14406  // one op, and the other is unknown. It is okay to used signed version of
14407  // dot4.
14408  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14409      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14410    return true;
14411
14412  // In one such permutation, we don't know either of the sign bits. It is okay
14413  // to used the signed version of dot4.
14414  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14415    return true;
14416
14417  // In two of such permutations, we known the sign bit is unset for
14418  // one op, and the other is unknown. Return std::nullopt to indicate a
14419  // bad match.
14420  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14421      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14422    return std::nullopt;
14423
14424  llvm_unreachable("Fully covered condition");
14425}
14426
// Combine for ISD::ADD: tries, in order, mad64_32 folding, scalar
// reassociation, the high-half-only 64-bit add fold, dot4 formation from a
// chain of byte multiplies, and finally carry-op folds.
14427SDValue SITargetLowering::performAddCombine(SDNode *N,
14428                                            DAGCombinerInfo &DCI) const {
14429  SelectionDAG &DAG = DCI.DAG;
14430  EVT VT = N->getValueType(0);
14431  SDLoc SL(N);
14432  SDValue LHS = N->getOperand(0);
14433  SDValue RHS = N->getOperand(1);
14434
14435  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14436    if (Subtarget->hasMad64_32()) {
14437      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14438        return Folded;
14439    }
14440  }
14441
14442  if (SDValue V = reassociateScalarOps(N, DAG)) {
14443    return V;
14444  }
14445
14446  if (VT == MVT::i64) {
14447    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14448      return Folded;
14449  }
14450
  // Attempt to recognize an add-chain of byte multiplies and turn it into a
  // single v_dot4 intrinsic call.
14451  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14452      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14453    SDValue TempNode(N, 0);
14454    std::optional<bool> IsSigned;
// NOTE(review): original lines 14455-14457 are missing from this excerpt;
// the loop below uses locals Src0s, Src1s and Src2s -- presumably
// "SmallVector<DotSrc, 4> Src0s; SmallVector<DotSrc, 4> Src1s;
// SmallVector<SDValue, 4> Src2s;". Confirm against the full source.
14458
14459    // Match the v_dot4 tree, while collecting src nodes.
14460    int ChainLength = 0;
14461    for (int I = 0; I < 4; I++) {
14462      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14463      if (MulIdx == -1)
14464        break;
14465      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14466      if (!Src0)
14467        break;
14468      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14469      if (!Src1)
14470        break;
14471
      // All multiplies in the chain must agree on signed vs. unsigned dot4
      // semantics.
14472      auto IterIsSigned = checkDot4MulSignedness(
14473          TempNode->getOperand(MulIdx), *Src0, *Src1,
14474          TempNode->getOperand(MulIdx)->getOperand(0),
14475          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14476      if (!IterIsSigned)
14477        break;
14478      if (!IsSigned)
14479        IsSigned = *IterIsSigned;
14480      if (*IterIsSigned != *IsSigned)
14481        break;
14482      placeSources(*Src0, *Src1, Src0s, Src1s, I);
14483      auto AddIdx = 1 - MulIdx;
14484      // Allow the special case where add (add (mul24, 0), mul24) became ->
14485      // add (mul24, mul24).
14486      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14487        Src2s.push_back(TempNode->getOperand(AddIdx));
14488        auto Src0 =
14489            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14490        if (!Src0)
14491          break;
14492        auto Src1 =
14493            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14494        if (!Src1)
14495          break;
14496        auto IterIsSigned = checkDot4MulSignedness(
14497            TempNode->getOperand(AddIdx), *Src0, *Src1,
14498            TempNode->getOperand(AddIdx)->getOperand(0),
14499            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14500        if (!IterIsSigned)
14501          break;
14502        assert(IsSigned);
14503        if (*IterIsSigned != *IsSigned)
14504          break;
14505        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14506        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14507        ChainLength = I + 2;
14508        break;
14509      }
14510
      // Walk down the add chain to the next candidate multiply.
14511      TempNode = TempNode->getOperand(AddIdx);
14512      Src2s.push_back(TempNode);
14513      ChainLength = I + 1;
14514      if (TempNode->getNumOperands() < 2)
14515        break;
14516      LHS = TempNode->getOperand(0);
14517      RHS = TempNode->getOperand(1);
14518    }
14519
14520    if (ChainLength < 2)
14521      return SDValue();
14522
14523    // Masks were constructed with assumption that we would find a chain of
14524    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14525    // 0x0c) so they do not affect dot calculation.
14526    if (ChainLength < 4) {
14527      fixMasks(Src0s, ChainLength);
14528      fixMasks(Src1s, ChainLength);
14529    }
14530
14531    SDValue Src0, Src1;
14532
14533    // If we are just using a single source for both, and have permuted the
14534    // bytes consistently, we can just use the sources without permuting
14535    // (commutation).
14536    bool UseOriginalSrc = false;
14537    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14538        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14539        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14540        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14541      SmallVector<unsigned, 4> SrcBytes;
14542      auto Src0Mask = Src0s.begin()->PermMask;
14543      SrcBytes.push_back(Src0Mask & 0xFF000000);
14544      bool UniqueEntries = true;
14545      for (auto I = 1; I < 4; I++) {
14546        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14547
14548        if (is_contained(SrcBytes, NextByte)) {
14549          UniqueEntries = false;
14550          break;
14551        }
14552        SrcBytes.push_back(NextByte);
14553      }
14554
14555      if (UniqueEntries) {
14556        UseOriginalSrc = true;
14557
14558        auto *FirstElt = Src0s.begin();
14559        auto FirstEltOp =
14560            getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14561
14562        auto *SecondElt = Src1s.begin();
14563        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14564                                              SecondElt->DWordOffset);
14565
14566        Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14567                                             MVT::getIntegerVT(32));
14568        Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14569                                             MVT::getIntegerVT(32));
14570      }
14571    }
14572
14573    if (!UseOriginalSrc) {
14574      Src0 = resolveSources(DAG, SL, Src0s, false, true);
14575      Src1 = resolveSources(DAG, SL, Src1s, false, true);
14576    }
14577
    // The accumulator is whatever remained at the tail of the add chain.
14578    assert(IsSigned);
14579    SDValue Src2 =
14580        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14581
14582    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14583                                                  : Intrinsic::amdgcn_udot4,
14584                                        SL, MVT::i64);
14585
14586    assert(!VT.isVector());
14587    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14588                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14589
14590    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14591  }
14592
14593  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14594    return SDValue();
14595
14596  // add x, zext (setcc) => uaddo_carry x, 0, setcc
14597  // add x, sext (setcc) => usubo_carry x, 0, setcc
  // Canonicalize the extend/carry candidate into RHS.
14598  unsigned Opc = LHS.getOpcode();
14599  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14600      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14601    std::swap(RHS, LHS);
14602
14603  Opc = RHS.getOpcode();
14604  switch (Opc) {
14605  default:
14606    break;
14607  case ISD::ZERO_EXTEND:
14608  case ISD::SIGN_EXTEND:
14609  case ISD::ANY_EXTEND: {
14610    auto Cond = RHS.getOperand(0);
14611    // If this won't be a real VOPC output, we would still need to insert an
14612    // extra instruction anyway.
14613    if (!isBoolSGPR(Cond))
14614      break;
14615    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14616    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
// NOTE(review): original line 14617 is missing from this excerpt; it
// presumably remaps Opc to the carry opcode, e.g.
// "Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;"
// per the comment above. Confirm against the full source.
14618    return DAG.getNode(Opc, SL, VTList, Args);
14619  }
14620  case ISD::UADDO_CARRY: {
14621    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14622    if (!isNullConstant(RHS.getOperand(1)))
14623      break;
14624    SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14625    return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14626  }
14627  }
14628  return SDValue();
14629}
14630
// Combine for ISD::SUB: high-half-only 64-bit fold, then folds of
// zext/sext(setcc) and usubo_carry operands into carry operations.
14631SDValue SITargetLowering::performSubCombine(SDNode *N,
14632                                            DAGCombinerInfo &DCI) const {
14633  SelectionDAG &DAG = DCI.DAG;
14634  EVT VT = N->getValueType(0);
14635
14636  if (VT == MVT::i64) {
14637    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14638      return Folded;
14639  }
14640
14641  if (VT != MVT::i32)
14642    return SDValue();
14643
14644  SDLoc SL(N);
14645  SDValue LHS = N->getOperand(0);
14646  SDValue RHS = N->getOperand(1);
14647
14648  // sub x, zext (setcc) => usubo_carry x, 0, setcc
14649  // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14650  unsigned Opc = RHS.getOpcode();
14651  switch (Opc) {
14652  default:
14653    break;
14654  case ISD::ZERO_EXTEND:
14655  case ISD::SIGN_EXTEND:
14656  case ISD::ANY_EXTEND: {
14657    auto Cond = RHS.getOperand(0);
14658    // If this won't be a real VOPC output, we would still need to insert an
14659    // extra instruction anyway.
14660    if (!isBoolSGPR(Cond))
14661      break;
14662    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14663    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
// NOTE(review): original line 14664 is missing from this excerpt; it
// presumably remaps Opc to the carry opcode, e.g.
// "Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;"
// per the comment above. Confirm against the full source.
14665    return DAG.getNode(Opc, SL, VTList, Args);
14666  }
14667  }
14668
14669  if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14670    // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14671    if (!isNullConstant(LHS.getOperand(1)))
14672      return SDValue();
14673    SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14674    return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14675  }
14676  return SDValue();
14677}
14678
14679SDValue
14680SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14681 DAGCombinerInfo &DCI) const {
14682
14683 if (N->getValueType(0) != MVT::i32)
14684 return SDValue();
14685
14686 if (!isNullConstant(N->getOperand(1)))
14687 return SDValue();
14688
14689 SelectionDAG &DAG = DCI.DAG;
14690 SDValue LHS = N->getOperand(0);
14691
14692 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14693 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14694 unsigned LHSOpc = LHS.getOpcode();
14695 unsigned Opc = N->getOpcode();
14696 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14697 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14698 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14699 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14700 }
14701 return SDValue();
14702}
14703
14704SDValue SITargetLowering::performFAddCombine(SDNode *N,
14705 DAGCombinerInfo &DCI) const {
14706 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14707 return SDValue();
14708
14709 SelectionDAG &DAG = DCI.DAG;
14710 EVT VT = N->getValueType(0);
14711
14712 SDLoc SL(N);
14713 SDValue LHS = N->getOperand(0);
14714 SDValue RHS = N->getOperand(1);
14715
14716 // These should really be instruction patterns, but writing patterns with
14717 // source modifiers is a pain.
14718
14719 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14720 if (LHS.getOpcode() == ISD::FADD) {
14721 SDValue A = LHS.getOperand(0);
14722 if (A == LHS.getOperand(1)) {
14723 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14724 if (FusedOp != 0) {
14725 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14726 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14727 }
14728 }
14729 }
14730
14731 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14732 if (RHS.getOpcode() == ISD::FADD) {
14733 SDValue A = RHS.getOperand(0);
14734 if (A == RHS.getOperand(1)) {
14735 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14736 if (FusedOp != 0) {
14737 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14738 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14739 }
14740 }
14741 }
14742
14743 return SDValue();
14744}
14745
14746SDValue SITargetLowering::performFSubCombine(SDNode *N,
14747 DAGCombinerInfo &DCI) const {
14748 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14749 return SDValue();
14750
14751 SelectionDAG &DAG = DCI.DAG;
14752 SDLoc SL(N);
14753 EVT VT = N->getValueType(0);
14754 assert(!VT.isVector());
14755
14756 // Try to get the fneg to fold into the source modifier. This undoes generic
14757 // DAG combines and folds them into the mad.
14758 //
14759 // Only do this if we are not trying to support denormals. v_mad_f32 does
14760 // not support denormals ever.
14761 SDValue LHS = N->getOperand(0);
14762 SDValue RHS = N->getOperand(1);
14763 if (LHS.getOpcode() == ISD::FADD) {
14764 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14765 SDValue A = LHS.getOperand(0);
14766 if (A == LHS.getOperand(1)) {
14767 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14768 if (FusedOp != 0) {
14769 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14770 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14771
14772 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14773 }
14774 }
14775 }
14776
14777 if (RHS.getOpcode() == ISD::FADD) {
14778 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14779
14780 SDValue A = RHS.getOperand(0);
14781 if (A == RHS.getOperand(1)) {
14782 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14783 if (FusedOp != 0) {
14784 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14785 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14786 }
14787 }
14788 }
14789
14790 return SDValue();
14791}
14792
14793SDValue SITargetLowering::performFDivCombine(SDNode *N,
14794 DAGCombinerInfo &DCI) const {
14795 SelectionDAG &DAG = DCI.DAG;
14796 SDLoc SL(N);
14797 EVT VT = N->getValueType(0);
14798 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14799 return SDValue();
14800
14801 SDValue LHS = N->getOperand(0);
14802 SDValue RHS = N->getOperand(1);
14803
14804 SDNodeFlags Flags = N->getFlags();
14805 SDNodeFlags RHSFlags = RHS->getFlags();
14806 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14807 !RHS->hasOneUse())
14808 return SDValue();
14809
14810 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14811 bool IsNegative = false;
14812 if (CLHS->isExactlyValue(1.0) ||
14813 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14814 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14815 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14816 if (RHS.getOpcode() == ISD::FSQRT) {
14817 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14818 SDValue Rsq =
14819 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14820 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14821 }
14822 }
14823 }
14824
14825 return SDValue();
14826}
14827
// Combine for fmul: rewrite a multiply by a select between two powers of two
// as an ldexp with an integer-select exponent (cheaper to materialize).
14828SDValue SITargetLowering::performFMulCombine(SDNode *N,
14829                                             DAGCombinerInfo &DCI) const {
14830  SelectionDAG &DAG = DCI.DAG;
14831  EVT VT = N->getValueType(0);
14832  EVT ScalarVT = VT.getScalarType();
14833  EVT IntVT = VT.changeElementType(MVT::i32);
14834
14835  SDValue LHS = N->getOperand(0);
14836  SDValue RHS = N->getOperand(1);
14837
14838  // It is cheaper to realize i32 inline constants as compared against
14839  // materializing f16 or f64 (or even non-inline f32) values,
14840  // possible via ldexp usage, as shown below :
14841  //
14842  // Given : A = 2^a & B = 2^b ; where a and b are integers.
14843  // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14844  // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
14845  if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14846      (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
    // Both select arms must be (splat) FP constants of the same sign.
14847    const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14848    if (!TrueNode)
14849      return SDValue();
14850    const ConstantFPSDNode *FalseNode =
14851        isConstOrConstSplatFP(RHS.getOperand(2));
14852    if (!FalseNode)
14853      return SDValue();
14854
14855    if (TrueNode->isNegative() != FalseNode->isNegative())
14856      return SDValue();
14857
14858    // For f32, only non-inline constants should be transformed.
// NOTE(review): original line 14859 is missing from this excerpt; it
// presumably defines TII, e.g.
// "const SIInstrInfo *TII = getSubtarget()->getInstrInfo();". Confirm
// against the full source.
14860    if (ScalarVT == MVT::f32 &&
14861        TII->isInlineConstant(TrueNode->getValueAPF()) &&
14862        TII->isInlineConstant(FalseNode->getValueAPF()))
14863      return SDValue();
14864
    // Both arms must be exact powers of two (getExactLog2Abs returns INT_MIN
    // otherwise).
14865    int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14866    if (TrueNodeExpVal == INT_MIN)
14867      return SDValue();
14868    int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14869    if (FalseNodeExpVal == INT_MIN)
14870      return SDValue();
14871
14872    SDLoc SL(N);
14873    SDValue SelectNode =
14874        DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14875                    DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14876                    DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14877
    // A common negative sign on both arms is absorbed by negating x.
14878    LHS = TrueNode->isNegative()
14879              ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14880              : LHS;
14881
14882    return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14883  }
14884
14885  return SDValue();
14886}
14887
// Combine for fma: fuse two chained f16->f32 extending FMAs over elements of
// the same v2f16 vectors into a single FDOT2.
14888SDValue SITargetLowering::performFMACombine(SDNode *N,
14889                                            DAGCombinerInfo &DCI) const {
14890  SelectionDAG &DAG = DCI.DAG;
14891  EVT VT = N->getValueType(0);
14892  SDLoc SL(N);
14893
14894  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14895    return SDValue();
14896
14897  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14898  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14899  SDValue Op1 = N->getOperand(0);
14900  SDValue Op2 = N->getOperand(1);
14901  SDValue FMA = N->getOperand(2);
14902
14903  if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14904      Op2.getOpcode() != ISD::FP_EXTEND)
14905    return SDValue();
14906
14907  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14908  // regardless of the denorm mode setting. Therefore,
14909  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14910  const TargetOptions &Options = DAG.getTarget().Options;
14911  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14912      (N->getFlags().hasAllowContract() &&
14913       FMA->getFlags().hasAllowContract())) {
14914    Op1 = Op1.getOperand(0);
14915    Op2 = Op2.getOperand(0);
14916    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
// NOTE(review): original line 14917 is missing from this excerpt; it
// presumably completes the condition with
// "Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)". Confirm against the full
// source.
14918      return SDValue();
14919
14920    SDValue Vec1 = Op1.getOperand(0);
14921    SDValue Idx1 = Op1.getOperand(1);
14922    SDValue Vec2 = Op2.getOperand(0);
14923
14924    SDValue FMAOp1 = FMA.getOperand(0);
14925    SDValue FMAOp2 = FMA.getOperand(1);
14926    SDValue FMAAcc = FMA.getOperand(2);
14927
14928    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14929        FMAOp2.getOpcode() != ISD::FP_EXTEND)
14930      return SDValue();
14931
14932    FMAOp1 = FMAOp1.getOperand(0);
14933    FMAOp2 = FMAOp2.getOperand(0);
14934    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
// NOTE(review): original line 14935 is missing from this excerpt; it
// presumably completes the condition with
// "FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)". Confirm against the
// full source.
14936      return SDValue();
14937
14938    SDValue Vec3 = FMAOp1.getOperand(0);
14939    SDValue Vec4 = FMAOp2.getOperand(0);
14940    SDValue Idx2 = FMAOp1.getOperand(1);
14941
    // The two FMAs must read matching lanes, and the lanes must differ so
    // the pair covers both halves of the v2f16 inputs.
14942    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14943        // Idx1 and Idx2 cannot be the same.
14944        Idx1 == Idx2)
14945      return SDValue();
14946
14947    if (Vec1 == Vec2 || Vec3 == Vec4)
14948      return SDValue();
14949
14950    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14951      return SDValue();
14952
14953    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14954      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14955                         DAG.getTargetConstant(0, SL, MVT::i1));
14956    }
14957  }
14958  return SDValue();
14959}
14960
14961SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14962 DAGCombinerInfo &DCI) const {
14963 SelectionDAG &DAG = DCI.DAG;
14964 SDLoc SL(N);
14965
14966 SDValue LHS = N->getOperand(0);
14967 SDValue RHS = N->getOperand(1);
14968 EVT VT = LHS.getValueType();
14969 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14970
14971 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14972 if (!CRHS) {
14973 CRHS = dyn_cast<ConstantSDNode>(LHS);
14974 if (CRHS) {
14975 std::swap(LHS, RHS);
14977 }
14978 }
14979
14980 if (CRHS) {
14981 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14982 isBoolSGPR(LHS.getOperand(0))) {
14983 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14984 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14985 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14986 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14987 if ((CRHS->isAllOnes() &&
14988 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14989 (CRHS->isZero() &&
14990 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14991 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14992 DAG.getAllOnesConstant(SL, MVT::i1));
14993 if ((CRHS->isAllOnes() &&
14994 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14995 (CRHS->isZero() &&
14996 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14997 return LHS.getOperand(0);
14998 }
14999
15000 const APInt &CRHSVal = CRHS->getAPIntValue();
15001 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15002 LHS.getOpcode() == ISD::SELECT &&
15003 isa<ConstantSDNode>(LHS.getOperand(1)) &&
15004 isa<ConstantSDNode>(LHS.getOperand(2)) &&
15005 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15006 isBoolSGPR(LHS.getOperand(0))) {
15007 // Given CT != FT:
15008 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15009 // setcc (select cc, CT, CF), CF, ne => cc
15010 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15011 // setcc (select cc, CT, CF), CT, eq => cc
15012 const APInt &CT = LHS.getConstantOperandAPInt(1);
15013 const APInt &CF = LHS.getConstantOperandAPInt(2);
15014
15015 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15016 (CT == CRHSVal && CC == ISD::SETNE))
15017 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15018 DAG.getAllOnesConstant(SL, MVT::i1));
15019 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15020 (CT == CRHSVal && CC == ISD::SETEQ))
15021 return LHS.getOperand(0);
15022 }
15023 }
15024
15025 if (VT != MVT::f32 && VT != MVT::f64 &&
15026 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15027 return SDValue();
15028
15029 // Match isinf/isfinite pattern
15030 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15031 // (fcmp one (fabs x), inf) -> (fp_class x,
15032 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15033 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15034 LHS.getOpcode() == ISD::FABS) {
15035 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15036 if (!CRHS)
15037 return SDValue();
15038
15039 const APFloat &APF = CRHS->getValueAPF();
15040 if (APF.isInfinity() && !APF.isNegative()) {
15041 const unsigned IsInfMask =
15043 const unsigned IsFiniteMask =
15047 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15048 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15049 DAG.getConstant(Mask, SL, MVT::i32));
15050 }
15051 }
15052
15053 return SDValue();
15054}
15055
15056SDValue
15057SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15058 DAGCombinerInfo &DCI) const {
15059 SelectionDAG &DAG = DCI.DAG;
15060 SDLoc SL(N);
15061 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15062
15063 SDValue Src = N->getOperand(0);
15064 SDValue Shift = N->getOperand(0);
15065
15066 // TODO: Extend type shouldn't matter (assuming legal types).
15067 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15068 Shift = Shift.getOperand(0);
15069
15070 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15071 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15072 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15073 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15074 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15075 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
15076 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15077 SDValue Shifted = DAG.getZExtOrTrunc(
15078 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
15079
15080 unsigned ShiftOffset = 8 * Offset;
15081 if (Shift.getOpcode() == ISD::SHL)
15082 ShiftOffset -= C->getZExtValue();
15083 else
15084 ShiftOffset += C->getZExtValue();
15085
15086 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15087 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15088 MVT::f32, Shifted);
15089 }
15090 }
15091 }
15092
15093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15094 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15095 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15096 // We simplified Src. If this node is not dead, visit it again so it is
15097 // folded properly.
15098 if (N->getOpcode() != ISD::DELETED_NODE)
15099 DCI.AddToWorklist(N);
15100 return SDValue(N, 0);
15101 }
15102
15103 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15104 if (SDValue DemandedSrc =
15106 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15107
15108 return SDValue();
15109}
15110
15111SDValue SITargetLowering::performClampCombine(SDNode *N,
15112 DAGCombinerInfo &DCI) const {
15113 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15114 if (!CSrc)
15115 return SDValue();
15116
15117 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15118 const APFloat &F = CSrc->getValueAPF();
15119 APFloat Zero = APFloat::getZero(F.getSemantics());
15120 if (F < Zero ||
15121 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15122 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15123 }
15124
15125 APFloat One(F.getSemantics(), "1.0");
15126 if (F > One)
15127 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15128
15129 return SDValue(CSrc, 0);
15130}
15131
15133 DAGCombinerInfo &DCI) const {
15134 switch (N->getOpcode()) {
15135 case ISD::ADD:
15136 case ISD::SUB:
15137 case ISD::SHL:
15138 case ISD::SRL:
15139 case ISD::SRA:
15140 case ISD::AND:
15141 case ISD::OR:
15142 case ISD::XOR:
15143 case ISD::MUL:
15144 case ISD::SETCC:
15145 case ISD::SELECT:
15146 case ISD::SMIN:
15147 case ISD::SMAX:
15148 case ISD::UMIN:
15149 case ISD::UMAX:
15150 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15151 return Res;
15152 break;
15153 default:
15154 break;
15155 }
15156
15157 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15158 return SDValue();
15159
15160 switch (N->getOpcode()) {
15161 case ISD::ADD:
15162 return performAddCombine(N, DCI);
15163 case ISD::SUB:
15164 return performSubCombine(N, DCI);
15165 case ISD::UADDO_CARRY:
15166 case ISD::USUBO_CARRY:
15167 return performAddCarrySubCarryCombine(N, DCI);
15168 case ISD::FADD:
15169 return performFAddCombine(N, DCI);
15170 case ISD::FSUB:
15171 return performFSubCombine(N, DCI);
15172 case ISD::FDIV:
15173 return performFDivCombine(N, DCI);
15174 case ISD::FMUL:
15175 return performFMulCombine(N, DCI);
15176 case ISD::SETCC:
15177 return performSetCCCombine(N, DCI);
15178 case ISD::FMAXNUM:
15179 case ISD::FMINNUM:
15180 case ISD::FMAXNUM_IEEE:
15181 case ISD::FMINNUM_IEEE:
15182 case ISD::FMAXIMUM:
15183 case ISD::FMINIMUM:
15184 case ISD::SMAX:
15185 case ISD::SMIN:
15186 case ISD::UMAX:
15187 case ISD::UMIN:
15190 return performMinMaxCombine(N, DCI);
15191 case ISD::FMA:
15192 return performFMACombine(N, DCI);
15193 case ISD::AND:
15194 return performAndCombine(N, DCI);
15195 case ISD::OR:
15196 return performOrCombine(N, DCI);
15197 case ISD::FSHR: {
15199 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15200 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15201 return matchPERM(N, DCI);
15202 }
15203 break;
15204 }
15205 case ISD::XOR:
15206 return performXorCombine(N, DCI);
15207 case ISD::ZERO_EXTEND:
15208 return performZeroExtendCombine(N, DCI);
15210 return performSignExtendInRegCombine(N, DCI);
15212 return performClassCombine(N, DCI);
15213 case ISD::FCANONICALIZE:
15214 return performFCanonicalizeCombine(N, DCI);
15215 case AMDGPUISD::RCP:
15216 return performRcpCombine(N, DCI);
15217 case ISD::FLDEXP:
15218 case AMDGPUISD::FRACT:
15219 case AMDGPUISD::RSQ:
15222 case AMDGPUISD::RSQ_CLAMP: {
15223 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15224 SDValue Src = N->getOperand(0);
15225 if (Src.isUndef())
15226 return Src;
15227 break;
15228 }
15229 case ISD::SINT_TO_FP:
15230 case ISD::UINT_TO_FP:
15231 return performUCharToFloatCombine(N, DCI);
15232 case ISD::FCOPYSIGN:
15233 return performFCopySignCombine(N, DCI);
15238 return performCvtF32UByteNCombine(N, DCI);
15239 case AMDGPUISD::FMED3:
15240 return performFMed3Combine(N, DCI);
15242 return performCvtPkRTZCombine(N, DCI);
15243 case AMDGPUISD::CLAMP:
15244 return performClampCombine(N, DCI);
15245 case ISD::SCALAR_TO_VECTOR: {
15246 SelectionDAG &DAG = DCI.DAG;
15247 EVT VT = N->getValueType(0);
15248
15249 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15250 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15251 SDLoc SL(N);
15252 SDValue Src = N->getOperand(0);
15253 EVT EltVT = Src.getValueType();
15254 if (EltVT != MVT::i16)
15255 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15256
15257 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15258 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15259 }
15260
15261 break;
15262 }
15264 return performExtractVectorEltCombine(N, DCI);
15266 return performInsertVectorEltCombine(N, DCI);
15267 case ISD::FP_ROUND:
15268 return performFPRoundCombine(N, DCI);
15269 case ISD::LOAD: {
15270 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15271 return Widened;
15272 [[fallthrough]];
15273 }
15274 default: {
15275 if (!DCI.isBeforeLegalize()) {
15276 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15277 return performMemSDNodeCombine(MemNode, DCI);
15278 }
15279
15280 break;
15281 }
15282 }
15283
15285}
15286
15287/// Helper function for adjustWritemask
15288static unsigned SubIdx2Lane(unsigned Idx) {
15289 switch (Idx) {
15290 default:
15291 return ~0u;
15292 case AMDGPU::sub0:
15293 return 0;
15294 case AMDGPU::sub1:
15295 return 1;
15296 case AMDGPU::sub2:
15297 return 2;
15298 case AMDGPU::sub3:
15299 return 3;
15300 case AMDGPU::sub4:
15301 return 4; // Possible with TFE/LWE
15302 }
15303}
15304
15305/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15306SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15307 SelectionDAG &DAG) const {
15308 unsigned Opcode = Node->getMachineOpcode();
15309
15310 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15311 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15312 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15313 return Node; // not implemented for D16
15314
15315 SDNode *Users[5] = {nullptr};
15316 unsigned Lane = 0;
15317 unsigned DmaskIdx =
15318 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15319 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15320 unsigned NewDmask = 0;
15321 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15322 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15323 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15324 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15325 ? true
15326 : false;
15327 unsigned TFCLane = 0;
15328 bool HasChain = Node->getNumValues() > 1;
15329
15330 if (OldDmask == 0) {
15331 // These are folded out, but on the chance it happens don't assert.
15332 return Node;
15333 }
15334
15335 unsigned OldBitsSet = llvm::popcount(OldDmask);
15336 // Work out which is the TFE/LWE lane if that is enabled.
15337 if (UsesTFC) {
15338 TFCLane = OldBitsSet;
15339 }
15340
15341 // Try to figure out the used register components
15342 for (SDUse &Use : Node->uses()) {
15343
15344 // Don't look at users of the chain.
15345 if (Use.getResNo() != 0)
15346 continue;
15347
15348 SDNode *User = Use.getUser();
15349
15350 // Abort if we can't understand the usage
15351 if (!User->isMachineOpcode() ||
15352 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15353 return Node;
15354
15355 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15356 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15357 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15358 // set, etc.
15359 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15360 if (Lane == ~0u)
15361 return Node;
15362
15363 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15364 if (UsesTFC && Lane == TFCLane) {
15365 Users[Lane] = User;
15366 } else {
15367 // Set which texture component corresponds to the lane.
15368 unsigned Comp;
15369 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15370 Comp = llvm::countr_zero(Dmask);
15371 Dmask &= ~(1 << Comp);
15372 }
15373
15374 // Abort if we have more than one user per component.
15375 if (Users[Lane])
15376 return Node;
15377
15378 Users[Lane] = User;
15379 NewDmask |= 1 << Comp;
15380 }
15381 }
15382
15383 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15384 bool NoChannels = !NewDmask;
15385 if (NoChannels) {
15386 if (!UsesTFC) {
15387 // No uses of the result and not using TFC. Then do nothing.
15388 return Node;
15389 }
15390 // If the original dmask has one channel - then nothing to do
15391 if (OldBitsSet == 1)
15392 return Node;
15393 // Use an arbitrary dmask - required for the instruction to work
15394 NewDmask = 1;
15395 }
15396 // Abort if there's no change
15397 if (NewDmask == OldDmask)
15398 return Node;
15399
15400 unsigned BitsSet = llvm::popcount(NewDmask);
15401
15402 // Check for TFE or LWE - increase the number of channels by one to account
15403 // for the extra return value
15404 // This will need adjustment for D16 if this is also included in
15405 // adjustWriteMask (this function) but at present D16 are excluded.
15406 unsigned NewChannels = BitsSet + UsesTFC;
15407
15408 int NewOpcode =
15409 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15410 assert(NewOpcode != -1 &&
15411 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15412 "failed to find equivalent MIMG op");
15413
15414 // Adjust the writemask in the node
15416 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15417 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15418 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15419
15420 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15421
15422 MVT ResultVT = NewChannels == 1
15423 ? SVT
15424 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15425 : NewChannels == 5 ? 8
15426 : NewChannels);
15427 SDVTList NewVTList =
15428 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15429
15430 MachineSDNode *NewNode =
15431 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15432
15433 if (HasChain) {
15434 // Update chain.
15435 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15436 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15437 }
15438
15439 if (NewChannels == 1) {
15440 assert(Node->hasNUsesOfValue(1, 0));
15441 SDNode *Copy =
15442 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15443 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15444 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15445 return nullptr;
15446 }
15447
15448 // Update the users of the node with the new indices
15449 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15450 SDNode *User = Users[i];
15451 if (!User) {
15452 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15453 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15454 if (i || !NoChannels)
15455 continue;
15456 } else {
15457 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15458 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15459 if (NewUser != User) {
15460 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15461 DAG.RemoveDeadNode(User);
15462 }
15463 }
15464
15465 switch (Idx) {
15466 default:
15467 break;
15468 case AMDGPU::sub0:
15469 Idx = AMDGPU::sub1;
15470 break;
15471 case AMDGPU::sub1:
15472 Idx = AMDGPU::sub2;
15473 break;
15474 case AMDGPU::sub2:
15475 Idx = AMDGPU::sub3;
15476 break;
15477 case AMDGPU::sub3:
15478 Idx = AMDGPU::sub4;
15479 break;
15480 }
15481 }
15482
15483 DAG.RemoveDeadNode(Node);
15484 return nullptr;
15485}
15486
15488 if (Op.getOpcode() == ISD::AssertZext)
15489 Op = Op.getOperand(0);
15490
15491 return isa<FrameIndexSDNode>(Op);
15492}
15493
15494/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15495/// with frame index operands.
15496/// LLVM assumes that inputs are to these instructions are registers.
15497SDNode *
15499 SelectionDAG &DAG) const {
15500 if (Node->getOpcode() == ISD::CopyToReg) {
15501 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15502 SDValue SrcVal = Node->getOperand(2);
15503
15504 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15505 // to try understanding copies to physical registers.
15506 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15507 SDLoc SL(Node);
15509 SDValue VReg = DAG.getRegister(
15510 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15511
15512 SDNode *Glued = Node->getGluedNode();
15513 SDValue ToVReg = DAG.getCopyToReg(
15514 Node->getOperand(0), SL, VReg, SrcVal,
15515 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15516 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15517 VReg, ToVReg.getValue(1));
15518 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15519 DAG.RemoveDeadNode(Node);
15520 return ToResultReg.getNode();
15521 }
15522 }
15523
15525 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15526 if (!isFrameIndexOp(Node->getOperand(i))) {
15527 Ops.push_back(Node->getOperand(i));
15528 continue;
15529 }
15530
15531 SDLoc DL(Node);
15532 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15533 Node->getOperand(i).getValueType(),
15534 Node->getOperand(i)),
15535 0));
15536 }
15537
15538 return DAG.UpdateNodeOperands(Node, Ops);
15539}
15540
15541/// Fold the instructions after selecting them.
15542/// Returns null if users were already updated.
15544 SelectionDAG &DAG) const {
15546 unsigned Opcode = Node->getMachineOpcode();
15547
15548 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15549 !TII->isGather4(Opcode) &&
15550 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15551 return adjustWritemask(Node, DAG);
15552 }
15553
15554 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15556 return Node;
15557 }
15558
15559 switch (Opcode) {
15560 case AMDGPU::V_DIV_SCALE_F32_e64:
15561 case AMDGPU::V_DIV_SCALE_F64_e64: {
15562 // Satisfy the operand register constraint when one of the inputs is
15563 // undefined. Ordinarily each undef value will have its own implicit_def of
15564 // a vreg, so force these to use a single register.
15565 SDValue Src0 = Node->getOperand(1);
15566 SDValue Src1 = Node->getOperand(3);
15567 SDValue Src2 = Node->getOperand(5);
15568
15569 if ((Src0.isMachineOpcode() &&
15570 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15571 (Src0 == Src1 || Src0 == Src2))
15572 break;
15573
15574 MVT VT = Src0.getValueType().getSimpleVT();
15575 const TargetRegisterClass *RC =
15576 getRegClassFor(VT, Src0.getNode()->isDivergent());
15577
15579 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15580
15581 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15582 Src0, SDValue());
15583
15584 // src0 must be the same register as src1 or src2, even if the value is
15585 // undefined, so make sure we don't violate this constraint.
15586 if (Src0.isMachineOpcode() &&
15587 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15588 if (Src1.isMachineOpcode() &&
15589 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15590 Src0 = Src1;
15591 else if (Src2.isMachineOpcode() &&
15592 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15593 Src0 = Src2;
15594 else {
15595 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15596 Src0 = UndefReg;
15597 Src1 = UndefReg;
15598 }
15599 } else
15600 break;
15601
15602 SmallVector<SDValue, 9> Ops(Node->ops());
15603 Ops[1] = Src0;
15604 Ops[3] = Src1;
15605 Ops[5] = Src2;
15606 Ops.push_back(ImpDef.getValue(1));
15607 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15608 }
15609 default:
15610 break;
15611 }
15612
15613 return Node;
15614}
15615
15616// Any MIMG instructions that use tfe or lwe require an initialization of the
15617// result register that will be written in the case of a memory access failure.
15618// The required code is also added to tie this init code to the result of the
15619// img instruction.
15622 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15623 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15624 MachineBasicBlock &MBB = *MI.getParent();
15625
15626 int DstIdx =
15627 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15628 unsigned InitIdx = 0;
15629
15630 if (TII->isImage(MI)) {
15631 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15632 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15633 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15634
15635 if (!TFE && !LWE) // intersect_ray
15636 return;
15637
15638 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15639 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15640 unsigned D16Val = D16 ? D16->getImm() : 0;
15641
15642 if (!TFEVal && !LWEVal)
15643 return;
15644
15645 // At least one of TFE or LWE are non-zero
15646 // We have to insert a suitable initialization of the result value and
15647 // tie this to the dest of the image instruction.
15648
15649 // Calculate which dword we have to initialize to 0.
15650 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15651
15652 // check that dmask operand is found.
15653 assert(MO_Dmask && "Expected dmask operand in instruction");
15654
15655 unsigned dmask = MO_Dmask->getImm();
15656 // Determine the number of active lanes taking into account the
15657 // Gather4 special case
15658 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15659
15660 bool Packed = !Subtarget->hasUnpackedD16VMem();
15661
15662 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15663
15664 // Abandon attempt if the dst size isn't large enough
15665 // - this is in fact an error but this is picked up elsewhere and
15666 // reported correctly.
15667 uint32_t DstSize =
15668 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15669 if (DstSize < InitIdx)
15670 return;
15671 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15672 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15673 } else {
15674 return;
15675 }
15676
15677 const DebugLoc &DL = MI.getDebugLoc();
15678
15679 // Create a register for the initialization value.
15680 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15681 unsigned NewDst = 0; // Final initialized value will be in here
15682
15683 // If PRTStrictNull feature is enabled (the default) then initialize
15684 // all the result registers to 0, otherwise just the error indication
15685 // register (VGPRn+1)
15686 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15687 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15688
15689 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15690 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15691 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15692 // Initialize dword
15693 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15694 // clang-format off
15695 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15696 .addImm(0);
15697 // clang-format on
15698 // Insert into the super-reg
15699 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15700 .addReg(PrevDst)
15701 .addReg(SubReg)
15703
15704 PrevDst = NewDst;
15705 }
15706
15707 // Add as an implicit operand
15708 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15709
15710 // Tie the just added implicit operand to the dst
15711 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15712}
15713
15714/// Assign the register class depending on the number of
15715/// bits set in the writemask
15717 SDNode *Node) const {
15719
15720 MachineFunction *MF = MI.getParent()->getParent();
15723
15724 if (TII->isVOP3(MI.getOpcode())) {
15725 // Make sure constant bus requirements are respected.
15726 TII->legalizeOperandsVOP3(MRI, MI);
15727
15728 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15729 // This saves a chain-copy of registers and better balance register
15730 // use between vgpr and agpr as agpr tuples tend to be big.
15731 if (!MI.getDesc().operands().empty()) {
15732 unsigned Opc = MI.getOpcode();
15733 bool HasAGPRs = Info->mayNeedAGPRs();
15734 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15735 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15736 for (auto I :
15737 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15738 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15739 if (I == -1)
15740 break;
15741 if ((I == Src2Idx) && (HasAGPRs))
15742 break;
15743 MachineOperand &Op = MI.getOperand(I);
15744 if (!Op.isReg() || !Op.getReg().isVirtual())
15745 continue;
15746 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15747 if (!TRI->hasAGPRs(RC))
15748 continue;
15749 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15750 if (!Src || !Src->isCopy() ||
15751 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15752 continue;
15753 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15754 // All uses of agpr64 and agpr32 can also accept vgpr except for
15755 // v_accvgpr_read, but we do not produce agpr reads during selection,
15756 // so no use checks are needed.
15757 MRI.setRegClass(Op.getReg(), NewRC);
15758 }
15759
15760 if (TII->isMAI(MI)) {
15761 // The ordinary src0, src1, src2 were legalized above.
15762 //
15763 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15764 // as a separate instruction.
15765 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15766 AMDGPU::OpName::scale_src0);
15767 if (Src0Idx != -1) {
15768 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15769 AMDGPU::OpName::scale_src1);
15770 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15771 TII->usesConstantBus(MRI, MI, Src1Idx))
15772 TII->legalizeOpWithMove(MI, Src1Idx);
15773 }
15774 }
15775
15776 if (!HasAGPRs)
15777 return;
15778
15779 // Resolve the rest of AV operands to AGPRs.
15780 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15781 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15782 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15783 if (TRI->isVectorSuperClass(RC)) {
15784 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15785 MRI.setRegClass(Src2->getReg(), NewRC);
15786 if (Src2->isTied())
15787 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15788 }
15789 }
15790 }
15791 }
15792
15793 return;
15794 }
15795
15796 if (TII->isImage(MI))
15797 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15798}
15799
15801 uint64_t Val) {
15802 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15803 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15804}
15805
15807 const SDLoc &DL,
15808 SDValue Ptr) const {
15810
15811 // Build the half of the subregister with the constants before building the
15812 // full 128-bit register. If we are building multiple resource descriptors,
15813 // this will allow CSEing of the 2-component register.
15814 const SDValue Ops0[] = {
15815 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15816 buildSMovImm32(DAG, DL, 0),
15817 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15818 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15819 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15820
15821 SDValue SubRegHi = SDValue(
15822 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15823
15824 // Combine the constants and the pointer.
15825 const SDValue Ops1[] = {
15826 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15827 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15828 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15829
15830 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15831}
15832
15833/// Return a resource descriptor with the 'Add TID' bit enabled
15834/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15835/// of the resource descriptor) to create an offset, which is added to
15836/// the resource pointer.
15838 SDValue Ptr, uint32_t RsrcDword1,
15839 uint64_t RsrcDword2And3) const {
15840 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15841 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15842 if (RsrcDword1) {
15843 PtrHi =
15844 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15845 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15846 0);
15847 }
15848
15849 SDValue DataLo =
15850 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15851 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15852
15853 const SDValue Ops[] = {
15854 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15855 PtrLo,
15856 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15857 PtrHi,
15858 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15859 DataLo,
15860 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15861 DataHi,
15862 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15863
15864 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15865}
15866
15867//===----------------------------------------------------------------------===//
15868// SI Inline Assembly Support
15869//===----------------------------------------------------------------------===//
15870
std::pair<unsigned, const TargetRegisterClass *>
                                         StringRef Constraint,
                                         MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  // Single-letter constraints select a whole register class:
  //   's'/'r' -> scalar (SGPR), 'v' -> vector (VGPR), 'a' -> accumulator
  //   (AGPR, only if the subtarget has MAI instructions).
  if (Constraint.size() == 1) {
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (BitWidth) {
      case 16:
        // 16-bit values live in full 32-bit scalar registers.
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      default:
        RC = TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::pair(0U, RC);
  }

  // Explicit register constraints of the form {vN}, {sN}, {aN} or a range
  // {v[N:M]}, {s[N:M]}, {a[N:M]}.
  if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
    StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
    if (RegName.consume_front("v")) {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (RegName.consume_front("s")) {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (RegName.consume_front("a")) {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      if (RegName.consume_front("[")) {
        // Parse a register range "[Idx:End]".
        uint32_t End;
        bool Failed = RegName.consumeInteger(10, Idx);
        Failed |= !RegName.consume_front(":");
        Failed |= RegName.consumeInteger(10, End);
        Failed |= !RegName.consume_back("]");
        if (!Failed) {
          uint32_t Width = (End - Idx + 1) * 32;
          // Prohibit constraints for register ranges with a width that does not
          // match the required type.
          if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
            return std::pair(0U, nullptr);
          MCRegister Reg = RC->getRegister(Idx);
          // Widen the 32-bit base class to the class matching the range width.
            RC = TRI->getVGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isSGPRClass(RC))
            RC = TRI->getSGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isAGPRClass(RC))
            RC = TRI->getAGPRClassForBitWidth(Width);
          if (RC) {
            Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
            if (!Reg) {
              // The register class does not contain the requested register,
              // e.g., because it is an SGPR pair that would violate alignment
              // requirements.
              return std::pair(0U, nullptr);
            }
            return std::pair(Reg, RC);
          }
        }
      } else {
        // Check for lossy scalar/vector conversions.
        if (VT.isVector() && VT.getSizeInBits() != 32)
          return std::pair(0U, nullptr);
        bool Failed = RegName.getAsInteger(10, Idx);
        if (!Failed && Idx < RC->getNumRegs())
          return std::pair(RC->getRegister(Idx), RC);
      }
    }
  }

  // Fall back to the generic handling; fix up the class from the physreg.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}
15992
15993static bool isImmConstraint(StringRef Constraint) {
15994 if (Constraint.size() == 1) {
15995 switch (Constraint[0]) {
15996 default:
15997 break;
15998 case 'I':
15999 case 'J':
16000 case 'A':
16001 case 'B':
16002 case 'C':
16003 return true;
16004 }
16005 } else if (Constraint == "DA" || Constraint == "DB") {
16006 return true;
16007 }
16008 return false;
16009}
16010
  // Single-letter register-class constraints: 's' (SGPR), 'v' (VGPR),
  // 'a' (AGPR).
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  // Immediate-style constraints (I, J, A, B, C, DA, DB) are classified as
  // "other" and handled in LowerAsmOperandForConstraint.
  if (isImmConstraint(Constraint)) {
    return C_Other;
  }
  return TargetLowering::getConstraintType(Constraint);
}
16028
// Mask \p Val down to its low \p Size bits; bits above the operand's size are
// not meaningful for the immediate.
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
    Val = Val & maskTrailingOnes<uint64_t>(Size);
  }
  return Val;
}
16035
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  // For AMDGPU immediate constraints, extract the constant, validate it
  // against the constraint, and push it as an i64 target constant. If the
  // value is invalid nothing is appended, which makes the inline-asm match
  // fail later.
  if (isImmConstraint(Constraint)) {
    uint64_t Val;
    if (getAsmOperandConstVal(Op, Val) &&
        checkAsmConstraintVal(Op, Constraint, Val)) {
      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
    }
  } else {
    // Everything else goes through the generic handling.
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  }
}
16051
  // Try to extract a 64-bit constant bit pattern from an inline-asm operand.
  unsigned Size = Op.getScalarValueSizeInBits();
  // A single immediate can hold at most 64 bits.
  if (Size > 64)
    return false;

  // 16-bit immediates are only meaningful with 16-bit instruction support.
  if (Size == 16 && !Subtarget->has16BitInsts())
    return false;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    Val = C->getSExtValue();
    return true;
  }
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
    // Use the raw bit pattern of the FP constant.
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
    // Only 2-element vectors of 16-bit elements with no undef lanes are
    // handled, and only when they are a splat.
    if (Size != 16 || Op.getNumOperands() != 2)
      return false;
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      return false;
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}
16085
                                              uint64_t Val) const {
  // Validate \p Val against an immediate inline-asm constraint. Reaching the
  // end of the function means the constraint letter was not an immediate
  // constraint at all, which is a caller error.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'I':
    case 'J':
      // 'J': 16-bit signed immediate.
      return isInt<16>(Val);
    case 'A':
      // 'A': inline constant (see checkAsmConstraintValA).
      return checkAsmConstraintValA(Op, Val);
    case 'B':
      // 'B': 32-bit signed immediate.
      return isInt<32>(Val);
    case 'C':
      // 'C': fits in 32 bits once unused high bits are cleared, or ...
      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      // "DA": each 32-bit half must independently be a valid inline constant.
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, HiBits, 32) &&
             checkAsmConstraintValA(Op, LoBits, 32);
    }
    if (Constraint == "DB") {
      // "DB": any 64-bit value is acceptable.
      return true;
    }
  }
  llvm_unreachable("Invalid asm constraint");
}
16117
                                                unsigned MaxSize) const {
  // \return true if \p Val is representable as a hardware inline constant for
  // the operand's type, considering at most \p MaxSize bits.
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  if (Size == 16) {
    // 16-bit operands: the exact element type decides which inline-constant
    // encoding check applies (integer, fp16, bf16, or their 2-element packs).
    MVT VT = Op.getSimpleValueType();
    switch (VT.SimpleTy) {
    default:
      return false;
    case MVT::i16:
      return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
    case MVT::f16:
      return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
    case MVT::bf16:
      return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
    case MVT::v2i16:
      return AMDGPU::getInlineEncodingV2I16(Val).has_value();
    case MVT::v2f16:
      return AMDGPU::getInlineEncodingV2F16(Val).has_value();
    case MVT::v2bf16:
      return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
    }
  }
  // 32- and 64-bit operands use the generic inline-literal checks.
  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
    return true;
  return false;
}
16146
16147static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16148 switch (UnalignedClassID) {
16149 case AMDGPU::VReg_64RegClassID:
16150 return AMDGPU::VReg_64_Align2RegClassID;
16151 case AMDGPU::VReg_96RegClassID:
16152 return AMDGPU::VReg_96_Align2RegClassID;
16153 case AMDGPU::VReg_128RegClassID:
16154 return AMDGPU::VReg_128_Align2RegClassID;
16155 case AMDGPU::VReg_160RegClassID:
16156 return AMDGPU::VReg_160_Align2RegClassID;
16157 case AMDGPU::VReg_192RegClassID:
16158 return AMDGPU::VReg_192_Align2RegClassID;
16159 case AMDGPU::VReg_224RegClassID:
16160 return AMDGPU::VReg_224_Align2RegClassID;
16161 case AMDGPU::VReg_256RegClassID:
16162 return AMDGPU::VReg_256_Align2RegClassID;
16163 case AMDGPU::VReg_288RegClassID:
16164 return AMDGPU::VReg_288_Align2RegClassID;
16165 case AMDGPU::VReg_320RegClassID:
16166 return AMDGPU::VReg_320_Align2RegClassID;
16167 case AMDGPU::VReg_352RegClassID:
16168 return AMDGPU::VReg_352_Align2RegClassID;
16169 case AMDGPU::VReg_384RegClassID:
16170 return AMDGPU::VReg_384_Align2RegClassID;
16171 case AMDGPU::VReg_512RegClassID:
16172 return AMDGPU::VReg_512_Align2RegClassID;
16173 case AMDGPU::VReg_1024RegClassID:
16174 return AMDGPU::VReg_1024_Align2RegClassID;
16175 case AMDGPU::AReg_64RegClassID:
16176 return AMDGPU::AReg_64_Align2RegClassID;
16177 case AMDGPU::AReg_96RegClassID:
16178 return AMDGPU::AReg_96_Align2RegClassID;
16179 case AMDGPU::AReg_128RegClassID:
16180 return AMDGPU::AReg_128_Align2RegClassID;
16181 case AMDGPU::AReg_160RegClassID:
16182 return AMDGPU::AReg_160_Align2RegClassID;
16183 case AMDGPU::AReg_192RegClassID:
16184 return AMDGPU::AReg_192_Align2RegClassID;
16185 case AMDGPU::AReg_256RegClassID:
16186 return AMDGPU::AReg_256_Align2RegClassID;
16187 case AMDGPU::AReg_512RegClassID:
16188 return AMDGPU::AReg_512_Align2RegClassID;
16189 case AMDGPU::AReg_1024RegClassID:
16190 return AMDGPU::AReg_1024_Align2RegClassID;
16191 default:
16192 return -1;
16193 }
16194}
16195
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  // Wave32 only needs one 32-bit SGPR for the EXEC copy; wave64 needs an
  // aligned SGPR pair.
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    // Rewrite implicit operands for wave32 (see
    // SIInstrInfo::fixImplicitOperands).
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
    }
  }

}
16262
                                                    KnownBits &Known,
                                                    const APInt &DemandedElts,
                                                    const SelectionDAG &DAG,
                                                    unsigned Depth) const {
  // Compute known-zero/known-one bits for AMDGPU-specific SDNodes; anything
  // unhandled falls through to the AMDGPU common implementation at the end.
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      // Result is the lane count plus src1, so combine with src1's known bits.
      KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
      Known = KnownBits::add(Known, Known2);
      return;
    }
    }
    break;
  }
  }
      Op, Known, DemandedElts, DAG, Depth);
}
16293
    const int FI, KnownBits &Known, const MachineFunction &MF) const {

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
16303
                                   KnownBits &Known, unsigned Dim) {
  // A workitem ID in dimension \p Dim never exceeds the per-dimension maximum,
  // so every bit above that bound is known zero.
  unsigned MaxValue =
      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
  Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}
16310
    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  // GlobalISel counterpart of computeKnownBitsForTargetNode: derive known
  // bits for AMDGPU generic instructions and intrinsics defining \p R.
  const MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
                                 ? getSubtarget()->getWavefrontSizeLog2()
                                 : 5);
      // Combine the lane-count bound with the known bits of src1.
      KnownBits Known2;
      KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
                              Depth + 1);
      Known = KnownBits::add(Known, Known2);
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    // Zero-extended byte load: upper 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    // Zero-extended short load: upper 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    // med3 returns one of its three inputs, so a bit is known iff it is known
    // the same in all three sources.
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    KnownBits Known2;
    KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}
16385
                                                  unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
    // FIXME: Can this move to generic code? What about the case where the call
    // site specifies a lower alignment?
    Intrinsic::ID IID = GI->getIntrinsicID();
    // Use the alignment declared on the intrinsic's return value, if any.
    AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}
16401
  // 64 bytes is the instruction-cache line size referenced by the comments
  // below.
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Estimate the total loop size in bytes; bail out early once it exceeds
  // the 192-byte threshold since alignment no longer helps.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  // Bracket the loop with S_INST_PREFETCH mode switches: adjust before the
  // loop and restore after it, avoiding duplicates if already present.
  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
16475
16477static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16478 assert(N->getOpcode() == ISD::CopyFromReg);
16479 do {
16480 // Follow the chain until we find an INLINEASM node.
16481 N = N->getOperand(0).getNode();
16482 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16483 return true;
16484 } while (N->getOpcode() == ISD::CopyFromReg);
16485 return false;
16486}
16487
                                            UniformityInfo *UA) const {
  // Decide whether \p N produces a divergent (per-lane varying) value.
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    // A virtual register mapped back to an IR value uses the IR-level
    // uniformity analysis.
    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
      return UA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
  }
  case ISD::CALLSEQ_END:
    return true;
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
    return false;
  }
}
16548
                                            EVT VT) const {
  // Report per-type denormal support based on the scalar element type; f32 is
  // controlled independently of f64/f16 (cf. the LLT overload below). Other
  // types report false.
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
  case MVT::f16:
  default:
    return false;
  }
}
16561
                                            LLT Ty, const MachineFunction &MF) const {
  switch (Ty.getScalarSizeInBits()) {
  case 32:
    // f32 denormal support is controlled by the FP32 denormal mode.
    return !denormalModeIsFlushAllF32(MF);
  case 64:
  case 16:
    // f64 and f16 share a single denormal mode control.
    return !denormalModeIsFlushAllF64F16(MF);
  default:
    return false;
  }
}
16574
                                                   const SelectionDAG &DAG,
                                                   bool SNaN,
                                                   unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();

    // With DX10 clamp semantics a NaN input clamps to 0, so the result can
    // never be NaN regardless of the operand.
    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    // Otherwise CLAMP is NaN-free only if its input is.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

      Depth);
}
16591
// On older subtargets, global FP atomic instructions have a hardcoded FP mode
// and do not support FP32 denormals, and only support v2f16/f64 denormals.
  // Explicit opt-out: the user declared denormal behavior does not matter for
  // this atomic.
  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
    return true;

  // If the function already flushes denormals (preserve-sign), the hardcoded
  // flush-to-zero behavior of the atomic is indistinguishable.
  auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
  if (DenormMode == DenormalMode::getPreserveSign())
    return true;

  // TODO: Remove this.
  return RMW->getFunction()
      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
      .getValueAsBool();
}
16608
  // Build an optimization remark stating that a hardware instruction was
  // generated for \p RMW, including the memory scope it applies to.
  LLVMContext &Ctx = RMW->getContext();
  StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
  // An empty sync-scope name denotes the default "system" scope.
  StringRef MemScope = SS.empty() ? StringRef("system") : SS;

  return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
         << "Hardware instruction generated for atomic "
         << RMW->getOperationName(RMW->getOperation())
         << " operation at memory scope " << MemScope;
}
16619
16620static bool isV2F16OrV2BF16(Type *Ty) {
16621 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16622 Type *EltTy = VT->getElementType();
16623 return VT->getNumElements() == 2 &&
16624 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16625 }
16626
16627 return false;
16628}
16629
16630static bool isV2F16(Type *Ty) {
16631 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16632 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16633}
16634
16635static bool isV2BF16(Type *Ty) {
16636 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16637 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16638}
16639
16640/// \return true if atomicrmw integer ops work for the type.
16641static bool isAtomicRMWLegalIntTy(Type *Ty) {
16642 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16643 unsigned BW = IT->getBitWidth();
16644 return BW == 32 || BW == 64;
16645 }
16646
16647 return false;
16648}
16649
16650/// \return true if this atomicrmw xchg type can be selected.
16651static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16652 Type *Ty = RMW->getType();
16653 if (isAtomicRMWLegalIntTy(Ty))
16654 return true;
16655
16656 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16657 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16658 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16659 return BW == 32 || BW == 64;
16660 }
16661
16662 if (Ty->isFloatTy() || Ty->isDoubleTy())
16663 return true;
16664
16665 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16666 return VT->getNumElements() == 2 &&
16667 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16668 }
16669
16670 return false;
16671}
16672
/// \returns true if it's valid to emit a native instruction for \p RMW, based
/// on the properties of the target memory.
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
                                        const AtomicRMWInst *RMW,
                                        bool HasSystemScope) {
  // The remote/fine-grained access logic is different from the integer
  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
  // fine-grained access does not work, even for a device local allocation.
  //
  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
  // allocations work.
  if (HasSystemScope) {
        RMW->hasMetadata("amdgpu.no.remote.memory"))
      return true;
    return true;

  // Otherwise require the user guarantee that no fine-grained memory is
  // accessed.
  return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
}
16693
/// \return Action to perform on AtomicRMWInsts for integer operations.
  // Legal 32/64-bit integer types can be handled natively; see
  // isAtomicRMWLegalIntTy.
  return isAtomicRMWLegalIntTy(RMW->getType())
}
16701
/// Return if a flat address space atomicrmw can access private memory.
  // With no !noalias.addrspace metadata we must conservatively assume the
  // flat access may touch private memory.
  const MDNode *NoaliasAddrSpaceMD =
      I->getMetadata(LLVMContext::MD_noalias_addrspace);
  if (!NoaliasAddrSpaceMD)
    return true;

  // The metadata encodes pairs of [Low, High) address-space ranges the access
  // is known NOT to be in. NOTE(review): this scan appears to assume the
  // ranges are sorted by Low — verify against the !noalias.addrspace spec.
  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
       ++I) {
    auto *Low = mdconst::extract<ConstantInt>(
        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
      auto *High = mdconst::extract<ConstantInt>(
          NoaliasAddrSpaceMD->getOperand(2 * I + 1));
      return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
    }
  }

  return true;
}
16722
16725 unsigned AS = RMW->getPointerAddressSpace();
16726 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16728
16729 // 64-bit flat atomics that dynamically reside in private memory will silently
16730 // be dropped.
16731 //
16732 // Note that we will emit a new copy of the original atomic in the expansion,
16733 // which will be incrementally relegalized.
16734 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16735 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16736 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16739
16740 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16742 ORE.emit([=]() {
16743 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16744 });
16745 return Kind;
16746 };
16747
16748 auto SSID = RMW->getSyncScopeID();
16749 bool HasSystemScope =
16750 SSID == SyncScope::System ||
16751 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16752
16753 auto Op = RMW->getOperation();
16754 switch (Op) {
16755 case AtomicRMWInst::Xchg: {
16756 // PCIe supports add and xchg for system atomics.
16757 return isAtomicRMWLegalXChgTy(RMW)
16760 }
16761 case AtomicRMWInst::Add:
16762 case AtomicRMWInst::And:
16766 case AtomicRMWInst::Sub:
16767 case AtomicRMWInst::Or:
16768 case AtomicRMWInst::Xor: {
16769 // Atomic sub/or/xor do not work over PCI express, but atomic add
16770 // does. InstCombine transforms these with 0 to or, so undo that.
16771 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16772 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16773 ConstVal && ConstVal->isNullValue())
16775 }
16776
16778 }
16779 case AtomicRMWInst::FAdd: {
16780 Type *Ty = RMW->getType();
16781
16782 // TODO: Handle REGION_ADDRESS
16783 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16784 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16785 // is fixed to round-to-nearest-even.
16786 //
16787 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16788 // round-to-nearest-even.
16789 //
16790 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16791 // suggests it is OK if the floating-point mode may not match the calling
16792 // thread.
16793 if (Ty->isFloatTy()) {
16796 }
16797
16798 if (Ty->isDoubleTy()) {
16799 // Ignores denormal mode, but we don't consider flushing mandatory.
16802 }
16803
16804 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16806
16808 }
16809
16810 // LDS atomics respect the denormal mode from the mode register.
16811 //
16812 // Traditionally f32 global/buffer memory atomics would unconditionally
16813 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16814 // flush.
16815 //
16816 // On targets with flat atomic fadd, denormals would flush depending on
16817 // whether the target address resides in LDS or global memory. We consider
16818 // this flat-maybe-flush as will-flush.
16819 if (Ty->isFloatTy() &&
16823
16824 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16825 // safe. The message phrasing also should be better.
16826 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16827 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16828 // gfx940, gfx12
16829 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16830 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16831 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16832 // gfx90a, gfx940, gfx12
16833 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16834 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16835
16836 // gfx940, gfx12
16837 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16838 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16839 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16840 // gfx90a, gfx940, gfx12
16841 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16842 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16843
16844 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16845 // buffer. gfx12 does have the buffer version.
16846 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16847 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16848 }
16849
16850 // global and flat atomic fadd f64: gfx90a, gfx940.
16851 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16852 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16853
16854 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16855 if (Ty->isFloatTy()) {
16856 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16857 // gfx11+.
16858 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16859 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16860 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16861 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16862 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16863 } else {
16864 // gfx908
16865 if (RMW->use_empty() &&
16867 isV2F16(Ty))
16868 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16869 }
16870 }
16871
16872 // flat atomic fadd f32: gfx940, gfx11+.
16873 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16874 if (Subtarget->hasFlatAtomicFaddF32Inst())
16875 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16876
16877 // If it is in flat address space, and the type is float, we will try to
16878 // expand it, if the target supports global and lds atomic fadd. The
16879 // reason we need that is, in the expansion, we emit the check of
16880 // address space. If it is in global address space, we emit the global
16881 // atomic fadd; if it is in shared address space, we emit the LDS atomic
16882 // fadd.
16883 if (Subtarget->hasLDSFPAtomicAddF32()) {
16884 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16886 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16888 }
16889 }
16890 }
16891
16893 }
16895 case AtomicRMWInst::FMax: {
16896 Type *Ty = RMW->getType();
16897
16898 // LDS float and double fmin/fmax were always supported.
16899 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16900 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16902 }
16903
16904 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16905 // For flat and global cases:
16906 // float, double in gfx7. Manual claims denormal support.
16907 // Removed in gfx8.
16908 // float, double restored in gfx10.
16909 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16910 //
16911 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16912 // no f32.
16913 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16914 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16915 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16916 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16917 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16918 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16920 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16921 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16922 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16923 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16924 }
16925 }
16926
16928 }
16929 case AtomicRMWInst::Min:
16930 case AtomicRMWInst::Max:
16932 case AtomicRMWInst::UMax: {
16935 // Always expand system scope min/max atomics.
16936 if (HasSystemScope)
16938 }
16939
16941 }
16944 default:
16946 }
16947
16948 llvm_unreachable("covered atomicrmw op switch");
16949}
16950
16956}
16957
16960 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16963}
16964
16967 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16968 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16970
16971 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16973
16974 const DataLayout &DL = CmpX->getDataLayout();
16975
16976 Type *ValTy = CmpX->getNewValOperand()->getType();
16977
16978 // If a 64-bit flat atomic may alias private, we need to avoid using the
16979 // atomic in the private case.
16980 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16982}
16983
// Map VT to a register class appropriate for its divergence: uniform
// (non-divergent) values are steered to scalar (SGPR) classes, divergent
// values to vector (VGPR) classes.
// NOTE(review): the line declaring 'RC' (doxygen line 16986) was dropped by
// the extraction; the checks below operate on that class.
16984const TargetRegisterClass *
16985SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16987 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
 // VReg_1 models a wave-wide i1 mask; a uniform mask is held in a scalar
 // register sized to the wavefront (64-bit for wave64, 32-bit otherwise).
16988 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16989 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16990 : &AMDGPU::SReg_32RegClass;
 // Uniform value currently mapped to a vector class -> equivalent SGPR class.
16991 if (!TRI->isSGPRClass(RC) && !isDivergent)
16992 return TRI->getEquivalentSGPRClass(RC);
 // Divergent value mapped to a scalar class -> equivalent VGPR class.
16993 if (TRI->isSGPRClass(RC) && isDivergent)
16994 return TRI->getEquivalentVGPRClass(RC);
16995
16996 return RC;
16997}
16998
16999// FIXME: This is a workaround for DivergenceAnalysis not understanding always
17000// uniform values (as produced by the mask results of control flow intrinsics)
17001// used outside of divergent blocks. The phi users need to also be treated as
17002// always uniform.
17003//
17004// FIXME: DA (DivergenceAnalysis) is no longer in use. Does this still
// apply to UniformityAnalysis?
17005static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17006 unsigned WaveSize) {
17007 // FIXME: We assume we never cast the mask results of a control flow
17008 // intrinsic.
17009 // Early exit if the type won't be consistent as a compile time hack.
17010 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
17011 if (!IT || IT->getBitWidth() != WaveSize)
17012 return false;
17013
17014 if (!isa<Instruction>(V))
17015 return false;
17016 if (!Visited.insert(V).second)
17017 return false;
17018 bool Result = false;
17019 for (const auto *U : V->users()) {
17020 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
17021 if (V == U->getOperand(1)) {
17022 switch (Intrinsic->getIntrinsicID()) {
17023 default:
17024 Result = false;
17025 break;
17026 case Intrinsic::amdgcn_if_break:
17027 case Intrinsic::amdgcn_if:
17028 case Intrinsic::amdgcn_else:
17029 Result = true;
17030 break;
17031 }
17032 }
17033 if (V == U->getOperand(0)) {
17034 switch (Intrinsic->getIntrinsicID()) {
17035 default:
17036 Result = false;
17037 break;
17038 case Intrinsic::amdgcn_end_cf:
17039 case Intrinsic::amdgcn_loop:
17040 Result = true;
17041 break;
17042 }
17043 }
17044 } else {
17045 Result = hasCFUser(U, Visited, WaveSize);
17046 }
17047 if (Result)
17048 break;
17049 }
17050 return Result;
17051}
17052
// Tail of a SITargetLowering method (presumably requiresUniformRegister —
// confirm against upstream; the opening signature line, doxygen 17053, and
// statement fragments at doxygen lines 17063, 17067 and 17078 were dropped
// by the extraction). Decides whether V must live in a uniform (SGPR)
// register.
17054 const Value *V) const {
17055 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
17056 if (CI->isInlineAsm()) {
17057 // FIXME: This cannot give a correct answer. This should only trigger in
17058 // the case where inline asm returns mixed SGPR and VGPR results, used
17059 // outside the defining block. We don't have a specific result to
17060 // consider, so this assumes if any value is SGPR, the overall register
17061 // also needs to be SGPR.
 // Inline asm with any SGPR-constrained output forces a uniform register.
17062 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17064 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
17065 for (auto &TC : TargetConstraints) {
17066 if (TC.Type == InlineAsm::isOutput) {
17068 const TargetRegisterClass *RC =
17069 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
17070 TC.ConstraintVT)
17071 .second;
17072 if (RC && SIRI->isSGPRClass(RC))
17073 return true;
17074 }
17075 }
17076 }
17077 }
 // Otherwise, V needs a uniform register if it feeds wave-mask control-flow
 // intrinsics (see hasCFUser).
17079 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
17080}
17081
17083 for (SDUse &Use : N->uses()) {
17084 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
17085 if (getBasePtrIndex(M) == Use.getOperandNo())
17086 return true;
17087 }
17088 }
17089 return false;
17090}
17091
17093 SDValue N1) const {
17094 if (!N0.hasOneUse())
17095 return false;
17096 // Take care of the opportunity to keep N0 uniform
17097 if (N0->isDivergent() || !N1->isDivergent())
17098 return true;
17099 // Check if we have a good chance to form the memory access pattern with the
17100 // base and offset
17101 return (DAG.isBaseWithConstantOffset(N0) &&
17103}
17104
17106 Register N0, Register N1) const {
17107 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17108}
17109
17112 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17114 if (I.getMetadata("amdgpu.noclobber"))
17115 Flags |= MONoClobber;
17116 if (I.getMetadata("amdgpu.last.use"))
17117 Flags |= MOLastUse;
17118 return Flags;
17119}
17120
// Parameter tail of a SITargetLowering method (presumably
// checkForPhysRegDependency — confirm against upstream; the opening line,
// doxygen 17121, was dropped by the extraction). Reports via PhysReg/Cost
// when copying Def's i1 result to User would go through the SCC physical
// register, returning true in that case.
17122 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17123 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
 // Only a CopyToReg user of a machine-opcode node is of interest.
17124 if (User->getOpcode() != ISD::CopyToReg)
17125 return false;
17126 if (!Def->isMachineOpcode())
17127 return false;
17128 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17129 if (!MDef)
17130 return false;
17131
 // The value being copied must be the i1 result of Def.
17132 unsigned ResNo = User->getOperand(Op).getResNo();
17133 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17134 return false;
 // A compare machine instruction implicitly defining SCC: report SCC as the
 // physical register involved, with the copy cost of its minimal class.
17135 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17136 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17137 PhysReg = AMDGPU::SCC;
17138 const TargetRegisterClass *RC =
17139 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17140 Cost = RC->getCopyCost();
17141 return true;
17142 }
17143 return false;
17144}
17145
17146/// Check if it is profitable to hoist instruction in then/else to if.
// Returning true means hoisting is fine; false keeps the instruction in its
// branch so later passes can fuse it with its user.
// NOTE(review): the function's signature line (doxygen 17147) and the line
// declaring 'Options' (doxygen 17160) were dropped by the extraction.
17148 if (!I->hasOneUse())
17149 return true;
17150
 // Single user: we can reason about fusion with that user.
17151 Instruction *User = I->user_back();
17152 // TODO: Add more patterns that are not profitable to hoist and
17153 // handle modifiers such as fabs and fneg
17154 switch (I->getOpcode()) {
17155 case Instruction::FMul: {
17156 if (User->getOpcode() != Instruction::FSub &&
17157 User->getOpcode() != Instruction::FAdd)
17158 return true;
17159
17161
 // An fmul feeding an fadd/fsub may be contracted into FMA/FMAD. Hoisting
 // is still profitable when contraction cannot happen (no allow-contract
 // flags and no global fusion/unsafe-math) or when FMA would not beat a
 // separate fmul+fadd on this subtarget.
17162 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17163 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17164 !Options.UnsafeFPMath) ||
17165 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17166 }
17167 default:
17168 return true;
17169 }
17170 return true;
17171}
17172
17174 Instruction *AI) const {
17175 // Given: atomicrmw fadd ptr %addr, float %val ordering
17176 //
17177 // With this expansion we produce the following code:
17178 // [...]
17179 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17180 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17181 //
17182 // atomicrmw.shared:
17183 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17184 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17185 // float %val ordering
17186 // br label %atomicrmw.phi
17187 //
17188 // atomicrmw.check.private:
17189 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17190 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17191 //
17192 // atomicrmw.private:
17193 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17194 // %loaded.private = load float, ptr addrspace(5) %cast.private
17195 // %val.new = fadd float %loaded.private, %val
17196 // store float %val.new, ptr addrspace(5) %cast.private
17197 // br label %atomicrmw.phi
17198 //
17199 // atomicrmw.global:
17200 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17201 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17202 // float %val ordering
17203 // br label %atomicrmw.phi
17204 //
17205 // atomicrmw.phi:
17206 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17207 // [ %loaded.private, %atomicrmw.private ],
17208 // [ %loaded.global, %atomicrmw.global ]
17209 // br label %atomicrmw.end
17210 //
17211 // atomicrmw.end:
17212 // [...]
17213 //
17214 //
17215 // For 64-bit atomics which may reside in private memory, we perform a simpler
17216 // version that only inserts the private check, and uses the flat operation.
17217
17218 IRBuilder<> Builder(AI);
17219 LLVMContext &Ctx = Builder.getContext();
17220
17221 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17222 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17224 Value *Addr = AI->getOperand(PtrOpIdx);
17225
17226 /// TODO: Only need to check private, then emit flat-known-not private (no
17227 /// need for shared block, or cast to global).
17228 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17229
17230 Align Alignment;
17231 if (RMW)
17232 Alignment = RMW->getAlign();
17233 else if (CX)
17234 Alignment = CX->getAlign();
17235 else
17236 llvm_unreachable("unhandled atomic operation");
17237
17238 // FullFlatEmulation is true if we need to issue the private, shared, and
17239 // global cases.
17240 //
17241 // If this is false, we are only dealing with the flat-targeting-private case,
17242 // where we only insert a check for private and still use the flat instruction
17243 // for global and shared.
17244
17245 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17246 Subtarget->hasAtomicFaddInsts() &&
17247 RMW->getType()->isFloatTy();
17248
17249 // If the return value isn't used, do not introduce a false use in the phi.
17250 bool ReturnValueIsUsed = !AI->use_empty();
17251
17252 BasicBlock *BB = Builder.GetInsertBlock();
17253 Function *F = BB->getParent();
17254 BasicBlock *ExitBB =
17255 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17256 BasicBlock *SharedBB = nullptr;
17257
17258 BasicBlock *CheckPrivateBB = BB;
17259 if (FullFlatEmulation) {
17260 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17261 CheckPrivateBB =
17262 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17263 }
17264
17265 BasicBlock *PrivateBB =
17266 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17267 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17268 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17269
17270 std::prev(BB->end())->eraseFromParent();
17271 Builder.SetInsertPoint(BB);
17272
17273 Value *LoadedShared = nullptr;
17274 if (FullFlatEmulation) {
17275 CallInst *IsShared = Builder.CreateIntrinsic(
17276 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17277 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17278 Builder.SetInsertPoint(SharedBB);
17279 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17281
17282 Instruction *Clone = AI->clone();
17283 Clone->insertInto(SharedBB, SharedBB->end());
17284 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17285 LoadedShared = Clone;
17286
17287 Builder.CreateBr(PhiBB);
17288 Builder.SetInsertPoint(CheckPrivateBB);
17289 }
17290
17291 CallInst *IsPrivate = Builder.CreateIntrinsic(
17292 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17293 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17294
17295 Builder.SetInsertPoint(PrivateBB);
17296
17297 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17299
17300 Value *LoadedPrivate;
17301 if (RMW) {
17302 LoadedPrivate = Builder.CreateAlignedLoad(
17303 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17304
17305 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17306 LoadedPrivate, RMW->getValOperand());
17307
17308 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17309 } else {
17310 auto [ResultLoad, Equal] =
17311 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17312 CX->getNewValOperand(), CX->getAlign());
17313
17314 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17315 ResultLoad, 0);
17316 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17317 }
17318
17319 Builder.CreateBr(PhiBB);
17320
17321 Builder.SetInsertPoint(GlobalBB);
17322
17323 // Continue using a flat instruction if we only emitted the check for private.
17324 Instruction *LoadedGlobal = AI;
17325 if (FullFlatEmulation) {
17326 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17328 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17329 }
17330
17331 AI->removeFromParent();
17332 AI->insertInto(GlobalBB, GlobalBB->end());
17333
17334 // The new atomicrmw may go through another round of legalization later.
17335 if (!FullFlatEmulation) {
17336 // We inserted the runtime check already, make sure we do not try to
17337 // re-expand this.
17338 // TODO: Should union with any existing metadata.
17339 MDBuilder MDB(F->getContext());
17340 MDNode *RangeNotPrivate =
17343 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17344 RangeNotPrivate);
17345 }
17346
17347 Builder.CreateBr(PhiBB);
17348
17349 Builder.SetInsertPoint(PhiBB);
17350
17351 if (ReturnValueIsUsed) {
17352 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17353 AI->replaceAllUsesWith(Loaded);
17354 if (FullFlatEmulation)
17355 Loaded->addIncoming(LoadedShared, SharedBB);
17356 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17357 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17358 Loaded->takeName(AI);
17359 }
17360
17361 Builder.CreateBr(ExitBB);
17362}
17363
17366
17369 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17370 ConstVal && ConstVal->isNullValue()) {
17371 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17373
17374 // We may still need the private-alias-flat handling below.
17375
17376 // TODO: Skip this for cases where we cannot access remote memory.
17377 }
17378 }
17379
17380 // The non-flat expansions should only perform the de-canonicalization of
17381 // identity values.
17383 return;
17384
17386}
17387
17390}
17391
// Rewrite the atomicrmw AI into an atomic load with the same ordering,
// alignment, and sync scope, replace all of AI's uses, and erase AI.
// Returns the new load, or nullptr when no rewrite was performed.
// Presumably only called for idempotent RMWs where the store is a no-op —
// confirm with the caller.
// NOTE(review): the qualified function-name line (doxygen 17393) was dropped
// by the extraction.
17392LoadInst *
17394 IRBuilder<> Builder(AI);
17395 auto Order = AI->getOrdering();
17396
17397 // The optimization removes the store aspect of the atomicrmw. Therefore,
17398 // the cache must be flushed if the atomic ordering had release semantics.
17399 // This is not necessarily a fence; a release fence just happens to perform
17400 // that flush. Avoid replacing an atomicrmw that has release semantics.
17401 if (isReleaseOrStronger(Order))
17402 return nullptr;
17403
 // Build a load mirroring the RMW's type, address, alignment and atomicity.
17404 LoadInst *LI = Builder.CreateAlignedLoad(
17405 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17406 LI->setAtomic(Order, AI->getSyncScopeID());
 // Carry over metadata and the name, then fold AI away entirely.
17407 LI->copyMetadata(*AI);
17408 LI->takeName(AI);
17409 AI->replaceAllUsesWith(LI);
17410 AI->eraseFromParent();
17411 return LI;
17412}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
static bool isUndef(const MachineInstr &MI)
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1122
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
bool isNegative() const
Definition: APFloat.h:1445
APInt bitcastToAPInt() const
Definition: APFloat.h:1351
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1100
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
bool isInfinity() const
Definition: APFloat.h:1442
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:382
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:474
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:213
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:599
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1451
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:572
unsigned getNumElements() const
Definition: DerivedTypes.h:615
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:898
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:892
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
bool hasPkMovB32() const
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:512
unsigned getAddressSpace() const
Definition: GlobalValue.h:206
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
Type * getValueType() const
Definition: GlobalValue.h:297
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
LLVMContext & getContext() const
Definition: IRBuilder.h:195
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:404
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
Definition: MachineInstr.h:71
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with an fmul to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns true if it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:983
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:713
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:81
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
void set(Value *Val)
Definition: Value.h:892
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:64
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1201
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1355
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1077
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1348
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1350
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1320
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1351
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1110
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1333
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1346
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1347
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1502
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1353
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1484
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1267
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1126
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1300
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1156
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1349
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1316
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1356
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1095
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1072
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1344
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1290
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1327
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1352
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1120
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1176
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1358
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1342
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1343
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1261
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1287
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1341
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1173
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1149
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1357
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1651
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1618
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1598
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:771
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1613
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:245
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:60
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:556
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:286
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:236
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals