// LLVM 21.0.0git
// AMDGPUISelLowering.cpp
// (Scraped from the doxygen "documentation of this file" page.)
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
53}
54
56 // In order for this to be a signed 24-bit value, bit 23, must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather then generating calls to memset, mempcy or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
373
374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
378 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
380 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
381 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
382
383 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
384 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
385
387
388 // For R600, this is totally unsupported, just custom lower to produce an
389 // error.
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
396 MVT::f32, Legal);
397
399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
401 {MVT::f16, MVT::f32, MVT::f64}, Expand);
402
405 Custom);
406
407 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
408
409 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
410
411 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
412 Expand);
413
414 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
415
416 if (Subtarget->has16BitInsts())
417 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
418 else {
419 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
421 }
422
424 Custom);
425
426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
428 // default unless marked custom/legal.
430 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
431 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
432 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
433 MVT::v16f64},
434 Custom);
435
436 if (isTypeLegal(MVT::f16))
438 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
439 Custom);
440
441 // Expand to fneg + fadd.
443
445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
446 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
454 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
455 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
456 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
457 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
458 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
459 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
460 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
461 Custom);
462
464 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
465
466 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
467 for (MVT VT : ScalarIntVTs) {
468 // These should use [SU]DIVREM, so set them to expand
470 Expand);
471
472 // GPU does not have divrem function for signed or unsigned.
474
475 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
477
479
480 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
482 }
483
484 // The hardware supports 32-bit FSHR, but not FSHL.
486
487 // The hardware supports 32-bit ROTR, but not ROTL.
488 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
490
492
496 MVT::i64, Custom);
498
500 Legal);
501
504 MVT::i64, Custom);
505
506 for (auto VT : {MVT::i8, MVT::i16})
508
509 static const MVT::SimpleValueType VectorIntTypes[] = {
510 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
511 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
512
513 for (MVT VT : VectorIntTypes) {
514 // Expand the following operations for the current type by default.
527 VT, Expand);
528 }
529
530 static const MVT::SimpleValueType FloatVectorTypes[] = {
531 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
532 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
533
534 for (MVT VT : FloatVectorTypes) {
547 VT, Expand);
548 }
549
550 // This causes using an unrolled select operation rather than expansion with
551 // bit operations. This is in general better, but the alternative using BFI
552 // instructions may be better if the select sources are SGPRs.
554 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
555
557 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
558
560 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
561
563 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
564
566 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
567
569 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
570
572 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
582
584 setJumpIsExpensive(true);
585
586 // FIXME: This is only partially true. If we have to do vector compares, any
587 // SGPR pair can be a condition register. If we have a uniform condition, we
588 // are better off doing SALU operations, where there is only one SCC. For now,
589 // we don't have a way of knowing during instruction selection if a condition
590 // will be uniform and we always use vector compares. Assume we are using
591 // vector compares until that is fixed.
593
596
598
599 // We want to find all load dependencies for long chains of stores to enable
600 // merging into very wide vectors. The problem is with vectors with > 4
601 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
602 // vectors are a legal type, even though we have to split the loads
603 // usually. When we can more precisely specify load legality per address
604 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
605 // smarter so that they can figure out what to do in 2 iterations without all
606 // N > 4 stores on the same chain.
608
609 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
610 // about these during lowering.
611 MaxStoresPerMemcpy = 0xffffffff;
612 MaxStoresPerMemmove = 0xffffffff;
613 MaxStoresPerMemset = 0xffffffff;
614
615 // The expansion for 64-bit division is enormous.
617 addBypassSlowDiv(64, 32);
618
629
633}
634
636 if (getTargetMachine().Options.NoSignedZerosFPMath)
637 return true;
638
639 const auto Flags = Op.getNode()->getFlags();
640 if (Flags.hasNoSignedZeros())
641 return true;
642
643 return false;
644}
645
646//===----------------------------------------------------------------------===//
647// Target Information
648//===----------------------------------------------------------------------===//
649
651static bool fnegFoldsIntoOpcode(unsigned Opc) {
652 switch (Opc) {
653 case ISD::FADD:
654 case ISD::FSUB:
655 case ISD::FMUL:
656 case ISD::FMA:
657 case ISD::FMAD:
658 case ISD::FMINNUM:
659 case ISD::FMAXNUM:
662 case ISD::FMINIMUM:
663 case ISD::FMAXIMUM:
664 case ISD::SELECT:
665 case ISD::FSIN:
666 case ISD::FTRUNC:
667 case ISD::FRINT:
668 case ISD::FNEARBYINT:
669 case ISD::FROUNDEVEN:
671 case AMDGPUISD::RCP:
678 case AMDGPUISD::FMED3:
679 // TODO: handle llvm.amdgcn.fma.legacy
680 return true;
681 case ISD::BITCAST:
682 llvm_unreachable("bitcast is special cased");
683 default:
684 return false;
685 }
686}
687
688static bool fnegFoldsIntoOp(const SDNode *N) {
689 unsigned Opc = N->getOpcode();
690 if (Opc == ISD::BITCAST) {
691 // TODO: Is there a benefit to checking the conditions performFNegCombine
692 // does? We don't for the other cases.
693 SDValue BCSrc = N->getOperand(0);
694 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
695 return BCSrc.getNumOperands() == 2 &&
696 BCSrc.getOperand(1).getValueSizeInBits() == 32;
697 }
698
699 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
700 }
701
702 return fnegFoldsIntoOpcode(Opc);
703}
704
705/// \p returns true if the operation will definitely need to use a 64-bit
706/// encoding, and thus will use a VOP3 encoding regardless of the source
707/// modifiers.
709static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
710 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
711 VT == MVT::f64;
712}
713
714/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
715/// type for ISD::SELECT.
717static bool selectSupportsSourceMods(const SDNode *N) {
718 // TODO: Only applies if select will be vector
719 return N->getValueType(0) == MVT::f32;
720}
721
722// Most FP instructions support source modifiers, but this could be refined
723// slightly.
725static bool hasSourceMods(const SDNode *N) {
726 if (isa<MemSDNode>(N))
727 return false;
728
729 switch (N->getOpcode()) {
730 case ISD::CopyToReg:
731 case ISD::FDIV:
732 case ISD::FREM:
733 case ISD::INLINEASM:
737
738 // TODO: Should really be looking at the users of the bitcast. These are
739 // problematic because bitcasts are used to legalize all stores to integer
740 // types.
741 case ISD::BITCAST:
742 return false;
744 switch (N->getConstantOperandVal(0)) {
745 case Intrinsic::amdgcn_interp_p1:
746 case Intrinsic::amdgcn_interp_p2:
747 case Intrinsic::amdgcn_interp_mov:
748 case Intrinsic::amdgcn_interp_p1_f16:
749 case Intrinsic::amdgcn_interp_p2_f16:
750 return false;
751 default:
752 return true;
753 }
754 }
755 case ISD::SELECT:
757 default:
758 return true;
759 }
760}
761
763 unsigned CostThreshold) {
764 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
765 // it is truly free to use a source modifier in all cases. If there are
766 // multiple users but for each one will necessitate using VOP3, there will be
767 // a code size increase. Try to avoid increasing code size unless we know it
768 // will save on the instruction count.
769 unsigned NumMayIncreaseSize = 0;
770 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
771
772 assert(!N->use_empty());
773
774 // XXX - Should this limit number of uses to check?
775 for (const SDNode *U : N->users()) {
776 if (!hasSourceMods(U))
777 return false;
778
779 if (!opMustUseVOP3Encoding(U, VT)) {
780 if (++NumMayIncreaseSize > CostThreshold)
781 return false;
782 }
783 }
784
785 return true;
786}
787
789 ISD::NodeType ExtendKind) const {
790 assert(!VT.isVector() && "only scalar expected");
791
792 // Round to the next multiple of 32-bits.
793 unsigned Size = VT.getSizeInBits();
794 if (Size <= 32)
795 return MVT::i32;
796 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
797}
798
800 return MVT::i32;
801}
802
804 return true;
805}
806
807// The backend supports 32 and 64 bit floating point immediates.
808// FIXME: Why are we reporting vectors of FP immediates as legal?
810 bool ForCodeSize) const {
811 EVT ScalarVT = VT.getScalarType();
812 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
813 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
814}
815
816// We don't want to shrink f64 / f32 constants.
818 EVT ScalarVT = VT.getScalarType();
819 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
820}
821
823 ISD::LoadExtType ExtTy,
824 EVT NewVT) const {
825 // TODO: This may be worth removing. Check regression tests for diffs.
827 return false;
828
829 unsigned NewSize = NewVT.getStoreSizeInBits();
830
831 // If we are reducing to a 32-bit load or a smaller multi-dword load,
832 // this is always better.
833 if (NewSize >= 32)
834 return true;
835
836 EVT OldVT = N->getValueType(0);
837 unsigned OldSize = OldVT.getStoreSizeInBits();
838
839 MemSDNode *MN = cast<MemSDNode>(N);
840 unsigned AS = MN->getAddressSpace();
841 // Do not shrink an aligned scalar load to sub-dword.
842 // Scalar engine cannot do sub-dword loads.
843 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
844 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
847 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
848 MN->isInvariant())) &&
850 return false;
851
852 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
853 // extloads, so doing one requires using a buffer_load. In cases where we
854 // still couldn't use a scalar load, using the wider load shouldn't really
855 // hurt anything.
856
857 // If the old size already had to be an extload, there's no harm in continuing
858 // to reduce the width.
859 return (OldSize < 32);
860}
861
863 const SelectionDAG &DAG,
864 const MachineMemOperand &MMO) const {
865
866 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
867
868 if (LoadTy.getScalarType() == MVT::i32)
869 return false;
870
871 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
872 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
873
874 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
875 return false;
876
877 unsigned Fast = 0;
879 CastTy, MMO, &Fast) &&
880 Fast;
881}
882
883// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
884// profitable with the expansion for 64-bit since it's generally good to
885// speculate things.
887 return true;
888}
889
891 return true;
892}
893
895 switch (N->getOpcode()) {
896 case ISD::EntryToken:
897 case ISD::TokenFactor:
898 return true;
900 unsigned IntrID = N->getConstantOperandVal(0);
902 }
904 unsigned IntrID = N->getConstantOperandVal(1);
906 }
907 case ISD::LOAD:
908 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
910 return true;
911 return false;
912 case AMDGPUISD::SETCC: // ballot-style instruction
913 return true;
914 }
915 return false;
916}
917
919 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
920 NegatibleCost &Cost, unsigned Depth) const {
921
922 switch (Op.getOpcode()) {
923 case ISD::FMA:
924 case ISD::FMAD: {
925 // Negating a fma is not free if it has users without source mods.
926 if (!allUsesHaveSourceMods(Op.getNode()))
927 return SDValue();
928 break;
929 }
930 case AMDGPUISD::RCP: {
931 SDValue Src = Op.getOperand(0);
932 EVT VT = Op.getValueType();
933 SDLoc SL(Op);
934
935 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
936 ForCodeSize, Cost, Depth + 1);
937 if (NegSrc)
938 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
939 return SDValue();
940 }
941 default:
942 break;
943 }
944
945 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
946 ForCodeSize, Cost, Depth);
947}
948
949//===---------------------------------------------------------------------===//
950// Target Properties
951//===---------------------------------------------------------------------===//
952
955
956 // Packed operations do not have a fabs modifier.
957 return VT == MVT::f32 || VT == MVT::f64 ||
958 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
959}
960
963 // Report this based on the end legalized type.
964 VT = VT.getScalarType();
965 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
966}
967
969 unsigned NumElem,
970 unsigned AS) const {
971 return true;
972}
973
975 // There are few operations which truly have vector input operands. Any vector
976 // operation is going to involve operations on each component, and a
977 // build_vector will be a copy per element, so it always makes sense to use a
978 // build_vector input in place of the extracted element to avoid a copy into a
979 // super register.
980 //
981 // We should probably only do this if all users are extracts only, but this
982 // should be the common case.
983 return true;
984}
985
987 // Truncate is just accessing a subregister.
988
989 unsigned SrcSize = Source.getSizeInBits();
990 unsigned DestSize = Dest.getSizeInBits();
991
992 return DestSize < SrcSize && DestSize % 32 == 0 ;
993}
994
996 // Truncate is just accessing a subregister.
997
998 unsigned SrcSize = Source->getScalarSizeInBits();
999 unsigned DestSize = Dest->getScalarSizeInBits();
1000
1001 if (DestSize== 16 && Subtarget->has16BitInsts())
1002 return SrcSize >= 32;
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1008 unsigned SrcSize = Src->getScalarSizeInBits();
1009 unsigned DestSize = Dest->getScalarSizeInBits();
1010
1011 if (SrcSize == 16 && Subtarget->has16BitInsts())
1012 return DestSize >= 32;
1013
1014 return SrcSize == 32 && DestSize == 64;
1015}
1016
1018 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1019 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1020 // this will enable reducing 64-bit operations the 32-bit, which is always
1021 // good.
1022
1023 if (Src == MVT::i16)
1024 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1025
1026 return Src == MVT::i32 && Dest == MVT::i64;
1027}
1028
1030 EVT DestVT) const {
1031 switch (N->getOpcode()) {
1032 case ISD::ADD:
1033 case ISD::SUB:
1034 case ISD::SHL:
1035 case ISD::SRL:
1036 case ISD::SRA:
1037 case ISD::AND:
1038 case ISD::OR:
1039 case ISD::XOR:
1040 case ISD::MUL:
1041 case ISD::SETCC:
1042 case ISD::SELECT:
1043 if (Subtarget->has16BitInsts() &&
1044 (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
1045 // Don't narrow back down to i16 if promoted to i32 already.
1046 if (!N->isDivergent() && DestVT.isInteger() &&
1047 DestVT.getScalarSizeInBits() > 1 &&
1048 DestVT.getScalarSizeInBits() <= 16 &&
1049 SrcVT.getScalarSizeInBits() > 16) {
1050 return false;
1051 }
1052 }
1053 return true;
1054 default:
1055 break;
1056 }
1057
1058 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1059 // limited number of native 64-bit operations. Shrinking an operation to fit
1060 // in a single 32-bit register should always be helpful. As currently used,
1061 // this is much less general than the name suggests, and is only used in
1062 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1063 // not profitable, and may actually be harmful.
1064 if (isa<LoadSDNode>(N))
1065 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1066
1067 return true;
1068}
1069
1071 const SDNode* N, CombineLevel Level) const {
1072 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1073 N->getOpcode() == ISD::SRL) &&
1074 "Expected shift op");
1075
1076 SDValue ShiftLHS = N->getOperand(0);
1077 if (!ShiftLHS->hasOneUse())
1078 return false;
1079
1080 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1081 !ShiftLHS.getOperand(0)->hasOneUse())
1082 return false;
1083
1084 // Always commute pre-type legalization and right shifts.
1085 // We're looking for shl(or(x,y),z) patterns.
1087 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1088 return true;
1089
1090 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1091 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1092 (N->user_begin()->getOpcode() == ISD::SRA ||
1093 N->user_begin()->getOpcode() == ISD::SRL))
1094 return false;
1095
1096 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1097 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1098 if (LHS.getOpcode() != ISD::SHL)
1099 return false;
1100 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1101 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1102 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1103 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1104 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1105 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1106 };
1107 SDValue LHS = N->getOperand(0).getOperand(0);
1108 SDValue RHS = N->getOperand(0).getOperand(1);
1109 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1110}
1111
1112//===---------------------------------------------------------------------===//
1113// TargetLowering Callbacks
1114//===---------------------------------------------------------------------===//
1115
1117 bool IsVarArg) {
1118 switch (CC) {
1126 return CC_AMDGPU;
1129 return CC_AMDGPU_CS_CHAIN;
1130 case CallingConv::C:
1131 case CallingConv::Fast:
1132 case CallingConv::Cold:
1133 return CC_AMDGPU_Func;
1135 return CC_SI_Gfx;
1138 default:
1139 report_fatal_error("Unsupported calling convention for call");
1140 }
1141}
1142
// Selects the RetCC_* assignment table used to place return values for the
// given calling convention. NOTE(review): the signature line and several case
// labels were lost in this extraction; the elided labels presumably cover the
// kernel conventions (unreachable here) and the shader/gfx conventions.
1144 bool IsVarArg) {
1145 switch (CC) {
// Kernels return nothing through the return-value path; reaching this case
// is a compiler bug.
1148 llvm_unreachable("kernels should not be handled here");
1158 return RetCC_SI_Shader;
1160 return RetCC_SI_Gfx;
1161 case CallingConv::C:
1162 case CallingConv::Fast:
1163 case CallingConv::Cold:
1164 return RetCC_AMDGPU_Func;
1165 default:
1166 report_fatal_error("Unsupported calling convention.");
1167 }
1168}
1169
1170/// The SelectionDAGBuilder will automatically promote function arguments
1171/// with illegal types. However, this does not work for the AMDGPU targets
1172/// since the function arguments are stored in memory as these illegal types.
1173/// In order to handle this properly we need to get the original types sizes
1174/// from the LLVM IR Function and fixup the ISD:InputArg values before
1175/// passing them to AnalyzeFormalArguments()
1176
1177/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1178/// input values across multiple registers. Each item in the Ins array
1179/// represents a single value that will be stored in registers. Ins[x].VT is
1180/// the value type of the value that will be stored in the register, so
1181/// whatever SDNode we lower the argument to needs to be this type.
1182///
1183/// In order to correctly lower the arguments we need to know the size of each
1184/// argument. Since Ins[x].VT gives us the size of the register that will
1185/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1186/// for the original function argument so that we can deduce the correct memory
1187/// type to use for Ins[x]. In most cases the correct memory type will be
1188/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1189/// we have a kernel argument of type v8i8, this argument will be split into
1190/// 8 parts and each part will be represented by its own item in the Ins array.
1191/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1192/// the argument before it was split. From this, we deduce that the memory type
1193/// for each individual part is i8. We pass the memory type as LocVT to the
1194/// calling convention analysis function and the register type (Ins[x].VT) as
1195/// the ValVT.
// Recomputes accurate in-memory locations for kernel formal arguments and
// records them in State as custom-mem CCValAssigns (register type + deduced
// memory type + byte offset). See the block comment above for why the
// PartOffset values in Ins are discarded. NOTE(review): the signature line
// and a few declarations (the calling-convention local and the Offsets
// vector) were lost in this extraction; confirm against the full source.
1197 CCState &State,
1198 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1199 const MachineFunction &MF = State.getMachineFunction();
1200 const Function &Fn = MF.getFunction();
1201 LLVMContext &Ctx = Fn.getParent()->getContext();
1202 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
// Offset of the first explicit kernel argument in the kernarg segment.
1203 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1205
1206 Align MaxAlign = Align(1);
1207 uint64_t ExplicitArgOffset = 0;
1208 const DataLayout &DL = Fn.getDataLayout();
1209
// Running index into the flattened Ins array; incremented once per register.
1210 unsigned InIndex = 0;
1211
1212 for (const Argument &Arg : Fn.args()) {
// byref arguments are laid out using the pointee type, not the pointer.
1213 const bool IsByRef = Arg.hasByRefAttr();
1214 Type *BaseArgTy = Arg.getType();
1215 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1216 Align Alignment = DL.getValueOrABITypeAlignment(
1217 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1218 MaxAlign = std::max(Alignment, MaxAlign);
1219 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1220
// ArgOffset is where this argument starts; ExplicitArgOffset advances past it.
1221 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1222 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1223
1224 // We're basically throwing away everything passed into us and starting over
1225 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1226 // to us as computed in Ins.
1227 //
1228 // We also need to figure out what type legalization is trying to do to get
1229 // the correct memory offsets.
1230
1231 SmallVector<EVT, 16> ValueVTs;
1233 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1234
1235 for (unsigned Value = 0, NumValues = ValueVTs.size();
1236 Value != NumValues; ++Value) {
1237 uint64_t BasePartOffset = Offsets[Value];
1238
1239 EVT ArgVT = ValueVTs[Value];
1240 EVT MemVT = ArgVT;
1241 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1242 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1243
1244 if (NumRegs == 1) {
1245 // This argument is not split, so the IR type is the memory type.
1246 if (ArgVT.isExtended()) {
1247 // We have an extended type, like i24, so we should just use the
1248 // register type.
1249 MemVT = RegisterVT;
1250 } else {
1251 MemVT = ArgVT;
1252 }
1253 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1254 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1255 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1256 // We have a vector value which has been split into a vector with
1257 // the same scalar type, but fewer elements. This should handle
1258 // all the floating-point vector types.
1259 MemVT = RegisterVT;
1260 } else if (ArgVT.isVector() &&
1261 ArgVT.getVectorNumElements() == NumRegs) {
1262 // This arg has been split so that each element is stored in a separate
1263 // register.
1264 MemVT = ArgVT.getScalarType();
1265 } else if (ArgVT.isExtended()) {
1266 // We have an extended type, like i65.
1267 MemVT = RegisterVT;
1268 } else {
// Fallback: divide the stored bits evenly across the registers.
1269 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1270 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1271 if (RegisterVT.isInteger()) {
1272 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1273 } else if (RegisterVT.isVector()) {
1274 assert(!RegisterVT.getScalarType().isFloatingPoint());
1275 unsigned NumElements = RegisterVT.getVectorNumElements();
1276 assert(MemoryBits % NumElements == 0);
1277 // This vector type has been split into another vector type with
1278 // a different elements size.
1279 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1280 MemoryBits / NumElements);
1281 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1282 } else {
1283 llvm_unreachable("cannot deduce memory type.");
1284 }
1285 }
1286
1287 // Convert one element vectors to scalar.
1288 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1289 MemVT = MemVT.getScalarType();
1290
1291 // Round up vec3/vec5 argument.
1292 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1293 MemVT = MemVT.getPow2VectorType(State.getContext());
1294 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1295 MemVT = MemVT.getRoundIntegerType(State.getContext());
1296 }
1297
// Emit one location per register part, each MemVT.getStoreSize() bytes apart.
1298 unsigned PartOffset = 0;
1299 for (unsigned i = 0; i != NumRegs; ++i) {
1300 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1301 BasePartOffset + PartOffset,
1302 MemVT.getSimpleVT(),
1304 PartOffset += MemVT.getStoreSize();
1305 }
1306 }
1307 }
1308}
1309
// Lowers a return by emitting an ENDPGM terminator chained after Chain; any
// return values are ignored (see the FIXME below). NOTE(review): the
// signature's first line was lost in this extraction.
1311 SDValue Chain, CallingConv::ID CallConv,
1312 bool isVarArg,
1314 const SmallVectorImpl<SDValue> &OutVals,
1315 const SDLoc &DL, SelectionDAG &DAG) const {
1316 // FIXME: Fails for r600 tests
1317 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1318 // "wave terminate should not have return values");
1319 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1320}
1321
1322//===---------------------------------------------------------------------===//
1323// Target specific lowering
1324//===---------------------------------------------------------------------===//
1325
1326 /// Selects the correct CCAssignFn for a given CallingConvention value.
// Thin wrapper delegating to the call-lowering implementation above.
1328 bool IsVarArg) {
1329 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1330}
1331
// Return-value counterpart of the wrapper above. NOTE(review): the signature
// and the body's single statement (presumably a delegation to
// AMDGPUCallLowering::CCAssignFnForReturn) were lost in this extraction.
1333 bool IsVarArg) {
1335}
1336
// Builds a TokenFactor tying Chain to every entry-node load from a fixed
// (negative-index) stack object that overlaps the byte range of ClobberedFI,
// so those incoming-argument loads are ordered before the store that clobbers
// the slot. NOTE(review): the signature's first line was lost in this
// extraction.
1338 SelectionDAG &DAG,
1339 MachineFrameInfo &MFI,
1340 int ClobberedFI) const {
1341 SmallVector<SDValue, 8> ArgChains;
// Byte range [FirstByte, LastByte] occupied by the clobbered frame object.
1342 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1343 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1344
1345 // Include the original chain at the beginning of the list. When this is
1346 // used by target LowerCall hooks, this helps legalize find the
1347 // CALLSEQ_BEGIN node.
1348 ArgChains.push_back(Chain);
1349
1350 // Add a chain value for each stack argument corresponding
1351 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1352 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1353 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
// Negative frame indices are fixed objects (incoming arguments).
1354 if (FI->getIndex() < 0) {
1355 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1356 int64_t InLastByte = InFirstByte;
1357 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1358
// Interval-overlap test between the load's object and the clobbered slot.
1359 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1360 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1361 ArgChains.push_back(SDValue(L, 1));
1362 }
1363 }
1364 }
1365 }
1366
1367 // Build a tokenfactor for all the chains.
1368 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1369}
1370
// Diagnoses a call that cannot be lowered, then fabricates UNDEF values for
// the call's results so lowering can continue. NOTE(review): the signature
// and the start of the DiagnosticInfoUnsupported declaration were lost in
// this extraction.
1373 StringRef Reason) const {
1374 SDValue Callee = CLI.Callee;
1375 SelectionDAG &DAG = CLI.DAG;
1376
1377 const Function &Fn = DAG.getMachineFunction().getFunction();
1378
// Best-effort callee name for the diagnostic message.
1379 StringRef FuncName("<unknown>");
1380
1381 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1382 FuncName = G->getSymbol();
1383 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1384 FuncName = G->getGlobal()->getName();
1385
1387 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1388 DAG.getContext()->diagnose(NoCalls);
1389
// Tail calls produce no values in this frame; otherwise give each expected
// result an UNDEF of the right type so users of the call still type-check.
1390 if (!CLI.IsTailCall) {
1391 for (ISD::InputArg &Arg : CLI.Ins)
1392 InVals.push_back(DAG.getUNDEF(Arg.VT));
1393 }
1394
1395 return DAG.getEntryNode();
1396}
1397
// Default LowerCall: this target class does not support calls, so report the
// call as unsupported. NOTE(review): signature line lost in this extraction.
1399 SmallVectorImpl<SDValue> &InVals) const {
1400 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1401}
1402
// Dynamic allocas are unsupported: emit a diagnostic and return {0, chain} so
// legalization can proceed. NOTE(review): signature line lost in this
// extraction.
1404 SelectionDAG &DAG) const {
1405 const Function &Fn = DAG.getMachineFunction().getFunction();
1406
1407 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1408 SDLoc(Op).getDebugLoc());
1409 DAG.getContext()->diagnose(NoDynamicAlloca);
// Result 0: a zero "pointer"; result 1: the original chain (operand 0).
1410 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1411 return DAG.getMergeValues(Ops, SDLoc());
1412}
1413
// Main custom-lowering dispatch: routes each opcode marked Custom to its
// Lower* helper. NOTE(review): the signature line and a few case labels
// (e.g. before LowerCONCAT_VECTORS/UDIVREM and between CTTZ/CTLZ, presumably
// the *_ZERO_UNDEF variants) were lost in this extraction.
1415 SelectionDAG &DAG) const {
1416 switch (Op.getOpcode()) {
1417 default:
// Hitting default means an opcode was marked Custom without a handler.
1418 Op->print(errs(), &DAG);
1419 llvm_unreachable("Custom lowering code for this "
1420 "instruction is not implemented yet!");
1421 break;
1423 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1425 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1426 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1427 case ISD::FREM: return LowerFREM(Op, DAG);
1428 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1429 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1430 case ISD::FRINT: return LowerFRINT(Op, DAG);
1431 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1432 case ISD::FROUNDEVEN:
1433 return LowerFROUNDEVEN(Op, DAG);
1434 case ISD::FROUND: return LowerFROUND(Op, DAG);
1435 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1436 case ISD::FLOG2:
1437 return LowerFLOG2(Op, DAG);
1438 case ISD::FLOG:
1439 case ISD::FLOG10:
1440 return LowerFLOGCommon(Op, DAG);
1441 case ISD::FEXP:
1442 case ISD::FEXP10:
1443 return lowerFEXP(Op, DAG);
1444 case ISD::FEXP2:
1445 return lowerFEXP2(Op, DAG);
1446 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1447 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1448 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1449 case ISD::FP_TO_SINT:
1450 case ISD::FP_TO_UINT:
1451 return LowerFP_TO_INT(Op, DAG);
1452 case ISD::CTTZ:
1454 case ISD::CTLZ:
1456 return LowerCTLZ_CTTZ(Op, DAG);
1458 }
1459 return Op;
1460}
1461
// Produces replacement results for nodes whose result type is illegal; each
// handled opcode pushes its lowered value into Results (or leaves Results
// empty to fall back to default expansion). NOTE(review): the signature and
// the case label before the first comment (presumably SIGN_EXTEND_INREG) were
// lost in this extraction.
1464 SelectionDAG &DAG) const {
1465 switch (N->getOpcode()) {
1467 // Different parts of legalization seem to interpret which type of
1468 // sign_extend_inreg is the one to check for custom lowering. The extended
1469 // from type is what really matters, but some places check for custom
1470 // lowering of the result type. This results in trying to use
1471 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1472 // nothing here and let the illegal result integer be handled normally.
1473 return;
1474 case ISD::FLOG2:
1475 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1476 Results.push_back(Lowered);
1477 return;
1478 case ISD::FLOG:
1479 case ISD::FLOG10:
1480 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1481 Results.push_back(Lowered);
1482 return;
1483 case ISD::FEXP2:
1484 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1485 Results.push_back(Lowered);
1486 return;
1487 case ISD::FEXP:
1488 case ISD::FEXP10:
1489 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1490 Results.push_back(Lowered);
1491 return;
1492 case ISD::CTLZ:
1494 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1495 Results.push_back(Lowered);
1496 return;
1497 default:
// Unhandled opcodes fall through to the legalizer's default expansion.
1498 return;
1499 }
1500}
1501
// Lowers a GlobalAddress: LDS/region globals become constant offsets within
// the group segment; globals in other address spaces return SDValue() so the
// caller falls back. NOTE(review): the signature's first line and the call
// inside the first if (presumably an LDS kernel-id/absolute-address lookup)
// were lost in this extraction.
1503 SDValue Op,
1504 SelectionDAG &DAG) const {
1505
1506 const DataLayout &DL = DAG.getDataLayout();
1507 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1508 const GlobalValue *GV = G->getGlobal();
1509
// Non-entry functions may resolve certain globals to a known constant address.
1510 if (!MFI->isModuleEntryFunction()) {
1511 if (std::optional<uint32_t> Address =
1513 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1514 }
1515 }
1516
1517 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1518 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
// Non-kernel use of an LDS global (other than the module-LDS aggregate or a
// named barrier) cannot be allocated: warn and trap instead of erroring.
1519 if (!MFI->isModuleEntryFunction() &&
1520 GV->getName() != "llvm.amdgcn.module.lds" &&
1521 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
1522 SDLoc DL(Op);
1523 const Function &Fn = DAG.getMachineFunction().getFunction();
1524 DiagnosticInfoUnsupported BadLDSDecl(
1525 Fn, "local memory global used by non-kernel function",
1526 DL.getDebugLoc(), DS_Warning);
1527 DAG.getContext()->diagnose(BadLDSDecl);
1528
1529 // We currently don't have a way to correctly allocate LDS objects that
1530 // aren't directly associated with a kernel. We do force inlining of
1531 // functions that use local objects. However, if these dead functions are
1532 // not eliminated, we don't want a compile time error. Just emit a warning
1533 // and a trap, since there should be no callable path here.
1534 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1535 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1536 Trap, DAG.getRoot());
1537 DAG.setRoot(OutputChain);
1538 return DAG.getUNDEF(Op.getValueType());
1539 }
1540
1541 // XXX: What does the value of G->getOffset() mean?
1542 assert(G->getOffset() == 0 &&
1543 "Do not know what to do with an non-zero offset");
1544
1545 // TODO: We could emit code to handle the initialization somewhere.
1546 // We ignore the initializer for now and legalize it to allow selection.
1547 // The initializer will anyway get errored out during assembly emission.
1548 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1549 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1550 }
1551 return SDValue();
1552}
1553
// Lowers CONCAT_VECTORS. For sub-32-bit element vectors whose operands are a
// whole number of 32-bit words, the operands are bitcast to i32 (vectors),
// concatenated as i32 elements, and bitcast back; otherwise elements are
// extracted individually and rebuilt. NOTE(review): the signature line, the
// Args vector declaration, and part of the NewEltVT expression were lost in
// this extraction.
1555 SelectionDAG &DAG) const {
1557 SDLoc SL(Op);
1558
1559 EVT VT = Op.getValueType();
1560 if (VT.getVectorElementType().getSizeInBits() < 32) {
1561 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
// Only take the fast path when each operand is an exact multiple of 32 bits.
1562 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1563 unsigned NewNumElt = OpBitSize / 32;
1564 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1566 MVT::i32, NewNumElt);
1567 for (const SDUse &U : Op->ops()) {
1568 SDValue In = U.get();
1569 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
// Multi-word operands are flattened into their i32 elements.
1570 if (NewNumElt > 1)
1571 DAG.ExtractVectorElements(NewIn, Args);
1572 else
1573 Args.push_back(NewIn);
1574 }
1575
1576 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1577 NewNumElt * Op.getNumOperands());
1578 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1579 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1580 }
1581 }
1582
// Generic path: extract every element of every operand and rebuild.
1583 for (const SDUse &U : Op->ops())
1584 DAG.ExtractVectorElements(U.get(), Args);
1585
1586 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1587}
1588
// Lowers EXTRACT_SUBVECTOR. 16-bit-element extracts at even offsets are done
// as whole 32-bit word extracts via bitcasts; otherwise elements are pulled
// out one at a time. NOTE(review): the signature line, the Args vector
// declaration, and the element-count argument of the final
// ExtractVectorElements call were lost in this extraction.
1590 SelectionDAG &DAG) const {
1591 SDLoc SL(Op);
// Operand 1 is the constant starting element index.
1593 unsigned Start = Op.getConstantOperandVal(1);
1594 EVT VT = Op.getValueType();
1595 EVT SrcVT = Op.getOperand(0).getValueType();
1596
1597 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1598 unsigned NumElt = VT.getVectorNumElements();
1599 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1600 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1601
1602 // Extract 32-bit registers at a time.
1603 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1604 EVT NewVT = NumElt == 2
1605 ? MVT::i32
1606 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1607 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1608
1609 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1610 if (NumElt == 2)
1611 Tmp = Args[0];
1612 else
1613 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1614
1615 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1616 }
1617
1618 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1620
1621 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1622}
1623
1624 // TODO: Handle fabs too
// Strips a single outer FNEG, returning its operand; otherwise returns Val
// unchanged. NOTE(review): the signature line was lost in this extraction.
1626 if (Val.getOpcode() == ISD::FNEG)
1627 return Val.getOperand(0);
1628
1629 return Val;
1630}
1631
// Strips at most one each of FNEG, FABS, and FCOPYSIGN (in that fixed order)
// to look through sign-manipulating wrappers. NOTE(review): the signature
// line was lost in this extraction.
1633 if (Val.getOpcode() == ISD::FNEG)
1634 Val = Val.getOperand(0);
1635 if (Val.getOpcode() == ISD::FABS)
1636 Val = Val.getOperand(0);
1637 if (Val.getOpcode() == ISD::FCOPYSIGN)
1638 Val = Val.getOperand(0);
1639 return Val;
1640}
1641
// Maps a select-of-compare (LHS/RHS compared, True/False selected) onto
// FMIN_LEGACY/FMAX_LEGACY, choosing operand order to reproduce the hardware's
// NaN behavior. Returns SDValue() when no profitable mapping exists.
// NOTE(review): the signature's first line and the condition line of the two
// legalization-stage guards were lost in this extraction.
1643 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1644 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1645 SelectionDAG &DAG = DCI.DAG;
1646 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1647 switch (CCOpcode) {
// Equality-like and ordered/unordered-test predicates don't form a min/max.
1648 case ISD::SETOEQ:
1649 case ISD::SETONE:
1650 case ISD::SETUNE:
1651 case ISD::SETNE:
1652 case ISD::SETUEQ:
1653 case ISD::SETEQ:
1654 case ISD::SETFALSE:
1655 case ISD::SETFALSE2:
1656 case ISD::SETTRUE:
1657 case ISD::SETTRUE2:
1658 case ISD::SETUO:
1659 case ISD::SETO:
1660 break;
1661 case ISD::SETULE:
1662 case ISD::SETULT: {
1663 if (LHS == True)
1664 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1665 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1666 }
1667 case ISD::SETOLE:
1668 case ISD::SETOLT:
1669 case ISD::SETLE:
1670 case ISD::SETLT: {
1671 // Ordered. Assume ordered for undefined.
1672
1673 // Only do this after legalization to avoid interfering with other combines
1674 // which might occur.
1676 !DCI.isCalledByLegalizer())
1677 return SDValue();
1678
1679 // We need to permute the operands to get the correct NaN behavior. The
1680 // selected operand is the second one based on the failing compare with NaN,
1681 // so permute it based on the compare type the hardware uses.
1682 if (LHS == True)
1683 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1684 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1685 }
1686 case ISD::SETUGE:
1687 case ISD::SETUGT: {
1688 if (LHS == True)
1689 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1690 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1691 }
1692 case ISD::SETGT:
1693 case ISD::SETGE:
1694 case ISD::SETOGE:
1695 case ISD::SETOGT: {
1697 !DCI.isCalledByLegalizer())
1698 return SDValue();
1699
1700 if (LHS == True)
1701 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1702 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1703 }
1704 case ISD::SETCC_INVALID:
1705 llvm_unreachable("Invalid setcc condcode!");
1706 }
1707 return SDValue();
1708}
1709
1710 /// Generate Min/Max node
// Entry point: tries the direct (True,False)==(LHS,RHS) match first, then
// attempts to undo a foldFreeOpFromSelect fneg-fold so the impl can match.
// NOTE(review): the signature's first line was lost in this extraction.
1712 SDValue LHS, SDValue RHS,
1713 SDValue True, SDValue False,
1714 SDValue CC,
1715 DAGCombinerInfo &DCI) const {
1716 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1717 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1718
1719 SelectionDAG &DAG = DCI.DAG;
1720
1721 // If we can't directly match this, try to see if we can fold an fneg to
1722 // match.
1723
1724 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1725 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1726 SDValue NegTrue = peekFNeg(True);
1727
1728 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1729 // fmin/fmax.
1730 //
1731 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1732 // -> fneg (fmin_legacy lhs, K)
1733 //
1734 // TODO: Use getNegatedExpression
1735 if (LHS == NegTrue && CFalse && CRHS) {
1736 APFloat NegRHS = neg(CRHS->getValueAPF());
// Matches only when the false constant is exactly the negated RHS constant.
1737 if (NegRHS == CFalse->getValueAPF()) {
1738 SDValue Combined =
1739 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1740 if (Combined)
1741 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1742 return SDValue();
1743 }
1744 }
1745
1746 return SDValue();
1747}
1748
1749 std::pair<SDValue, SDValue>
// Splits a 64-bit value into its (low, high) 32-bit halves by bitcasting to
// v2i32 and extracting both elements. NOTE(review): the rest of the
// signature was lost in this extraction.
1751 SDLoc SL(Op);
1752
1753 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1754
1755 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1756 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1757
1758 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1759 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1760
1761 return std::pair(Lo, Hi);
1762}
1763
// Extracts the low 32 bits of a 64-bit value (element 0 of the v2i32 bitcast).
// NOTE(review): the signature line was lost in this extraction.
1765 SDLoc SL(Op);
1766
1767 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1768 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1770}
1771
// Extracts the high 32 bits of a 64-bit value (element 1 of the v2i32
// bitcast). NOTE(review): the signature line was lost in this extraction.
1773 SDLoc SL(Op);
1774
1775 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1776 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1777 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1778}
1779
1780 // Split a vector type into two parts. The first part is a power of two vector.
1781 // The second part is whatever is left over, and is a scalar if it would
1782 // otherwise be a 1-vector.
1783 std::pair<EVT, EVT>
// NOTE(review): the rest of the signature was lost in this extraction.
1785 EVT LoVT, HiVT;
1786 EVT EltVT = VT.getVectorElementType();
1787 unsigned NumElts = VT.getVectorNumElements();
// Low part gets the power-of-two ceiling of half the elements.
1788 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1789 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1790 HiVT = NumElts - LoNumElts == 1
1791 ? EltVT
1792 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1793 return std::pair(LoVT, HiVT);
1794}
1795
1796 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1797 // scalar.
1798 std::pair<SDValue, SDValue>
// NOTE(review): the rest of the signature, the assert's first line, and the
// EXTRACT_* opcode lines were lost in this extraction; the two extracts pull
// the low subvector at index 0 and the high part starting at LoVT's width.
1800 const EVT &LoVT, const EVT &HiVT,
1801 SelectionDAG &DAG) const {
1803 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1804 N.getValueType().getVectorNumElements() &&
1805 "More vector elements requested than available!");
1807 DAG.getVectorIdxConstant(0, DL));
1808 SDValue Hi = DAG.getNode(
1810 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1811 return std::pair(Lo, Hi);
1812}
1813
// Splits a wide vector load into two smaller loads (low power-of-two part +
// remainder), then rejoins the pieces and token-factors the chains.
// NOTE(review): the signature line and the opcode/index arguments of the
// second INSERT_SUBVECTOR were lost in this extraction.
1815 SelectionDAG &DAG) const {
1816 LoadSDNode *Load = cast<LoadSDNode>(Op);
1817 EVT VT = Op.getValueType();
1818 SDLoc SL(Op);
1819
1820
1821 // If this is a 2 element vector, we really want to scalarize and not create
1822 // weird 1 element vectors.
1823 if (VT.getVectorNumElements() == 2) {
1824 SDValue Ops[2];
1825 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1826 return DAG.getMergeValues(Ops, SL);
1827 }
1828
1829 SDValue BasePtr = Load->getBasePtr();
1830 EVT MemVT = Load->getMemoryVT();
1831
1832 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1833
1834 EVT LoVT, HiVT;
1835 EVT LoMemVT, HiMemVT;
1836 SDValue Lo, Hi;
1837
// Split both the value type and the memory type at the same element boundary.
1838 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1839 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1840 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1841
1842 unsigned Size = LoMemVT.getStoreSize();
1843 Align BaseAlign = Load->getAlign();
// Alignment of the high half is the base alignment reduced by the offset.
1844 Align HiAlign = commonAlignment(BaseAlign, Size);
1845
1846 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1847 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1848 BaseAlign, Load->getMemOperand()->getFlags());
1849 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1850 SDValue HiLoad =
1851 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1852 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1853 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1854
1855 SDValue Join;
1856 if (LoVT == HiVT) {
1857 // This is the case that the vector is power of two so was evenly split.
1858 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1859 } else {
// Uneven split: insert the low subvector, then the (possibly scalar) high
// part into an UNDEF of the full type.
1860 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1861 DAG.getVectorIdxConstant(0, SL));
1862 Join = DAG.getNode(
1864 VT, Join, HiLoad,
1866 }
1867
// Result 0: joined value; result 1: merged chain of both partial loads.
1868 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1869 LoLoad.getValue(1), HiLoad.getValue(1))};
1870
1871 return DAG.getMergeValues(Ops, SL);
1872}
1873
// For a vec3 load that is sufficiently aligned (>= 8 bytes) or 16-byte
// dereferenceable, widens it to a vec4 load and extracts the vec3 result;
// otherwise defers to SplitVectorLoad. NOTE(review): the signature line and
// the expressions building WideVT/WideMemVT (presumably 4-element widenings)
// were lost in this extraction.
1875 SelectionDAG &DAG) const {
1876 LoadSDNode *Load = cast<LoadSDNode>(Op);
1877 EVT VT = Op.getValueType();
1878 SDValue BasePtr = Load->getBasePtr();
1879 EVT MemVT = Load->getMemoryVT();
1880 SDLoc SL(Op);
1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1882 Align BaseAlign = Load->getAlign();
1883 unsigned NumElements = MemVT.getVectorNumElements();
1884
1885 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1886 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1887 if (NumElements != 3 ||
1888 (BaseAlign < Align(8) &&
1889 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1890 return SplitVectorLoad(Op, DAG);
1891
1892 assert(NumElements == 3);
1893
1894 EVT WideVT =
1896 EVT WideMemVT =
1898 SDValue WideLoad = DAG.getExtLoad(
1899 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1900 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
// Return the original-width subvector plus the widened load's chain.
1901 return DAG.getMergeValues(
1902 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1903 DAG.getVectorIdxConstant(0, SL)),
1904 WideLoad.getValue(1)},
1905 SL);
1906}
1907
// Splits a wide vector store into two truncating stores (low power-of-two
// part + remainder) and token-factors their chains. Mirrors SplitVectorLoad.
// NOTE(review): the signature line was lost in this extraction.
1909 SelectionDAG &DAG) const {
1910 StoreSDNode *Store = cast<StoreSDNode>(Op);
1911 SDValue Val = Store->getValue();
1912 EVT VT = Val.getValueType();
1913
1914 // If this is a 2 element vector, we really want to scalarize and not create
1915 // weird 1 element vectors.
1916 if (VT.getVectorNumElements() == 2)
1917 return scalarizeVectorStore(Store, DAG);
1918
1919 EVT MemVT = Store->getMemoryVT();
1920 SDValue Chain = Store->getChain();
1921 SDValue BasePtr = Store->getBasePtr();
1922 SDLoc SL(Op);
1923
1924 EVT LoVT, HiVT;
1925 EVT LoMemVT, HiMemVT;
1926 SDValue Lo, Hi;
1927
// Split value and memory types at the same boundary, then split the value.
1928 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1929 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1930 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1931
1932 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1933
1934 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1935 Align BaseAlign = Store->getAlign();
1936 unsigned Size = LoMemVT.getStoreSize();
// High half's alignment accounts for the byte offset of the split point.
1937 Align HiAlign = commonAlignment(BaseAlign, Size);
1938
1939 SDValue LoStore =
1940 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1941 Store->getMemOperand()->getFlags());
1942 SDValue HiStore =
1943 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1944 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1945
1946 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1947}
1948
1949 // This is a shortcut for integer division because we have fast i32<->f32
1950 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1951 // float is enough to accurately represent up to a 24-bit signed integer.
// Returns {Div, Rem} when both operands are provably narrow enough (>= 9 sign
// bits, i.e. <= 24 significant bits) to divide via f32 reciprocal; otherwise
// returns SDValue(). NOTE(review): the signature line and the lines defining
// the ToFp/ToInt conversion opcodes and the denormal-mode query were lost in
// this extraction.
1953 bool Sign) const {
1954 SDLoc DL(Op);
1955 EVT VT = Op.getValueType();
1956 SDValue LHS = Op.getOperand(0);
1957 SDValue RHS = Op.getOperand(1);
1958 MVT IntVT = MVT::i32;
1959 MVT FltVT = MVT::f32;
1960
// Require at least 9 known sign bits on each operand so the magnitude fits in
// the 24-bit significand of an f32.
1961 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1962 if (LHSSignBits < 9)
1963 return SDValue();
1964
1965 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1966 if (RHSSignBits < 9)
1967 return SDValue();
1968
1969 unsigned BitSize = VT.getSizeInBits();
1970 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
// DivBits: how many low bits of the result are meaningful (one extra for the
// sign in the signed case); used for the final truncation below.
1971 unsigned DivBits = BitSize - SignBits;
1972 if (Sign)
1973 ++DivBits;
1974
1977
// jq is the +/-1 rounding-correction term (selected by cv below).
1978 SDValue jq = DAG.getConstant(1, DL, IntVT);
1979
1980 if (Sign) {
1981 // char|short jq = ia ^ ib;
1982 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1983
1984 // jq = jq >> (bitsize - 2)
1985 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1986 DAG.getConstant(BitSize - 2, DL, VT));
1987
1988 // jq = jq | 0x1
1989 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1990 }
1991
1992 // int ia = (int)LHS;
1993 SDValue ia = LHS;
1994
1995 // int ib, (int)RHS;
1996 SDValue ib = RHS;
1997
1998 // float fa = (float)ia;
1999 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2000
2001 // float fb = (float)ib;
2002 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2003
// Approximate quotient: fa * rcp(fb).
2004 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2005 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2006
2007 // fq = trunc(fq);
2008 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2009
2010 // float fqneg = -fq;
2011 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2012
2014
// Pick the FMA/FMAD form that matches the subtarget's f32 denormal handling.
2015 bool UseFmadFtz = false;
2016 if (Subtarget->isGCN()) {
2018 UseFmadFtz =
2020 }
2021
2022 // float fr = mad(fqneg, fb, fa);
2023 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2024 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2026 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2027
2028 // int iq = (int)fq;
2029 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2030
2031 // fr = fabs(fr);
2032 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2033
2034 // fb = fabs(fb);
2035 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2036
2037 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2038
2039 // int cv = fr >= fb;
2040 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2041
2042 // jq = (cv ? jq : 0);
2043 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2044
2045 // dst = iq + jq;
2046 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2047
2048 // Rem needs compensation, it's easier to recompute it
2049 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2050 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2051
2052 // Truncate to number of bits this divide really is.
2053 if (Sign) {
2054 SDValue InRegSize
2055 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2056 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2057 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2058 } else {
2059 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2060 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2061 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2062 }
2063
2064 return DAG.getMergeValues({ Div, Rem }, DL);
2065}
2066
// Expand a 64-bit unsigned divide + remainder; the i64 quotient and remainder
// are appended to Results.
// NOTE(review): the extracted listing dropped several lines of this function
// (the first signature line, one half of the MaskedValueIsZero condition at
// 2090, and lines 2107-2108 / 2113-2114) -- confirm against upstream source.
2068 SelectionDAG &DAG,
2070 SDLoc DL(Op);
2071 EVT VT = Op.getValueType();
2072
2073 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2074
2075 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2076
2077 SDValue One = DAG.getConstant(1, DL, HalfVT);
2078 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2079
2080 // Split both operands into their 32-bit low/high halves.
2081 SDValue LHS_Lo, LHS_Hi;
2082 SDValue LHS = Op.getOperand(0);
2083 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2084
2085 SDValue RHS_Lo, RHS_Hi;
2086 SDValue RHS = Op.getOperand(1);
2087 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2088
// Fast path: when the high 32 bits are known zero, a single 32-bit UDIVREM
// suffices; the halves are repacked with a zero high word via BITCAST.
// (The dropped line 2090 presumably applies the same check to LHS.)
2089 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2091
2092 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2093 LHS_Lo, RHS_Lo);
2094
2095 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2096 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2097
2098 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2099 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2100 return;
2101 }
2102
2103 if (isTypeLegal(MVT::i64)) {
2104 // The algorithm here is based on ideas from "Software Integer Division",
2105 // Tom Rodeheffer, August 2008.
2106
2109
2110 // Compute denominator reciprocal.
// Pick the mad/fma opcode; FMAD_FTZ flushes denormals to zero.
// (Lines 2113-2114 of the selection expression were lost in extraction.)
2111 unsigned FMAD =
2112 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2115 : (unsigned)AMDGPUISD::FMAD_FTZ;
2116
// Build a float approximation of 1/RHS: hi half is scaled by 2^32
// (0x4f800000), then refined and split back into two 32-bit words.
2117 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2118 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2119 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2120 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2121 Cvt_Lo);
2122 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2123 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2124 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2125 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2126 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2127 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2128 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2129 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2130 Mul1);
2131 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2132 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2133 SDValue Rcp64 = DAG.getBitcast(VT,
2134 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2135
2136 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2137 SDValue One64 = DAG.getConstant(1, DL, VT);
2138 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2139 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2140
2141 // First round of UNR (Unsigned integer Newton-Raphson).
2142 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2143 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2144 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2145 SDValue Mulhi1_Lo, Mulhi1_Hi;
2146 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2147 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
// 64-bit add built from two 32-bit adds with carry chaining.
2148 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2149 Mulhi1_Lo, Zero1);
2150 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2151 Mulhi1_Hi, Add1_Lo.getValue(1));
2152 SDValue Add1 = DAG.getBitcast(VT,
2153 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2154
2155 // Second round of UNR.
2156 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2157 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2158 SDValue Mulhi2_Lo, Mulhi2_Hi;
2159 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2160 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2161 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2162 Mulhi2_Lo, Zero1);
2163 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2164 Mulhi2_Hi, Add2_Lo.getValue(1));
2165 SDValue Add2 = DAG.getBitcast(VT,
2166 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2167
// Initial quotient estimate and the remainder it implies.
2168 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2169
2170 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2171
2172 SDValue Mul3_Lo, Mul3_Hi;
2173 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2174 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2175 Mul3_Lo, Zero1);
2176 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2177 Mul3_Hi, Sub1_Lo.getValue(1));
2178 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2179 SDValue Sub1 = DAG.getBitcast(VT,
2180 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2181
// C3 != 0 means the estimated remainder is still >= RHS, so the quotient
// must be bumped (at most twice).
2182 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2183 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2184 ISD::SETUGE);
2185 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2186 ISD::SETUGE);
2187 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2188
2189 // TODO: Here and below portions of the code can be enclosed into if/endif.
2190 // Currently control flow is unconditional and we have 4 selects after
2191 // potential endif to substitute PHIs.
2192
2193 // if C3 != 0 ...
2194 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2195 RHS_Lo, Zero1);
2196 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2197 RHS_Hi, Sub1_Lo.getValue(1));
2198 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2199 Zero, Sub2_Lo.getValue(1));
2200 SDValue Sub2 = DAG.getBitcast(VT,
2201 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2202
2203 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2204
2205 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2206 ISD::SETUGE);
2207 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2208 ISD::SETUGE);
2209 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2210
2211 // if (C6 != 0)
2212 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2213
2214 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2215 RHS_Lo, Zero1);
2216 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2217 RHS_Hi, Sub2_Lo.getValue(1));
2218 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2219 Zero, Sub3_Lo.getValue(1));
2220 SDValue Sub3 = DAG.getBitcast(VT,
2221 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2222
2223 // endif C6
2224 // endif C3
2225
// Materialize the conditional results with selects instead of control flow.
2226 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2227 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2228
2229 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2230 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2231
2232 Results.push_back(Div);
2233 Results.push_back(Rem);
2234
2235 return;
2236 }
2237
2238 // r600 expansion: classic shift-subtract (restoring) division, one bit of
2239 // the quotient per iteration.
2240 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2241 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2242
2243 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2244 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2245 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2246
2247 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2248 SDValue DIV_Lo = Zero;
2249
2250 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2251
2252 for (unsigned i = 0; i < halfBitWidth; ++i) {
2253 const unsigned bitPos = halfBitWidth - i - 1;
2254 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2255 // Get value of high bit
2256 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2257 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2258 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2259
2260 // Shift
2261 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2262 // Add LHS high bit
2263 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2264
2265 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2266 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2267
2268 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2269
2270 // Update REM
2271 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2272 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2273 }
2274
2275 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2276 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2277 Results.push_back(DIV);
2278 Results.push_back(REM);
2279}
2280
// Lower an unsigned UDIVREM node. i64 is delegated to LowerUDIVREM64, small
// i32 cases to the 24-bit path, and the general i32 case is expanded with a
// reciprocal estimate plus two refinement rounds.
// NOTE(review): the signature line (2281) and the declaration of `Results`
// (line 2287) were dropped by the extraction.
2282 SelectionDAG &DAG) const {
2283 SDLoc DL(Op);
2284 EVT VT = Op.getValueType();
2285
2286 if (VT == MVT::i64) {
2288 LowerUDIVREM64(Op, DAG, Results);
2289 return DAG.getMergeValues(Results, DL);
2290 }
2291
2292 if (VT == MVT::i32) {
// Fast path when both operands are known to fit in 24 bits.
2293 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2294 return Res;
2295 }
2296
2297 SDValue X = Op.getOperand(0);
2298 SDValue Y = Op.getOperand(1);
2299
2300 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2301 // algorithm used here.
2302
2303 // Initial estimate of inv(y).
2304 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2305
2306 // One round of UNR.
2307 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2308 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2309 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2310 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2311
2312 // Quotient/remainder estimate.
2313 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2314 SDValue R =
2315 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2316
// Each refinement: if the remainder still exceeds Y, bump Q and shrink R.
2317 // First quotient/remainder refinement.
2318 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2319 SDValue One = DAG.getConstant(1, DL, VT);
2320 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2321 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2322 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2323 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2324 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2325
2326 // Second quotient/remainder refinement.
2327 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2328 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2329 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2330 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2331 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2332
2333 return DAG.getMergeValues({Q, R}, DL);
2334}
2335
// Lower a signed SDIVREM node: take absolute values, do an unsigned UDIVREM,
// then restore the signs (quotient sign = sign(LHS)^sign(RHS), remainder
// sign = sign(LHS)).
// NOTE(review): the signature line (2336) was dropped by the extraction.
2337 SelectionDAG &DAG) const {
2338 SDLoc DL(Op);
2339 EVT VT = Op.getValueType();
2340
2341 SDValue LHS = Op.getOperand(0);
2342 SDValue RHS = Op.getOperand(1);
2343
2344 SDValue Zero = DAG.getConstant(0, DL, VT);
2345 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2346
2347 if (VT == MVT::i32) {
2348 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2349 return Res;
2350 }
2351
// If both i64 operands actually fit in 32 bits (>32 sign bits), divide the
// low halves and sign-extend the 32-bit results.
2352 if (VT == MVT::i64 &&
2353 DAG.ComputeNumSignBits(LHS) > 32 &&
2354 DAG.ComputeNumSignBits(RHS) > 32) {
2355 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2356
2357 //HiLo split
2358 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2359 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2360 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2361 LHS_Lo, RHS_Lo);
2362 SDValue Res[2] = {
2363 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2364 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2365 };
2366 return DAG.getMergeValues(Res, DL);
2367 }
2368
// Sign masks: all-ones when the operand is negative, zero otherwise.
2369 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2370 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2371 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2372 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2373
// abs(x) computed branchlessly as (x + mask) ^ mask.
2374 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2375 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2376
2377 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2378 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2379
2380 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2381 SDValue Rem = Div.getValue(1);
2382
// Re-apply the signs with the inverse (xor then subtract the mask).
2383 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2384 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2385
2386 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2387 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2388
2389 SDValue Res[2] = {
2390 Div,
2391 Rem
2392 };
2393 return DAG.getMergeValues(Res, DL);
2394}
2395
2396// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
// Lower FREM as x - trunc(x/y)*y, fused into a single FMA. Node flags from
// the original operation are propagated to every intermediate node.
// NOTE(review): the signature line (2397) was dropped by the extraction.
2398 SDLoc SL(Op);
2399 EVT VT = Op.getValueType();
2400 auto Flags = Op->getFlags();
2401 SDValue X = Op.getOperand(0);
2402 SDValue Y = Op.getOperand(1);
2403
2404 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2405 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2406 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2407 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2408 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2409}
2410
// Lower f64 FCEIL: truncate toward zero, then add 1.0 when the input was
// positive and not already an integer.
// NOTE(review): the signature line (2411) was dropped by the extraction.
2412 SDLoc SL(Op);
2413 SDValue Src = Op.getOperand(0);
2414
2415 // result = trunc(src)
2416 // if (src > 0.0 && src != result)
2417 // result += 1.0
2418
2419 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2420
2421 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2422 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2423
2424 EVT SetCCVT =
2425 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2426
// Ordered compares (SETOGT/SETONE): NaN inputs fail both, so Add stays 0.
2427 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2428 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2429 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2430
2431 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2432 // TODO: Should this propagate fast-math-flags?
2433 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2434}
2435
// Extract the unbiased exponent of an f64 from its high 32 bits: BFE the
// 11-bit exponent field (which starts at bit 52-32=20 of the high word),
// then subtract the IEEE-754 double bias (1023).
// NOTE(review): the signature head (line 2436) was dropped by the extraction.
2437 SelectionDAG &DAG) {
2438 const unsigned FractBits = 52;
2439 const unsigned ExpBits = 11;
2440
2441 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2442 Hi,
2443 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2444 DAG.getConstant(ExpBits, SL, MVT::i32));
2445 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2446 DAG.getConstant(1023, SL, MVT::i32));
2447
2448 return Exp;
2449}
2450
// Lower f64 FTRUNC by integer bit manipulation: mask off the fraction bits
// below the binary point, handling the |exp| out-of-range cases with selects.
// NOTE(review): the signature line (2451) was dropped by the extraction.
2452 SDLoc SL(Op);
2453 SDValue Src = Op.getOperand(0);
2454
2455 assert(Op.getValueType() == MVT::f64);
2456
2457 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2458
2459 // Extract the upper half, since this is where we will find the sign and
2460 // exponent.
2461 SDValue Hi = getHiHalf64(Src, DAG);
2462
2463 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2464
2465 const unsigned FractBits = 52;
2466
2467 // Extract the sign bit.
2468 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2469 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2470
2471 // Extend back to 64-bits.
2472 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2473 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2474
2475 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2476 const SDValue FractMask
2477 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2478
// Shift the fraction mask right by the exponent: the remaining set bits are
// exactly the sub-integer fraction bits to clear.
2479 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2480 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2481 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2482
2483 EVT SetCCVT =
2484 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2485
2486 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2487
// Exp < 0: |src| < 1, the result is just the signed zero.
// Exp > 51: no fraction bits remain, the value is already an integer.
2488 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2489 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2490
2491 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2492 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2493
2494 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2495}
2496
2498 SelectionDAG &DAG) const {
2499 SDLoc SL(Op);
2500 SDValue Src = Op.getOperand(0);
2501
2502 assert(Op.getValueType() == MVT::f64);
2503
2504 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2505 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2506 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2507
2508 // TODO: Should this propagate fast-math-flags?
2509
2510 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2511 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2512
2513 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2514
2515 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2516 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2517
2518 EVT SetCCVT =
2519 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2520 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2521
2522 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2523}
2524
2526 SelectionDAG &DAG) const {
2527 // FNEARBYINT and FRINT are the same, except in their handling of FP
2528 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2529 // rint, so just treat them as equivalent.
2530 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2531 Op.getOperand(0));
2532}
2533
// Lower FRINT identically to FROUNDEVEN (FP-exception behavior is not
// modeled; see the FNEARBYINT comment above in the original file).
// NOTE(review): the signature line (2534) was dropped by the extraction.
2535 auto VT = Op.getValueType();
2536 auto Arg = Op.getOperand(0u);
2537 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2538}
2539
2540// XXX - May require not supporting f32 denormals?
2541
2542// Don't handle v2f16. The extra instructions to scalarize and repack around the
2543// compare and vselect end up producing worse code than scalarizing the whole
2544// operation.
// Lower FROUND (round half away from zero): trunc(x) plus copysign(1, x)
// when the truncated-away fraction is >= 0.5.
// NOTE(review): the signature line (2545) was dropped by the extraction.
2546 SDLoc SL(Op);
2547 SDValue X = Op.getOperand(0);
2548 EVT VT = Op.getValueType();
2549
2550 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2551
2552 // TODO: Should this propagate fast-math-flags?
2553
// Diff is the fraction dropped by the truncation; its magnitude decides
// whether to step one unit away from zero.
2554 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2555
2556 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2557
2558 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2559 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2560
2561 EVT SetCCVT =
2562 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2563
2564 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2565 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2566 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2567
2568 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2569 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2570}
2571
// Lower f64 FFLOOR: truncate toward zero, then subtract 1.0 when the input
// was negative and not already an integer (mirror image of LowerFCEIL).
// NOTE(review): the signature line (2572) was dropped by the extraction.
2573 SDLoc SL(Op);
2574 SDValue Src = Op.getOperand(0);
2575
2576 // result = trunc(src);
2577 // if (src < 0.0 && src != result)
2578 // result += -1.0.
2579
2580 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2581
2582 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2583 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2584
2585 EVT SetCCVT =
2586 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2587
// Ordered compares: a NaN input fails both, leaving Add == 0.
2588 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2589 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2590 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2591
2592 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2593 // TODO: Should this propagate fast-math-flags?
2594 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2595}
2596
2597/// Return true if it's known that \p Src can never be an f32 denormal value.
// Conservative: answers true only for producers whose result range provably
// excludes f32 denormals (f16 extensions, frexp mantissas); everything else
// returns false.
// NOTE(review): the signature line (2598) and the
// `case ISD::INTRINSIC_WO_CHAIN:` line (2605) were dropped by the extraction.
2599 switch (Src.getOpcode()) {
2600 case ISD::FP_EXTEND:
// Extending from f16: the smallest f16 values are normal in f32.
2601 return Src.getOperand(0).getValueType() == MVT::f16;
2602 case ISD::FP16_TO_FP:
2603 case ISD::FFREXP:
2604 return true;
2606 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2607 switch (IntrinsicID) {
2608 case Intrinsic::amdgcn_frexp_mant:
2609 return true;
2610 default:
2611 return false;
2612 }
2613 }
2614 default:
2615 return false;
2616 }
2617
2618 llvm_unreachable("covered opcode switch");
2619}
2620
// Return true if approximate-function lowering is allowed, either via the
// node's own afn flag or the global UnsafeFPMath/ApproxFuncFPMath options.
// NOTE(review): the signature head (line 2621) was dropped by the extraction.
2622 SDNodeFlags Flags) {
2623 if (Flags.hasApproximateFuncs())
2624 return true;
2625 auto &Options = DAG.getTarget().Options;
2626 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2627}
2628
// Return true if f32 denormal inputs must be handled explicitly for \p Src,
// i.e. the value is not provably denormal-free and (per the dropped tail of
// this expression) the function's denormal mode requires it.
// NOTE(review): the signature head and lines 2634-2635 (the denormal-mode
// query on the MachineFunction) were dropped by the extraction.
2630 SDValue Src,
2631 SDNodeFlags Flags) {
2632 return !valueIsKnownNeverF32Denorm(Src) &&
2633 DAG.getMachineFunction()
2636}
2637
// Build a setcc that is true when Src is (ordered) less than the smallest
// normalized value of its type -- i.e. Src is a positive denormal, zero, or
// negative.
// NOTE(review): the signature head (line 2638) was dropped by the extraction.
2639 SDValue Src,
2640 SDNodeFlags Flags) const {
2641 SDLoc SL(Src);
2642 EVT VT = Src.getValueType();
2643 const fltSemantics &Semantics = VT.getFltSemantics();
2644 SDValue SmallestNormal =
2645 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2646
2647 // Want to scale denormals up, but negatives and 0 work just as well on the
2648 // scaled path.
2649 SDValue IsLtSmallestNormal = DAG.getSetCC(
2650 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2651 SmallestNormal, ISD::SETOLT);
2652
2653 return IsLtSmallestNormal;
2654}
2655
// Build a setcc that is true when Src is finite: |Src| < inf. The ordered
// compare also rejects NaN.
// NOTE(review): the signature head (line 2656) was dropped by the extraction.
2657 SDNodeFlags Flags) const {
2658 SDLoc SL(Src);
2659 EVT VT = Src.getValueType();
2660 const fltSemantics &Semantics = VT.getFltSemantics();
2661 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2662
2663 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2664 SDValue IsFinite = DAG.getSetCC(
2665 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2666 Inf, ISD::SETOLT);
2667 return IsFinite;
2668}
2669
2670/// If denormal handling is required return the scaled input to FLOG2, and the
2671/// check for denormal range. Otherwise, return null values.
// Inputs below the smallest normal are multiplied by 2^32 so the hardware
// log sees a normal value; the caller compensates by subtracting 32 (times
// the log-base factor) from the result.
// NOTE(review): the signature head (line 2673) was dropped by the extraction.
2674 SDValue Src, SDNodeFlags Flags) const {
2675 if (!needsDenormHandlingF32(DAG, Src, Flags))
2676 return {};
2677
2678 MVT VT = MVT::f32;
2679 const fltSemantics &Semantics = APFloat::IEEEsingle();
2680 SDValue SmallestNormal =
2681 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2682
2683 SDValue IsLtSmallestNormal = DAG.getSetCC(
2684 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2685 SmallestNormal, ISD::SETOLT);
2686
2687 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2688 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2689 SDValue ScaleFactor =
2690 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2691
2692 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2693 return {ScaledInput, IsLtSmallestNormal};
2694}
2695
// Lower FLOG2 on top of the hardware AMDGPUISD::LOG, scaling denormal
// inputs up by 2^32 first and subtracting 32 from the result to compensate.
// NOTE(review): the signature line (2696) was dropped by the extraction.
2697 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2698 // If we have to handle denormals, scale up the input and adjust the result.
2699
2700 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2701 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2702
2703 SDLoc SL(Op);
2704 EVT VT = Op.getValueType();
2705 SDValue Src = Op.getOperand(0);
2706 SDNodeFlags Flags = Op->getFlags();
2707
2708 if (VT == MVT::f16) {
2709 // Nothing in half is a denormal when promoted to f32.
2710 assert(!Subtarget->has16BitInsts());
2711 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2712 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2713 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2714 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2715 }
2716
2717 auto [ScaledInput, IsLtSmallestNormal] =
2718 getScaledLogInput(DAG, SL, Src, Flags);
2719 if (!ScaledInput)
2720 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2721
2722 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2723
// log2(x * 2^32) == log2(x) + 32, so undo the scaling here.
2724 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2725 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2726 SDValue ResultOffset =
2727 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2728 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2729}
2730
2731static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2732 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2733 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2734 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2735}
2736
// Shared lowering for FLOG (natural log) and FLOG10: compute the hardware
// log2, then multiply by ln(2) or log10(2) using an extended-precision
// (head + tail constant) product to recover accuracy.
// NOTE(review): the signature line (2737) was dropped by the extraction.
2738 SelectionDAG &DAG) const {
2739 SDValue X = Op.getOperand(0);
2740 EVT VT = Op.getValueType();
2741 SDNodeFlags Flags = Op->getFlags();
2742 SDLoc DL(Op);
2743
2744 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2745 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2746
// Fast path: approximate lowering when afn/unsafe math allows it (f16 is
// always accurate enough via the f32 promotion).
2747 const auto &Options = getTargetMachine().Options;
2748 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2749 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2750
2751 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2752 // Log and multiply in f32 is good enough for f16.
2753 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2754 }
2755
2756 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2757 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2758 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2759 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2760 }
2761
2762 return Lowered;
2763 }
2764
2765 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2766 if (ScaledInput)
2767 X = ScaledInput;
2768
2769 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2770
2771 SDValue R;
2772 if (Subtarget->hasFastFMAF32()) {
2773 // c+cc are ln(2)/ln(10) to more than 49 bits
2774 const float c_log10 = 0x1.344134p-2f;
2775 const float cc_log10 = 0x1.09f79ep-26f;
2776
2777 // c + cc is ln(2) to more than 49 bits
2778 const float c_log = 0x1.62e42ep-1f;
2779 const float cc_log = 0x1.efa39ep-25f;
2780
2781 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2782 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2783
// FMA-based compensated product: R = Y*C with the rounding error of the
// first multiply recovered via fma(Y, C, -R) and folded back in.
2784 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2785 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2786 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2787 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2788 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2789 } else {
2790 // ch+ct is ln(2)/ln(10) to more than 36 bits
2791 const float ch_log10 = 0x1.344000p-2f;
2792 const float ct_log10 = 0x1.3509f6p-18f;
2793
2794 // ch + ct is ln(2) to more than 36 bits
2795 const float ch_log = 0x1.62e000p-1f;
2796 const float ct_log = 0x1.0bfbe8p-15f;
2797
2798 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2799 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2800
// Without fast FMA, split Y into a high part (low mantissa bits masked off)
// and a tail, then accumulate the four partial products.
2801 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2802 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2803 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2804 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2805 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2806
2807 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2808 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2809 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2810 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2811 }
2812
2813 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2814 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2815
2816 // TODO: Check if known finite from source value.
2817 if (!IsFiniteOnly) {
// Pass inf/NaN from the hardware log straight through instead of the
// polynomial result.
2818 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2819 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2820 }
2821
2822 if (IsScaled) {
// Undo the 2^32 denormal pre-scale: subtract 32*log(2) in the target base.
2823 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2824 SDValue ShiftK =
2825 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2826 SDValue Shift =
2827 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2828 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2829 }
2830
2831 return R;
2832}
2833
// Thin wrapper forwarding to the shared FLOG/FLOG10 lowering.
// NOTE(review): the signature line (2834) was dropped by the extraction.
2835 return LowerFLOGCommon(Op, DAG);
2836}
2837
2838// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2839// promote f16 operation.
// Approximate lowering: log(x) ~= log2(x) * (1/log2(base)), with the usual
// denormal pre-scaling for the f32 hardware log.
// NOTE(review): the signature head (line 2840) and the initializer of
// Log2BaseInverted (line 2848, presumably ln(2) or ln(2)/ln(10)) were
// dropped by the extraction.
2841 SelectionDAG &DAG, bool IsLog10,
2842 SDNodeFlags Flags) const {
2843 EVT VT = Src.getValueType();
2844 unsigned LogOp =
2845 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2846
2847 double Log2BaseInverted =
2849
2850 if (VT == MVT::f32) {
2851 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2852 if (ScaledInput) {
2853 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
// Compensate for the 2^32 pre-scale: subtract 32 in log2 space, already
// multiplied through by the base-conversion factor.
2854 SDValue ScaledResultOffset =
2855 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2856
2857 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2858
2859 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2860 ScaledResultOffset, Zero, Flags);
2861
2862 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2863
2864 if (Subtarget->hasFastFMAF32())
2865 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2866 Flags);
2867 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2868 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2869 }
2870 }
2871
2872 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2873 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2874
2875 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2876 Flags);
2877}
2878
// Lower FEXP2 on the hardware exp: for very negative f32 inputs (whose
// result would be denormal), shift the input up by 64 in exponent space and
// scale the result back down by 2^-64.
// NOTE(review): the signature line (2879) was dropped by the extraction.
2880 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2881 // If we have to handle denormals, scale up the input and adjust the result.
2882
2883 SDLoc SL(Op);
2884 EVT VT = Op.getValueType();
2885 SDValue Src = Op.getOperand(0);
2886 SDNodeFlags Flags = Op->getFlags();
2887
2888 if (VT == MVT::f16) {
2889 // Nothing in half is a denormal when promoted to f32.
2890 assert(!Subtarget->has16BitInsts());
2891 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2892 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2893 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2894 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2895 }
2896
2897 assert(VT == MVT::f32);
2898
2899 if (!needsDenormHandlingF32(DAG, Src, Flags))
2900 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2901
2902 // bool needs_scaling = x < -0x1.f80000p+6f;
2903 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2904
2905 // -nextafter(128.0, -1)
2906 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2907
2908 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2909
2910 SDValue NeedsScaling =
2911 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2912
// exp2(x) == exp2(x + 64) * 2^-64; the biased input stays in normal range.
2913 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2914 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2915
2916 SDValue AddOffset =
2917 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2918
2919 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2920 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2921
2922 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2923 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2924 SDValue ResultScale =
2925 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2926
2927 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2928}
2929
// Approximate lowering for exp(x): exp2(x * log2(e)), with an input bias of
// 64 (and matching 2^(-64*log2e)-style result rescale constant) for very
// negative f32 inputs that would otherwise produce denormal results.
// NOTE(review): the signature head (line 2930) was dropped by the extraction.
2931 SelectionDAG &DAG,
2932 SDNodeFlags Flags) const {
2933 EVT VT = X.getValueType();
2934 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2935
2936 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2937 // exp2(M_LOG2E_F * f);
2938 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2939 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2940 : (unsigned)ISD::FEXP2,
2941 SL, VT, Mul, Flags);
2942 }
2943
2944 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2945
2946 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2947 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2948
2949 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2950
2951 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2952
2953 SDValue AdjustedX =
2954 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2955
2956 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2957
2958 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2959
// Rescale constant compensating for the +64 input bias (exp(-64)).
2960 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2961 SDValue AdjustedResult =
2962 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2963
2964 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2965 Flags);
2966}
2967
2968/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2969/// handled correctly.
2971 SelectionDAG &DAG,
2972 SDNodeFlags Flags) const {
// NOTE(review): the opening line of this definition (with the function name
// and the SDValue 'X' parameter) is missing from this excerpt — confirm
// against the full source.
2973 const EVT VT = X.getValueType();
2974 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2975
// Fast path: exp10(x) is computed as the product of two exp2 factors whose
// multipliers K0 + K1 form a split-constant expansion of the exact scale
// factor, giving extra effective precision.
2976 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2977 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2978 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2979 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2980
2981 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2982 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2983 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2984 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2985 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2986 }
2987
2988 // bool s = x < -0x1.2f7030p+5f;
2989 // x += s ? 0x1.0p+5f : 0.0f;
2990 // exp10 = exp2(x * 0x1.a92000p+1f) *
2991 // exp2(x * 0x1.4f0978p-11f) *
2992 // (s ? 0x1.9f623ep-107f : 1.0f);
2993
2994 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2995
// Bias very negative inputs upward by 32.0 so the exp2 intermediate avoids
// the denormal range; the bias is undone by the final scale factor below.
2996 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2997 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2998
2999 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3000 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3001 SDValue AdjustedX =
3002 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3003
3004 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3005 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3006
3007 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3008 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3009 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3010 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3011
3012 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3013
// Per the comment above: the s-selected factor undoing the +32.0 bias.
3014 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3015 SDValue AdjustedResult =
3016 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3017
3018 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3019 Flags);
3020}
3021
// NOTE(review): the signature line of this definition is missing from this
// excerpt. The body lowers an exp-style node 'Op' (ISD::FEXP10 per the
// IsExp10 check, plus its non-exp10 sibling) for f16 and f32 scalars.
3023 EVT VT = Op.getValueType();
3024 SDLoc SL(Op);
3025 SDValue X = Op.getOperand(0);
3026 SDNodeFlags Flags = Op->getFlags();
3027 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3028
3029 if (VT.getScalarType() == MVT::f16) {
3030 // v_exp_f16 (fmul x, log2e)
3031 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3032 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3033
3034 if (VT.isVector())
3035 return SDValue();
3036
3037 // exp(f16 x) ->
3038 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3039
3040 // Nothing in half is a denormal when promoted to f32.
3041 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3042 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3043 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3044 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3045 }
3046
3047 assert(VT == MVT::f32);
3048
3049 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3050 // library behavior. Also, is known-not-daz source sufficient?
3051 if (allowApproxFunc(DAG, Flags)) {
3052 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3053 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3054 }
3055
3056 // Algorithm:
3057 //
3058 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3059 //
3060 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3061 // n = 64*m + j, 0 <= j < 64
3062 //
3063 // e^x = 2^((64*m + j + f)/64)
3064 // = (2^m) * (2^(j/64)) * 2^(f/64)
3065 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3066 //
3067 // f = x*(64/ln(2)) - n
3068 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3069 //
3070 // e^x = (2^m) * (2^(j/64)) * e^r
3071 //
3072 // (2^(j/64)) is precomputed
3073 //
3074 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3075 // e^r = 1 + q
3076 //
3077 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3078 //
3079 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3080 SDNodeFlags FlagsNoContract = Flags;
3081 FlagsNoContract.setAllowContract(false);
3082
// PH/PL form a head/tail extended-precision product of X with the
// exp/exp10 scale constant; two strategies depending on fast-FMA support.
3083 SDValue PH, PL;
3084 if (Subtarget->hasFastFMAF32()) {
3085 const float c_exp = numbers::log2ef;
3086 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3087 const float c_exp10 = 0x1.a934f0p+1f;
3088 const float cc_exp10 = 0x1.2f346ep-24f;
3089
3090 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3091 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3092
// FMA recovers the rounding error of the PH multiply into PL.
3093 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3094 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3095 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3096 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3097 } else {
3098 const float ch_exp = 0x1.714000p+0f;
3099 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3100
3101 const float ch_exp10 = 0x1.a92000p+1f;
3102 const float cl_exp10 = 0x1.4f0978p-11f;
3103
3104 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3105 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3106
// Without fast FMA, split X itself into high (top mantissa bits kept by
// the 0xfffff000 mask) and low parts so the partial products are exact.
3107 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3108 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3109 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3110 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3111 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3112
3113 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3114
3115 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3116 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3117 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3118 }
3119
3120 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3121
3122 // It is unsafe to contract this fsub into the PH multiply.
3123 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3124
3125 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3126 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3127 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3128
// Scale by 2^E via ldexp to reassemble the final result.
3129 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3130
// Inputs below this bound are clamped to a zero result.
3131 SDValue UnderflowCheckConst =
3132 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3133
3134 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3135 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3136 SDValue Underflow =
3137 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3138
3139 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3140 const auto &Options = getTargetMachine().Options;
3141
// Only materialize the overflow-to-inf clamp when infinities can occur.
3142 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3143 SDValue OverflowCheckConst =
3144 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3145 SDValue Overflow =
3146 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3147 SDValue Inf =
// NOTE(review): the right-hand side of 'Inf' (original line 3148) is
// missing from this excerpt — presumably an infinity constant of type VT;
// confirm against the full source.
3149 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3150 }
3151
3152 return R;
3153}
3154
3155static bool isCtlzOpc(unsigned Opc) {
3156 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3157}
3158
3159static bool isCttzOpc(unsigned Opc) {
3160 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3161}
3162
3164 SelectionDAG &DAG) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the node being lowered. Per the asserts below this handles only
// ctlz-family nodes with i8/i16 results: the argument is widened to i32,
// counted there, and the count corrected for the widening bits.
3165 auto SL = SDLoc(Op);
3166 auto Opc = Op.getOpcode();
3167 auto Arg = Op.getOperand(0u);
3168 auto ResultVT = Op.getValueType();
3169
3170 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3171 return {};
3172
3173 assert(isCtlzOpc(Opc));
3174 assert(ResultVT == Arg.getValueType());
3175
3176 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3177 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3178 SDValue NewOp;
3179
// Zero-undef: shifting the value into the top bits makes the i32 leading-
// zero count equal the narrow count directly (the input may be garbage-
// extended since zero input is undefined anyway). Otherwise zero-extend and
// subtract the number of widening bits from the i32 count.
3180 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3181 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3182 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3183 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3184 } else {
3185 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3186 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3187 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3188 }
3189
3190 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3191}
3192
3194 SDLoc SL(Op);
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the ctlz/cttz node being lowered. Maps 32-bit sources (and
// uniform 64-bit scalar sources) to a single find-first-bit node, and
// divergent 64-bit sources to a two-halves expansion (see the expansion
// comments inline below).
3195 SDValue Src = Op.getOperand(0);
3196
3197 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3198 bool Ctlz = isCtlzOpc(Op.getOpcode());
3199 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3200
3201 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3202 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3203 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3204
3205 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3206 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3207 // (cttz hi:lo) -> (umin (ffbl src), 32)
3208 // (ctlz_zero_undef src) -> (ffbh src)
3209 // (cttz_zero_undef src) -> (ffbl src)
3210
3211 // 64-bit scalar version produce 32-bit result
3212 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3213 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3214 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3215 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3216 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3217 if (!ZeroUndef) {
// Clamp the all-zero-input sentinel to the operand bit width.
3218 const SDValue ConstVal = DAG.getConstant(
3219 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3220 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3221 }
3222 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3223 }
3224
3225 SDValue Lo, Hi;
3226 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3227
3228 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3229 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3230
3231 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3232 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3233 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3234 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3235
// With defined zero behavior the half-count may be the sentinel, so the
// +32 must saturate (UADDSAT) instead of wrapping.
3236 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3237 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3238 if (Ctlz)
3239 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3240 else
3241 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3242
3243 SDValue NewOpr;
3244 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3245 if (!ZeroUndef) {
3246 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3247 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3248 }
3249
3250 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3251}
3252
3254 bool Signed) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the i64-to-f32 conversion being lowered (see the pseudo code in
// the comment block below), with 'Signed' selecting sitofp vs uitofp.
3255 // The regular method converting a 64-bit integer to float roughly consists of
3256 // 2 steps: normalization and rounding. In fact, after normalization, the
3257 // conversion from a 64-bit integer to a float is essentially the same as the
3258 // one from a 32-bit integer. The only difference is that it has more
3259 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3260 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3261 // converted into the correct float number. The basic steps for the unsigned
3262 // conversion are illustrated in the following pseudo code:
3263 //
3264 // f32 uitofp(i64 u) {
3265 // i32 hi, lo = split(u);
3266 // // Only count the leading zeros in hi as we have native support of the
3267 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3268 // // reduced to a 32-bit one automatically.
3269 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3270 // u <<= shamt;
3271 // hi, lo = split(u);
3272 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3273 // // convert it as a 32-bit integer and scale the result back.
3274 // return uitofp(hi) * 2^(32 - shamt);
3275 // }
3276 //
3277 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3278 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3279 // converted instead followed by negation based its sign bit.
3280
3281 SDLoc SL(Op);
3282 SDValue Src = Op.getOperand(0);
3283
3284 SDValue Lo, Hi;
3285 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3286 SDValue Sign;
3287 SDValue ShAmt;
3288 if (Signed && Subtarget->isGCN()) {
3289 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3290 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3291 // account. That is, the maximal shift is
3292 // - 32 if Lo and Hi have opposite signs;
3293 // - 33 if Lo and Hi have the same sign.
3294 //
3295 // Or, MaxShAmt = 33 + OppositeSign, where
3296 //
3297 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3298 // - -1 if Lo and Hi have opposite signs; and
3299 // - 0 otherwise.
3300 //
3301 // All in all, ShAmt is calculated as
3302 //
3303 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3304 //
3305 // or
3306 //
3307 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3308 //
3309 // to reduce the critical path.
3310 SDValue OppositeSign = DAG.getNode(
3311 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3312 DAG.getConstant(31, SL, MVT::i32));
3313 SDValue MaxShAmt =
3314 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3315 OppositeSign);
3316 // Count the leading sign bits.
3317 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3318 // Different from unsigned conversion, the shift should be one bit less to
3319 // preserve the sign bit.
3320 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3321 DAG.getConstant(1, SL, MVT::i32));
3322 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3323 } else {
3324 if (Signed) {
3325 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3326 // absolute value first.
3327 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3328 DAG.getConstant(63, SL, MVT::i64));
// abs(x) computed branchlessly as (x + sign) ^ sign.
3329 SDValue Abs =
3330 DAG.getNode(ISD::XOR, SL, MVT::i64,
3331 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3332 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3333 }
3334 // Count the leading zeros.
3335 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3336 // The shift amount for signed integers is [0, 32].
3337 }
3338 // Normalize the given 64-bit integer.
3339 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3340 // Split it again.
3341 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3342 // Calculate the adjust bit for rounding.
3343 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3344 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3345 DAG.getConstant(1, SL, MVT::i32), Lo);
3346 // Get the 32-bit normalized integer.
3347 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3348 // Convert the normalized 32-bit integer into f32.
3349 unsigned Opc =
3350 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3351 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3352
3353 // Finally, need to scale back the converted floating number as the original
3354 // 64-bit integer is converted as a 32-bit one.
3355 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3356 ShAmt);
3357 // On GCN, use LDEXP directly.
3358 if (Subtarget->isGCN())
3359 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3360
3361 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3362 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3363 // exponent is enough to avoid overflowing into the sign bit.
3364 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3365 DAG.getConstant(23, SL, MVT::i32));
3366 SDValue IVal =
3367 DAG.getNode(ISD::ADD, SL, MVT::i32,
3368 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3369 if (Signed) {
3370 // Set the sign bit.
3371 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3372 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3373 DAG.getConstant(31, SL, MVT::i32));
3374 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3375 }
3376 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3377}
3378
3380 bool Signed) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the i64-to-f64 conversion being lowered. The i64 is split into
// halves, each half converted to f64, and recombined as cvt(hi)*2^32 +
// cvt(lo).
3381 SDLoc SL(Op);
3382 SDValue Src = Op.getOperand(0);
3383
3384 SDValue Lo, Hi;
3385 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3386
// NOTE(review): the line starting this statement (original line 3387,
// presumably defining 'CvtHi' with a signed/unsigned conversion of Hi
// chosen by 'Signed') is missing from this excerpt — confirm against the
// full source.
3388 SL, MVT::f64, Hi);
3389
// The low half is always non-negative, so it converts unsigned.
3390 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3391
3392 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3393 DAG.getConstant(32, SL, MVT::i32));
3394 // TODO: Should this propagate fast-math-flags?
3395 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3396}
3397
3399 SelectionDAG &DAG) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the uitofp node being lowered. Narrow integer sources are
// promoted, bf16 destinations round through f32, and i64 sources use the
// 32-bit-based expansions LowerINT_TO_FP32/64 below.
3400 // TODO: Factor out code common with LowerSINT_TO_FP.
3401 EVT DestVT = Op.getValueType();
3402 SDValue Src = Op.getOperand(0);
3403 EVT SrcVT = Src.getValueType();
3404
3405 if (SrcVT == MVT::i16) {
3406 if (DestVT == MVT::f16)
3407 return Op;
3408 SDLoc DL(Op);
3409
3410 // Promote src to i32
3411 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3412 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3413 }
3414
// bf16 result: convert to f32 first, then round down to bf16.
3415 if (DestVT == MVT::bf16) {
3416 SDLoc SL(Op);
3417 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3418 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3419 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3420 }
3421
3422 if (SrcVT != MVT::i64)
3423 return Op;
3424
// i64 -> f16: go through f32 and round.
3425 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3426 SDLoc DL(Op);
3427
3428 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3429 SDValue FPRoundFlag =
3430 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3431 SDValue FPRound =
3432 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3433
3434 return FPRound;
3435 }
3436
3437 if (DestVT == MVT::f32)
3438 return LowerINT_TO_FP32(Op, DAG, false);
3439
3440 assert(DestVT == MVT::f64);
3441 return LowerINT_TO_FP64(Op, DAG, false);
3442}
3443
3445 SelectionDAG &DAG) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the sitofp node being lowered. Mirrors LowerUINT_TO_FP with
// sign-extension instead of zero-extension for the i16 promotion.
3446 EVT DestVT = Op.getValueType();
3447
3448 SDValue Src = Op.getOperand(0);
3449 EVT SrcVT = Src.getValueType();
3450
3451 if (SrcVT == MVT::i16) {
3452 if (DestVT == MVT::f16)
3453 return Op;
3454
3455 SDLoc DL(Op);
3456 // Promote src to i32
3457 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3458 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3459 }
3460
// bf16 result: convert to f32 first, then round down to bf16.
3461 if (DestVT == MVT::bf16) {
3462 SDLoc SL(Op);
3463 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3464 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3465 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3466 }
3467
3468 if (SrcVT != MVT::i64)
3469 return Op;
3470
3471 // TODO: Factor out code common with LowerUINT_TO_FP.
3472
// i64 -> f16: go through f32 and round.
3473 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3474 SDLoc DL(Op);
3475 SDValue Src = Op.getOperand(0);
3476
3477 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3478 SDValue FPRoundFlag =
3479 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3480 SDValue FPRound =
3481 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3482
3483 return FPRound;
3484 }
3485
3486 if (DestVT == MVT::f32)
3487 return LowerINT_TO_FP32(Op, DAG, true);
3488
3489 assert(DestVT == MVT::f64);
3490 return LowerINT_TO_FP64(Op, DAG, true);
3491}
3492
3494 bool Signed) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the f32/f64-to-i64 conversion being lowered, with 'Signed'
// selecting fptosi vs fptoui. See the hif/lof decomposition described in
// the comment block below.
3495 SDLoc SL(Op);
3496
3497 SDValue Src = Op.getOperand(0);
3498 EVT SrcVT = Src.getValueType();
3499
3500 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3501
3502 // The basic idea of converting a floating point number into a pair of 32-bit
3503 // integers is illustrated as follows:
3504 //
3505 // tf := trunc(val);
3506 // hif := floor(tf * 2^-32);
3507 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3508 // hi := fptoi(hif);
3509 // lo := fptoi(lof);
3510 //
3511 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3512 SDValue Sign;
3513 if (Signed && SrcVT == MVT::f32) {
3514 // However, a 32-bit floating point number has only 23 bits mantissa and
3515 // it's not enough to hold all the significant bits of `lof` if val is
3516 // negative. To avoid the loss of precision, We need to take the absolute
3517 // value after truncating and flip the result back based on the original
3518 // signedness.
3519 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3520 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3521 DAG.getConstant(31, SL, MVT::i32));
3522 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3523 }
3524
// K0 = 2^-32 and K1 = -2^32 in the source type, built from exact bit
// patterns (see the inline /*...*/ value comments).
3525 SDValue K0, K1;
3526 if (SrcVT == MVT::f64) {
3527 K0 = DAG.getConstantFP(
3528 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3529 SrcVT);
3530 K1 = DAG.getConstantFP(
3531 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3532 SrcVT);
3533 } else {
3534 K0 = DAG.getConstantFP(
3535 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3536 K1 = DAG.getConstantFP(
3537 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3538 }
3539 // TODO: Should this propagate fast-math-flags?
3540 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3541
3542 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3543
// Fma computes lof = tf - hif*2^32 exactly in one rounding.
3544 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3545
3546 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
// NOTE(review): the other arm of this conditional operator (original line
// 3547, presumably ': ISD::FP_TO_UINT,') is missing from this excerpt —
// confirm against the full source.
3548 SL, MVT::i32, FloorMul);
3549 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3550
3551 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3552 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3553
3554 if (Signed && SrcVT == MVT::f32) {
3555 assert(Sign);
3556 // Flip the result based on the signedness, which is either all 0s or 1s.
3557 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3558 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3559 // r := xor(r, sign) - sign;
3560 Result =
3561 DAG.getNode(ISD::SUB, SL, MVT::i64,
3562 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3563 }
3564
3565 return Result;
3566}
3567
3569 SDLoc DL(Op);
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the FP_TO_FP16 node being lowered. f32 sources map to a target
// node; f64 sources use the bit-level round-to-nearest-even expansion
// below.
3570 SDValue N0 = Op.getOperand(0);
3571
3572 // Convert to target node to get known bits
3573 if (N0.getValueType() == MVT::f32)
3574 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3575
3576 if (getTargetMachine().Options.UnsafeFPMath) {
3577 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3578 return SDValue();
3579 }
3580
3581 assert(N0.getSimpleValueType() == MVT::f64);
3582
3583 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3584 const unsigned ExpMask = 0x7ff;
3585 const unsigned ExpBiasf64 = 1023;
3586 const unsigned ExpBiasf16 = 15;
3587 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3588 SDValue One = DAG.getConstant(1, DL, MVT::i32);
// U/UH: low and high 32-bit halves of the f64 bit pattern.
3589 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3590 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3591 DAG.getConstant(32, DL, MVT::i64));
3592 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3593 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
// E: the f64 exponent field, rebiased for f16 below.
3594 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3595 DAG.getConstant(20, DL, MVT::i64));
3596 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3597 DAG.getConstant(ExpMask, DL, MVT::i32));
3598 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3599 // add the f16 bias (15) to get the biased exponent for the f16 format.
3600 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3601 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3602
// M: top mantissa bits positioned for the f16 significand.
3603 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3604 DAG.getConstant(8, DL, MVT::i32));
3605 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3606 DAG.getConstant(0xffe, DL, MVT::i32));
3607
// MaskedSig: the discarded low 41 mantissa bits, used as a sticky bit.
3608 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3609 DAG.getConstant(0x1ff, DL, MVT::i32));
3610 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3611
3612 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3613 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3614
3615 // (M != 0 ? 0x0200 : 0) | 0x7c00;
// I: the Inf/NaN encoding (quiet bit set when any mantissa bit survives).
3616 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3617 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3618 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3619
3620 // N = M | (E << 12);
3621 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3622 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3623 DAG.getConstant(12, DL, MVT::i32)));
3624
3625 // B = clamp(1-E, 0, 13);
3626 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3627 One, E);
3628 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3629 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3630 DAG.getConstant(13, DL, MVT::i32));
3631
// Subnormal path: restore the implicit leading 1, shift right by B, and
// fold any shifted-out bits back in as a sticky bit (D1).
3632 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3633 DAG.getConstant(0x1000, DL, MVT::i32));
3634
3635 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3636 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3637 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3638 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3639
// Pick subnormal (E < 1) vs normal encoding, then round to nearest-even
// using the low three bits.
3640 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3641 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3642 DAG.getConstant(0x7, DL, MVT::i32));
3643 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3644 DAG.getConstant(2, DL, MVT::i32));
3645 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3646 One, Zero, ISD::SETEQ);
3647 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3648 One, Zero, ISD::SETGT);
3649 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3650 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3651
// Overflow (E > 30) becomes infinity; E == 1039 corresponds to an all-ones
// f64 exponent (2047 - 1023 + 15), i.e. the source was Inf/NaN -> use I.
3652 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3653 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3654 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3655 I, V, ISD::SETEQ);
3656
3657 // Extract the sign bit.
3658 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3659 DAG.getConstant(16, DL, MVT::i32));
3660 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3661 DAG.getConstant(0x8000, DL, MVT::i32));
3662
3663 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3664 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3665}
3666
3668 SelectionDAG &DAG) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is the FP_TO_SINT/FP_TO_UINT node being lowered. Dispatches by
// source/destination type: native f16->i16, bf16 promoted through f32,
// i16 results truncated from i32, and i64 results expanded below.
3669 SDValue Src = Op.getOperand(0);
3670 unsigned OpOpcode = Op.getOpcode();
3671 EVT SrcVT = Src.getValueType();
3672 EVT DestVT = Op.getValueType();
3673
3674 // Will be selected natively
3675 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3676 return Op;
3677
3678 if (SrcVT == MVT::bf16) {
3679 SDLoc DL(Op);
3680 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3681 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3682 }
3683
3684 // Promote i16 to i32
3685 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3686 SDLoc DL(Op);
3687
3688 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3689 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3690 }
3691
3692 if (DestVT != MVT::i64)
3693 return Op;
3694
// f16 (or f32 that came from an f16 extend) can't exceed i32 range, so
// convert to i32 and extend.
3695 if (SrcVT == MVT::f16 ||
3696 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3697 SDLoc DL(Op);
3698
3699 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3700 unsigned Ext =
// NOTE(review): the right-hand side of 'Ext' (original line 3701,
// presumably selecting sign- vs zero-extend based on FP_TO_SINT) is
// missing from this excerpt — confirm against the full source.
3702 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3703 }
3704
3705 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3706 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3707
3708 return SDValue();
3709}
3710
3712 SelectionDAG &DAG) const {
// NOTE(review): the opening signature line is missing from this excerpt;
// 'Op' is a vector SIGN_EXTEND_INREG node (see the assert), lowered by
// scalarizing: extract each element, sign-extend it in-register, and
// rebuild the vector.
3713 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3714 MVT VT = Op.getSimpleValueType();
3715 MVT ScalarVT = VT.getScalarType();
3716
3717 assert(VT.isVector());
3718
3719 SDValue Src = Op.getOperand(0);
3720 SDLoc DL(Op);
3721
3722 // TODO: Don't scalarize on Evergreen?
3723 unsigned NElts = VT.getVectorNumElements();
// NOTE(review): the declaration of 'Args' (original line 3724, presumably
// a SmallVector<SDValue>) is missing from this excerpt — confirm against
// the full source.
3725 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3726
3727 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3728 for (unsigned I = 0; I < NElts; ++I)
3729 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3730
3731 return DAG.getBuildVector(VT, DL, Args);
3732}
3733
3734//===----------------------------------------------------------------------===//
3735// Custom DAG optimizations
3736//===----------------------------------------------------------------------===//
3737
3738static bool isU24(SDValue Op, SelectionDAG &DAG) {
3739 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3740}
3741
3742 static bool isI24(SDValue Op, SelectionDAG &DAG) {
3743 EVT VT = Op.getValueType();
3744 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3745 // as unsigned 24-bit values.
// NOTE(review): the second operand of this '&&' (original line 3746,
// presumably a signed significant-bit count compared against 24) is
// missing from this excerpt — confirm against the full source.
3747}
3748
// NOTE(review): the signature lines of this definition are missing from
// this excerpt; 'Node24' is a 24-bit multiply node (either a native AMDGPU
// mul24 node or the corresponding amdgcn intrinsic) and 'DCI' is the
// combiner context. Simplifies the operands given that only their low 24
// bits are demanded.
3751 SelectionDAG &DAG = DCI.DAG;
3752 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3753 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3754
// Intrinsic form carries the intrinsic ID as operand 0, shifting the
// multiplicands to operands 1/2.
3755 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3756 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3757 unsigned NewOpcode = Node24->getOpcode();
3758 if (IsIntrin) {
3759 unsigned IID = Node24->getConstantOperandVal(0);
3760 switch (IID) {
3761 case Intrinsic::amdgcn_mul_i24:
3762 NewOpcode = AMDGPUISD::MUL_I24;
3763 break;
3764 case Intrinsic::amdgcn_mul_u24:
3765 NewOpcode = AMDGPUISD::MUL_U24;
3766 break;
3767 case Intrinsic::amdgcn_mulhi_i24:
3768 NewOpcode = AMDGPUISD::MULHI_I24;
3769 break;
3770 case Intrinsic::amdgcn_mulhi_u24:
3771 NewOpcode = AMDGPUISD::MULHI_U24;
3772 break;
3773 default:
3774 llvm_unreachable("Expected 24-bit mul intrinsic");
3775 }
3776 }
3777
3778 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3779
3780 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3781 // the operands to have other uses, but will only perform simplifications that
3782 // involve bypassing some nodes for this user.
3783 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3784 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3785 if (DemandedLHS || DemandedRHS)
3786 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3787 DemandedLHS ? DemandedLHS : LHS,
3788 DemandedRHS ? DemandedRHS : RHS);
3789
3790 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3791 // operands if this node is the only user.
3792 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3793 return SDValue(Node24, 0);
3794 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3795 return SDValue(Node24, 0);
3796
3797 return SDValue();
3798}
3799
3800 template <typename IntTy>
// NOTE(review): the line carrying this function's name and leading
// parameters (original line 3801, presumably 'SelectionDAG &DAG, IntTy
// Src0, uint32_t Offset,') is missing from this excerpt — confirm against
// the full source.
3802 uint32_t Width, const SDLoc &DL) {
// Constant-folds a bitfield extract of Src0: the field [Offset,
// Offset+Width) is shifted to the top of a 32-bit word and shifted back
// down, so IntTy's signedness determines sign- vs zero-extension.
3803 if (Width + Offset < 32) {
3804 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3805 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3806 if constexpr (std::is_signed_v<IntTy>) {
3807 return DAG.getSignedConstant(Result, DL, MVT::i32);
3808 } else {
3809 return DAG.getConstant(Result, DL, MVT::i32);
3810 }
3811 }
3812
// Field reaches bit 31: a plain right shift extracts it with no masking.
3813 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3814}
3815
3816static bool hasVolatileUser(SDNode *Val) {
3817 for (SDNode *U : Val->users()) {
3818 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3819 if (M->isVolatile())
3820 return true;
3821 }
3822 }
3823
3824 return false;
3825}
3826
3828 // i32 vectors are the canonical memory type.
// NOTE(review): the signature line of this definition is missing from this
// excerpt; the body is a predicate over a memory type 'VT' deciding whether
// a load/store of it should be rewritten to an equivalent integer type.
3829 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3830 return false;
3831
3832 if (!VT.isByteSized())
3833 return false;
3834
3835 unsigned Size = VT.getStoreSize();
3836
// Scalar 1/2/4-byte accesses already load/store naturally.
3837 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3838 return false;
3839
// Reject 3-byte types and any size that is not a multiple of 4 bytes.
3840 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3841 return false;
3842
3843 return true;
3844}
3845
3846// Replace load of an illegal type with a store of a bitcast to a friendlier
3847// type.
3849 DAGCombinerInfo &DCI) const {
3850 if (!DCI.isBeforeLegalize())
3851 return SDValue();
3852
3853 LoadSDNode *LN = cast<LoadSDNode>(N);
3854 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3855 return SDValue();
3856
3857 SDLoc SL(N);
3858 SelectionDAG &DAG = DCI.DAG;
3859 EVT VT = LN->getMemoryVT();
3860
3861 unsigned Size = VT.getStoreSize();
3862 Align Alignment = LN->getAlign();
3863 if (Alignment < Size && isTypeLegal(VT)) {
3864 unsigned IsFast;
3865 unsigned AS = LN->getAddressSpace();
3866
3867 // Expand unaligned loads earlier than legalization. Due to visitation order
3868 // problems during legalization, the emitted instructions to pack and unpack
3869 // the bytes again are not eliminated in the case of an unaligned copy.
3871 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3872 if (VT.isVector())
3873 return SplitVectorLoad(SDValue(LN, 0), DAG);
3874
3875 SDValue Ops[2];
3876 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3877
3878 return DAG.getMergeValues(Ops, SDLoc(N));
3879 }
3880
3881 if (!IsFast)
3882 return SDValue();
3883 }
3884
3885 if (!shouldCombineMemoryType(VT))
3886 return SDValue();
3887
3888 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3889
3890 SDValue NewLoad
3891 = DAG.getLoad(NewVT, SL, LN->getChain(),
3892 LN->getBasePtr(), LN->getMemOperand());
3893
3894 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3895 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3896 return SDValue(N, 0);
3897}
3898
3899// Replace store of an illegal type with a store of a bitcast to a friendlier
3900// type.
// Also expands or splits unaligned stores before legalization.
// NOTE(review): the signature line (internal 3901,
// AMDGPUTargetLowering::performStoreCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
3902 DAGCombinerInfo &DCI) const {
3903 if (!DCI.isBeforeLegalize())
3904 return SDValue();
3905
// Only plain, non-volatile stores are safe to rewrite.
3906 StoreSDNode *SN = cast<StoreSDNode>(N);
3907 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3908 return SDValue();
3909
3910 EVT VT = SN->getMemoryVT();
3911 unsigned Size = VT.getStoreSize();
3912
3913 SDLoc SL(N);
3914 SelectionDAG &DAG = DCI.DAG;
3915 Align Alignment = SN->getAlign();
3916 if (Alignment < Size && isTypeLegal(VT)) {
3917 unsigned IsFast;
3918 unsigned AS = SN->getAddressSpace();
3919
3920 // Expand unaligned stores earlier than legalization. Due to visitation
3921 // order problems during legalization, the emitted instructions to pack and
3922 // unpack the bytes again are not eliminated in the case of an unaligned
3923 // copy.
// NOTE(review): the allowsMisalignedMemoryAccesses(...) call heading this
// condition (internal line 3924) was lost in extraction.
3925 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3926 if (VT.isVector())
3927 return SplitVectorStore(SDValue(SN, 0), DAG);
3928
3929 return expandUnalignedStore(SN, DAG);
3930 }
3931
3932 if (!IsFast)
3933 return SDValue();
3934 }
3935
3936 if (!shouldCombineMemoryType(VT))
3937 return SDValue();
3938
3939 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3940 SDValue Val = SN->getValue();
3941
3942 //DCI.AddToWorklist(Val.getNode());
3943
// If the stored value has other users, give them a bitcast chain back to the
// original type so they keep a value of the expected type.
3944 bool OtherUses = !Val.hasOneUse();
3945 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3946 if (OtherUses) {
3947 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3948 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3949 }
3950
3951 return DAG.getStore(SN->getChain(), SL, CastVal,
3952 SN->getBasePtr(), SN->getMemOperand());
3953}
3954
3955// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3956// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3957// issues.
// Hoist an AssertZext/AssertSext above a truncate of a wider source, so the
// assertion applies to the wide value and the truncate moves outward.
// NOTE(review): the signature line (internal 3958,
// AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, ...)) was lost
// in extraction — confirm against the upstream source.
3959 DAGCombinerInfo &DCI) const {
3960 SelectionDAG &DAG = DCI.DAG;
3961 SDValue N0 = N->getOperand(0);
3962
3963 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3964 // (vt2 (truncate (assertzext vt0:x, vt1)))
3965 if (N0.getOpcode() == ISD::TRUNCATE) {
3966 SDValue N1 = N->getOperand(1);
3967 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3968 SDLoc SL(N);
3969
3970 SDValue Src = N0.getOperand(0);
3971 EVT SrcVT = Src.getValueType();
// The asserted width must fit in the wide source type for the swap to be
// meaningful.
3972 if (SrcVT.bitsGE(ExtVT)) {
3973 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3974 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3975 }
3976 }
3977
3978 return SDValue();
3979}
3980
// Combine INTRINSIC_WO_CHAIN nodes for a few AMDGCN intrinsics: simplify the
// 24-bit multiplies, fold undef sources, and strip sign ops from frexp_exp.
// NOTE(review): the first signature line (internal 3981) was lost in
// extraction — confirm against the upstream source.
3982 SDNode *N, DAGCombinerInfo &DCI) const {
// Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID.
3983 unsigned IID = N->getConstantOperandVal(0);
3984 switch (IID) {
3985 case Intrinsic::amdgcn_mul_i24:
3986 case Intrinsic::amdgcn_mul_u24:
3987 case Intrinsic::amdgcn_mulhi_i24:
3988 case Intrinsic::amdgcn_mulhi_u24:
3989 return simplifyMul24(N, DCI);
3990 case Intrinsic::amdgcn_fract:
3991 case Intrinsic::amdgcn_rsq:
3992 case Intrinsic::amdgcn_rcp_legacy:
3993 case Intrinsic::amdgcn_rsq_legacy:
3994 case Intrinsic::amdgcn_rsq_clamp: {
3995 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3996 SDValue Src = N->getOperand(1);
// undef in -> undef out for these pure math intrinsics.
3997 return Src.isUndef() ? Src : SDValue();
3998 }
3999 case Intrinsic::amdgcn_frexp_exp: {
4000 // frexp_exp (fneg x) -> frexp_exp x
4001 // frexp_exp (fabs x) -> frexp_exp x
4002 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4003 SDValue Src = N->getOperand(1);
4004 SDValue PeekSign = peekFPSignOps(Src);
4005 if (PeekSign == Src)
4006 return SDValue();
// Update the node in place with the sign-stripped source.
4007 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4008 0);
4009 }
4010 default:
4011 return SDValue();
4012 }
4013}
4014
4015/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4016/// binary operation \p Opc to it with the corresponding constant operands.
// NOTE(review): the first signature line (internal 4017,
// AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(...)) was lost in
// extraction — confirm against the upstream source.
4018 DAGCombinerInfo &DCI, const SDLoc &SL,
4019 unsigned Opc, SDValue LHS,
4020 uint32_t ValLo, uint32_t ValHi) const {
4021 SelectionDAG &DAG = DCI.DAG;
4022 SDValue Lo, Hi;
4023 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4024
4025 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4026 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4027
// Apply Opc independently to each 32-bit half with its constant.
4028 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4029 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4030
4031 // Re-visit the ands. It's possible we eliminated one of them and it could
4032 // simplify the vector.
4033 DCI.AddToWorklist(Lo.getNode());
4034 DCI.AddToWorklist(Hi.getNode());
4035
// Reassemble the halves as v2i32 and bitcast back to i64.
4036 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4037 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4038}
4039
// Combine SHL by a constant: canonicalize (shl (ext i16), 16) to a packed
// build_vector, narrow extended shifts that cannot overflow, and split
// i64 shifts by >= 32 into a cheap 32-bit shift plus a zero half.
// NOTE(review): the signature line (internal 4040,
// AMDGPUTargetLowering::performShlCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4041 DAGCombinerInfo &DCI) const {
4042 EVT VT = N->getValueType(0);
4043
4044 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4045 if (!RHS)
4046 return SDValue();
4047
4048 SDValue LHS = N->getOperand(0);
4049 unsigned RHSVal = RHS->getZExtValue();
// Shift by zero is the identity.
4050 if (!RHSVal)
4051 return LHS;
4052
4053 SDLoc SL(N);
4054 SelectionDAG &DAG = DCI.DAG;
4055
4056 switch (LHS->getOpcode()) {
4057 default:
4058 break;
4059 case ISD::ZERO_EXTEND:
4060 case ISD::SIGN_EXTEND:
4061 case ISD::ANY_EXTEND: {
4062 SDValue X = LHS->getOperand(0);
4063
4064 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4065 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4066 // Prefer build_vector as the canonical form if packed types are legal.
4067 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4068 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4069 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4070 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4071 }
4072
4073 // shl (ext x) => zext (shl x), if shift does not overflow int
4074 if (VT != MVT::i64)
4075 break;
// Enough known leading zeros in x guarantee the shifted value still fits
// in the narrow type, so the shift can be done pre-extension.
4076 KnownBits Known = DAG.computeKnownBits(X);
4077 unsigned LZ = Known.countMinLeadingZeros();
4078 if (LZ < RHSVal)
4079 break;
4080 EVT XVT = X.getValueType();
4081 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4082 return DAG.getZExtOrTrunc(Shl, SL, VT);
4083 }
4084 }
4085
4086 if (VT != MVT::i64)
4087 return SDValue();
4088
4089 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4090
4091 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4092 // common case, splitting this into a move and a 32-bit shift is faster and
4093 // the same code size.
4094 if (RHSVal < 32)
4095 return SDValue();
4096
4097 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4098
// For shifts >= 32 only the low half contributes; it becomes the new high
// half and the low half is zero.
4099 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4100 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4101
4102 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4103
4104 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4105 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4106}
4107
// Combine i64 SRA by the special constants 32 and 63 into 32-bit shifts on
// the high half plus a build_vector.
// NOTE(review): the signature line (internal 4108,
// AMDGPUTargetLowering::performSraCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4109 DAGCombinerInfo &DCI) const {
4110 if (N->getValueType(0) != MVT::i64)
4111 return SDValue();
4112
4113 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4114 if (!RHS)
4115 return SDValue();
4116
4117 SelectionDAG &DAG = DCI.DAG;
4118 SDLoc SL(N);
4119 unsigned RHSVal = RHS->getZExtValue();
4120
4121 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4122 if (RHSVal == 32) {
4123 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
// New high half is the sign of the old high half.
4124 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4125 DAG.getConstant(31, SL, MVT::i32));
4126
4127 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4128 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4129 }
4130
4131 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4132 if (RHSVal == 63) {
4133 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
// Both halves become the replicated sign bit.
4134 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4135 DAG.getConstant(31, SL, MVT::i32));
4136 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4137 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4138 }
4139
4140 return SDValue();
4141}
4142
// Combine SRL by a constant: rewrite (srl (and x, mask)) to expose BFE
// patterns, and split i64 shifts by >= 32 into a 32-bit shift of the high
// half plus a zero half.
// NOTE(review): the signature line (internal 4143,
// AMDGPUTargetLowering::performSrlCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4144 DAGCombinerInfo &DCI) const {
4145 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4146 if (!RHS)
4147 return SDValue();
4148
4149 EVT VT = N->getValueType(0);
4150 SDValue LHS = N->getOperand(0);
4151 unsigned ShiftAmt = RHS->getZExtValue();
4152 SelectionDAG &DAG = DCI.DAG;
4153 SDLoc SL(N);
4154
4155 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4156 // this improves the ability to match BFE patterns in isel.
4157 if (LHS.getOpcode() == ISD::AND) {
4158 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4159 unsigned MaskIdx, MaskLen;
// Only fire when the mask is a contiguous run starting exactly at the
// shift amount, i.e. the and+srl together extract a bitfield.
4160 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4161 MaskIdx == ShiftAmt) {
4162 return DAG.getNode(
4163 ISD::AND, SL, VT,
4164 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4165 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4166 }
4167 }
4168 }
4169
4170 if (VT != MVT::i64)
4171 return SDValue();
4172
4173 if (ShiftAmt < 32)
4174 return SDValue();
4175
4176 // srl i64:x, C for C >= 32
4177 // =>
4178 // build_pair (srl hi_32(x), C - 32), 0
4179 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4180
4181 SDValue Hi = getHiHalf64(LHS, DAG);
4182
4183 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4184 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4185
4186 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4187
4188 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4189}
4190
// Combine TRUNCATE: peek through bitcasts of build_vectors to grab the
// element(s) actually selected, and shrink 64-bit shifts whose result is
// truncated to a narrow type.
// NOTE(review): the first signature line (internal 4191,
// AMDGPUTargetLowering::performTruncateCombine(...)) was lost in extraction
// — confirm against the upstream source.
4192 SDNode *N, DAGCombinerInfo &DCI) const {
4193 SDLoc SL(N);
4194 SelectionDAG &DAG = DCI.DAG;
4195 EVT VT = N->getValueType(0);
4196 SDValue Src = N->getOperand(0);
4197
4198 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4199 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4200 SDValue Vec = Src.getOperand(0);
4201 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4202 SDValue Elt0 = Vec.getOperand(0);
4203 EVT EltVT = Elt0.getValueType();
// The truncated result fits entirely inside element 0, so only that
// element matters.
4204 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4205 if (EltVT.isFloatingPoint()) {
4206 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4207 EltVT.changeTypeToInteger(), Elt0);
4208 }
4209
4210 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4211 }
4212 }
4213 }
4214
4215 // Equivalent of above for accessing the high element of a vector as an
4216 // integer operation.
4217 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4218 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4219 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4220 SDValue BV = stripBitcast(Src.getOperand(0));
4221 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4222 EVT SrcEltVT = BV.getOperand(0).getValueType();
4223 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4224 unsigned BitIndex = K->getZExtValue();
4225 unsigned PartIndex = BitIndex / SrcEltSize;
4226
// The shift amount must land exactly on an element boundary within the
// build_vector for the shift to be a pure element select.
4227 if (PartIndex * SrcEltSize == BitIndex &&
4228 PartIndex < BV.getNumOperands()) {
4229 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4230 SDValue SrcElt =
4231 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4232 BV.getOperand(PartIndex));
4233 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4234 }
4235 }
4236 }
4237 }
4238 }
4239
4240 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4241 //
4242 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4243 // i16 (trunc (srl (i32 (trunc x), K)))
4244 if (VT.getScalarSizeInBits() < 32) {
4245 EVT SrcVT = Src.getValueType();
4246 if (SrcVT.getScalarSizeInBits() > 32 &&
4247 (Src.getOpcode() == ISD::SRL ||
4248 Src.getOpcode() == ISD::SRA ||
4249 Src.getOpcode() == ISD::SHL)) {
4250 SDValue Amt = Src.getOperand(1);
4251 KnownBits Known = DAG.computeKnownBits(Amt);
4252
4253 // - For left shifts, do the transform as long as the shift
4254 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4255 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4256 // losing information stored in the high bits when truncating.
4257 const unsigned MaxCstSize =
4258 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4259 if (Known.getMaxValue().ule(MaxCstSize)) {
4260 EVT MidVT = VT.isVector() ?
4261 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4262 VT.getVectorNumElements()) : MVT::i32;
4263
4264 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4265 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4266 Src.getOperand(0));
4267 DCI.AddToWorklist(Trunc.getNode());
4268
// Re-type the shift amount for the narrower shift if needed.
4269 if (Amt.getValueType() != NewShiftVT) {
4270 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4271 DCI.AddToWorklist(Amt.getNode());
4272 }
4273
4274 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4275 Trunc, Amt);
4276 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4277 }
4278 }
4279 }
4280
4281 return SDValue();
4282}
4283
4284// We need to specifically handle i64 mul here to avoid unnecessary conversion
4285// instructions. If we only match on the legalized i64 mul expansion,
4286// SimplifyDemandedBits will be unable to remove them because there will be
4287// multiple uses due to the separate mul + mulh[su].
4288static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4289 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4290 if (Size <= 32) {
4291 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4292 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4293 }
4294
4295 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4296 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4297
4298 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4299 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4300
4301 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4302}
4303
4304/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4305/// return SDValue().
4306static SDValue getAddOneOp(const SDNode *V) {
4307 if (V->getOpcode() != ISD::ADD)
4308 return SDValue();
4309
4310 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4311}
4312
// Combine MUL: undo the InstCombine X*(Y+1) canonicalization to enable mad
// matching, and use the fast 24-bit multiply units when both operands fit in
// 24 bits.
// NOTE(review): the first signature line (internal 4313,
// AMDGPUTargetLowering::performMulCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4314 DAGCombinerInfo &DCI) const {
4315 assert(N->getOpcode() == ISD::MUL);
4316 EVT VT = N->getValueType(0);
4317
4318 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4319 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4320 // unnecessarily). isDivergent() is used as an approximation of whether the
4321 // value is in an SGPR.
4322 if (!N->isDivergent())
4323 return SDValue();
4324
4325 unsigned Size = VT.getSizeInBits();
4326 if (VT.isVector() || Size > 64)
4327 return SDValue();
4328
4329 SelectionDAG &DAG = DCI.DAG;
4330 SDLoc DL(N);
4331
4332 SDValue N0 = N->getOperand(0);
4333 SDValue N1 = N->getOperand(1);
4334
4335 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4336 // matching.
4337
4338 // mul x, (add y, 1) -> add (mul x, y), x
// Returns y when V is (add y, 1) and every user of V is a mul (so the add
// will die after the rewrite); otherwise a null SDValue.
4339 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4340 SDValue AddOp = getAddOneOp(V.getNode());
4341 if (!AddOp)
4342 return SDValue();
4343
4344 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4345 return U->getOpcode() == ISD::MUL;
4346 }))
4347 return AddOp;
4348
4349 return SDValue();
4350 };
4351
4352 // FIXME: The selection pattern is not properly checking for commuted
4353 // operands, so we have to place the mul in the LHS
4354 if (SDValue MulOper = IsFoldableAdd(N0)) {
4355 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4356 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4357 }
4358
4359 if (SDValue MulOper = IsFoldableAdd(N1)) {
4360 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4361 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4362 }
4363
4364 // There are i16 integer mul/mad.
4365 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4366 return SDValue();
4367
4368 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4369 // in the source into any_extends if the result of the mul is truncated. Since
4370 // we can assume the high bits are whatever we want, use the underlying value
4371 // to avoid the unknown high bits from interfering.
4372 if (N0.getOpcode() == ISD::ANY_EXTEND)
4373 N0 = N0.getOperand(0);
4374
4375 if (N1.getOpcode() == ISD::ANY_EXTEND)
4376 N1 = N1.getOperand(0);
4377
4378 SDValue Mul;
4379
// Prefer the unsigned 24-bit path, then the signed one.
4380 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4381 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4382 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4383 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4384 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4385 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4386 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4387 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4388 } else {
4389 return SDValue();
4390 }
4391
4392 // We need to use sext even for MUL_U24, because MUL_U24 is used
4393 // for signed multiply of 8 and 16-bit types.
4394 return DAG.getSExtOrTrunc(Mul, DL, VT);
4395}
4396
// Combine [SU]MUL_LOHI of i32: when both operands fit in 24 bits, produce the
// two result halves with a MUL_[IU]24 / MULHI_[IU]24 pair instead of one slow
// extending multiply.
// NOTE(review): the second signature line (internal 4398,
// AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4397SDValue
4399 DAGCombinerInfo &DCI) const {
4400 if (N->getValueType(0) != MVT::i32)
4401 return SDValue();
4402
4403 SelectionDAG &DAG = DCI.DAG;
4404 SDLoc DL(N);
4405
4406 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4407 SDValue N0 = N->getOperand(0);
4408 SDValue N1 = N->getOperand(1);
4409
4410 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4411 // in the source into any_extends if the result of the mul is truncated. Since
4412 // we can assume the high bits are whatever we want, use the underlying value
4413 // to avoid the unknown high bits from interfering.
4414 if (N0.getOpcode() == ISD::ANY_EXTEND)
4415 N0 = N0.getOperand(0);
4416 if (N1.getOpcode() == ISD::ANY_EXTEND)
4417 N1 = N1.getOperand(0);
4418
4419 // Try to use two fast 24-bit multiplies (one for each half of the result)
4420 // instead of one slow extending multiply.
// LoOpcode == 0 below means the 24-bit path does not apply.
4421 unsigned LoOpcode = 0;
4422 unsigned HiOpcode = 0;
4423 if (Signed) {
4424 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4425 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4426 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4427 LoOpcode = AMDGPUISD::MUL_I24;
4428 HiOpcode = AMDGPUISD::MULHI_I24;
4429 }
4430 } else {
4431 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4432 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4433 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4434 LoOpcode = AMDGPUISD::MUL_U24;
4435 HiOpcode = AMDGPUISD::MULHI_U24;
4436 }
4437 }
4438 if (!LoOpcode)
4439 return SDValue();
4440
// Replace both results of the MUL_LOHI node at once.
4441 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4442 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4443 DCI.CombineTo(N, Lo, Hi);
4444 return SDValue(N, 0);
4445}
4446
// Combine MULHS: use MULHI_I24 when both operands fit in 24 signed bits.
// NOTE(review): the signature line (internal 4447,
// AMDGPUTargetLowering::performMulhsCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4448 DAGCombinerInfo &DCI) const {
4449 EVT VT = N->getValueType(0);
4450
4451 if (!Subtarget->hasMulI24() || VT.isVector())
4452 return SDValue();
4453
4454 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4455 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4456 // unnecessarily). isDivergent() is used as an approximation of whether the
4457 // value is in an SGPR.
4458 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4459 // valu op anyway)
4460 if (Subtarget->hasSMulHi() && !N->isDivergent())
4461 return SDValue();
4462
4463 SelectionDAG &DAG = DCI.DAG;
4464 SDLoc DL(N);
4465
4466 SDValue N0 = N->getOperand(0);
4467 SDValue N1 = N->getOperand(1);
4468
4469 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4470 return SDValue();
4471
4472 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4473 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4474
4475 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4476 DCI.AddToWorklist(Mulhi.getNode());
4477 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4478}
4479
// Combine MULHU: use MULHI_U24 when both operands fit in 24 unsigned bits.
// NOTE(review): the signature line (internal 4480,
// AMDGPUTargetLowering::performMulhuCombine(SDNode *N, ...)) was lost in
// extraction — confirm against the upstream source.
4481 DAGCombinerInfo &DCI) const {
4482 EVT VT = N->getValueType(0);
4483
4484 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4485 return SDValue();
4486
4487 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4488 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4489 // unnecessarily). isDivergent() is used as an approximation of whether the
4490 // value is in an SGPR.
4491 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4492 // valu op anyway)
4493 if (Subtarget->hasSMulHi() && !N->isDivergent())
4494 return SDValue();
4495
4496 SelectionDAG &DAG = DCI.DAG;
4497 SDLoc DL(N);
4498
4499 SDValue N0 = N->getOperand(0);
4500 SDValue N1 = N->getOperand(1);
4501
4502 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4503 return SDValue();
4504
4505 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4506 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4507
4508 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4509 DCI.AddToWorklist(Mulhi.getNode());
4510 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4511}
4512
4513SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4514 SDValue Op,
4515 const SDLoc &DL,
4516 unsigned Opc) const {
4517 EVT VT = Op.getValueType();
4518 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4519 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4520 LegalVT != MVT::i16))
4521 return SDValue();
4522
4523 if (VT != MVT::i32)
4524 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4525
4526 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4527 if (VT != MVT::i32)
4528 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4529
4530 return FFBX;
4531}
4532
4533// The native instructions return -1 on 0 input. Optimize out a select that
4534// produces -1 on 0.
4535//
4536// TODO: If zero is not undef, we could also do this if the output is compared
4537// against the bitwidth.
4538//
4539// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
// NOTE(review): the first signature line (internal 4540, taking the SDLoc and
// the setcc Cond) and the two Opc-selection expressions (internal 4556/4566,
// choosing FFBH_U32 vs FFBL_U32) were lost in extraction — confirm against
// the upstream source.
4541 SDValue LHS, SDValue RHS,
4542 DAGCombinerInfo &DCI) const {
// Only handle comparisons against zero.
4543 if (!isNullConstant(Cond.getOperand(1)))
4544 return SDValue();
4545
4546 SelectionDAG &DAG = DCI.DAG;
4547 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4548 SDValue CmpLHS = Cond.getOperand(0);
4549
4550 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4551 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4552 if (CCOpcode == ISD::SETEQ &&
4553 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4554 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4555 unsigned Opc =
4557 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4558 }
4559
4560 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4561 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4562 if (CCOpcode == ISD::SETNE &&
4563 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4564 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4565 unsigned Opc =
4567
4568 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4569 }
4570
4571 return SDValue();
4572}
4573
// Hoist a unary op out of both arms of a select:
// (Op (select Cond, N1.op0, N2.op0)) replaces (select Cond, N1, N2),
// where both N1 and N2 are assumed to already be \p Op nodes.
// NOTE(review): the first signature line (internal 4574,
// static SDValue distributeOpThroughSelect(...DCI,) was lost in extraction —
// confirm against the upstream source.
4575 unsigned Op,
4576 const SDLoc &SL,
4577 SDValue Cond,
4578 SDValue N1,
4579 SDValue N2) {
4580 SelectionDAG &DAG = DCI.DAG;
4581 EVT VT = N1.getValueType();
4582
// Select between the operands of the two Op nodes, then reapply Op once.
4583 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4584 N1.getOperand(0), N2.getOperand(0));
4585 DCI.AddToWorklist(NewSelect.getNode());
4586 return DAG.getNode(Op, SL, VT, NewSelect);
4587}
4588
4589// Pull a free FP operation out of a select so it may fold into uses.
4590//
4591// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4592// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4593//
4594// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4595// select c, (fabs x), +k -> fabs (select c, x, k)
// NOTE(review): the second signature line (internal 4597) and the condition
// expressions at internal lines 4607, 4652 and 4655 were lost in extraction —
// confirm against the upstream source before editing this function.
4596SDValue
4598 SDValue N) const {
4599 SelectionDAG &DAG = DCI.DAG;
4600 SDValue Cond = N.getOperand(0);
4601 SDValue LHS = N.getOperand(1);
4602 SDValue RHS = N.getOperand(2);
4603
4604 EVT VT = N.getValueType();
// Both arms carry the same free op (fabs/fabs or fneg/fneg): hoist it.
4605 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4606 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4608 return SDValue();
4609
4610 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4611 SDLoc(N), Cond, LHS, RHS);
4612 }
4613
// Canonicalize so the fneg/fabs (if any) is on the LHS, remembering the swap.
4614 bool Inv = false;
4615 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4616 std::swap(LHS, RHS);
4617 Inv = true;
4618 }
4619
4620 // TODO: Support vector constants.
4621 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4622 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4623 !selectSupportsSourceMods(N.getNode())) {
4624 SDLoc SL(N);
4625 // If one side is an fneg/fabs and the other is a constant, we can push the
4626 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4627 SDValue NewLHS = LHS.getOperand(0);
4628 SDValue NewRHS = RHS;
4629
4630 // Careful: if the neg can be folded up, don't try to pull it back down.
4631 bool ShouldFoldNeg = true;
4632
4633 if (NewLHS.hasOneUse()) {
4634 unsigned Opc = NewLHS.getOpcode();
4635 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4636 ShouldFoldNeg = false;
4637 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4638 ShouldFoldNeg = false;
4639 }
4640
4641 if (ShouldFoldNeg) {
// fabs of a negative constant can't be represented by pushing fabs down.
4642 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4643 return SDValue();
4644
4645 // We're going to be forced to use a source modifier anyway, there's no
4646 // point to pulling the negate out unless we can get a size reduction by
4647 // negating the constant.
4648 //
4649 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4650 // about cheaper constants.
4651 if (NewLHS.getOpcode() == ISD::FABS &&
4653 return SDValue();
4654
4656 return SDValue();
4657
// For fneg, compensate by negating the constant arm.
4658 if (LHS.getOpcode() == ISD::FNEG)
4659 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4660
// Undo the earlier canonicalization swap before rebuilding the select.
4661 if (Inv)
4662 std::swap(NewLHS, NewRHS);
4663
4664 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4665 Cond, NewLHS, NewRHS);
4666 DCI.AddToWorklist(NewSelect.getNode());
4667 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4668 }
4669 }
4670
4671 return SDValue();
4672}
4673
// Combine SELECT: hoist free fneg/fabs out of the arms, move constants to the
// false input by inverting the compare, try the legacy fmin/fmax patterns,
// and finally the ctlz/cttz -1-on-zero fold.
// NOTE(review): the signature lines (internal 4673-4674) and the MinMax
// declaration heading line 4709 (internal 4708, "SDValue MinMax") were lost
// in extraction — confirm against the upstream source.
4675 DAGCombinerInfo &DCI) const {
4676 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4677 return Folded;
4678
4679 SDValue Cond = N->getOperand(0);
4680 if (Cond.getOpcode() != ISD::SETCC)
4681 return SDValue();
4682
4683 EVT VT = N->getValueType(0);
4684 SDValue LHS = Cond.getOperand(0);
4685 SDValue RHS = Cond.getOperand(1);
4686 SDValue CC = Cond.getOperand(2);
4687
4688 SDValue True = N->getOperand(1);
4689 SDValue False = N->getOperand(2);
4690
4691 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4692 SelectionDAG &DAG = DCI.DAG;
4693 if (DAG.isConstantValueOfAnyType(True) &&
4694 !DAG.isConstantValueOfAnyType(False)) {
4695 // Swap cmp + select pair to move constant to false input.
4696 // This will allow using VOPC cndmasks more often.
4697 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4698
4699 SDLoc SL(N);
4700 ISD::CondCode NewCC =
4701 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4702
4703 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4704 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4705 }
4706
4707 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4709 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4710 // Revisit this node so we can catch min3/max3/med3 patterns.
4711 //DCI.AddToWorklist(MinMax.getNode());
4712 return MinMax;
4713 }
4714 }
4715
4716 // There's no reason to not do this if the condition has other uses.
4717 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4718}
4719
4720static bool isInv2Pi(const APFloat &APF) {
4721 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4722 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4723 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4724
4725 return APF.bitwiseIsEqual(KF16) ||
4726 APF.bitwiseIsEqual(KF32) ||
4727 APF.bitwiseIsEqual(KF64);
4728}
4729
4730// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4731// additional cost to negate them.
// Classify how expensive it is to negate the FP constant \p C: negative zero
// and negative inv-2pi become cheaper (positive forms are inline immediates),
// their positive counterparts become more expensive.
// NOTE(review): the signature lines (internal 4732-4733,
// AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C))
// and the final fallthrough return (internal 4740, presumably
// NegatibleCost::Neutral) were lost in extraction — confirm upstream.
4734 if (C->isZero())
4735 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4736
4737 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4738 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4739
4741}
4742
// NOTE(review): only the fallback return of this function survives here —
// its signature and body (internal lines 4743-4745, presumably
// AMDGPUTargetLowering::isConstantCostlierToNegate checking
// getConstantNegateCost against NegatibleCost::Expensive) were lost in
// extraction. Confirm against the upstream source.
4746 return false;
4747}
4748
// NOTE(review): only the fallback return of this function survives here —
// its signature and body (internal lines 4749-4751, presumably
// AMDGPUTargetLowering::isConstantCheaperToNegate checking
// getConstantNegateCost against NegatibleCost::Cheaper) were lost in
// extraction. Confirm against the upstream source.
4752 return false;
4753}
4754
// Map a floating-point min opcode to its max counterpart and vice versa.
// Used when distributing fneg through min/max (fneg(max(x,y)) ->
// min(fneg x, fneg y)).
// NOTE(review): the cases at internal lines 4769-4772 were lost in
// extraction (the upstream switch handles additional target min/max
// opcodes there) — confirm before assuming this list is complete.
4755static unsigned inverseMinMax(unsigned Opc) {
4756 switch (Opc) {
4757 case ISD::FMAXNUM:
4758 return ISD::FMINNUM;
4759 case ISD::FMINNUM:
4760 return ISD::FMAXNUM;
4761 case ISD::FMAXNUM_IEEE:
4762 return ISD::FMINNUM_IEEE;
4763 case ISD::FMINNUM_IEEE:
4764 return ISD::FMAXNUM_IEEE;
4765 case ISD::FMAXIMUM:
4766 return ISD::FMINIMUM;
4767 case ISD::FMINIMUM:
4768 return ISD::FMAXIMUM;
4773 default:
4774 llvm_unreachable("invalid min/max opcode");
4775 }
4776}
4777
4778/// \return true if it's profitable to try to push an fneg into its source
4779/// instruction.
// NOTE(review): the signature line (internal 4780, presumably
// static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0)) and the second
// half of the multi-use condition (internal 4792) were lost in extraction —
// confirm against the upstream source.
4781 // If the input has multiple uses and we can either fold the negate down, or
4782 // the other uses cannot, give up. This both prevents unprofitable
4783 // transformations and infinite loops: we won't repeatedly try to fold around
4784 // a negate that has no 'good' form.
4785 if (N0.hasOneUse()) {
4786 // This may be able to fold into the source, but at a code size cost. Don't
4787 // fold if the fold into the user is free.
4788 if (allUsesHaveSourceMods(N, 0))
4789 return false;
4790 } else {
4791 if (fnegFoldsIntoOp(N0.getNode()) &&
4793 return false;
4794 }
4795
4796 return true;
4797}
4798
// AMDGPUTargetLowering::performFNegCombine — pushes an fneg (N) through its
// source node N0 so the negation can be absorbed, typically as a source
// modifier on the underlying instruction. Each case returns the rewritten
// value, or SDValue() to decline. When N0 has other uses, the original value
// is reconstructed as (fneg Res) and RAUW'd so those uses stay correct.
// NOTE(review): doxygen-extracted listing — the signature line (4799) and a
// few interior lines (e.g. 4838, 4894-4895, 4906, 4947-4948) are absent here.
4800 DAGCombinerInfo &DCI) const {
4801 SelectionDAG &DAG = DCI.DAG;
4802 SDValue N0 = N->getOperand(0);
4803 EVT VT = N->getValueType(0);
4804
4805 unsigned Opc = N0.getOpcode();
4806
4807 if (!shouldFoldFNegIntoSrc(N, N0))
4808 return SDValue();
4809
4810 SDLoc SL(N);
4811 switch (Opc) {
4812 case ISD::FADD: {
4813 if (!mayIgnoreSignedZero(N0))
4814 return SDValue();
4815
4816 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4817 SDValue LHS = N0.getOperand(0);
4818 SDValue RHS = N0.getOperand(1);
4819
4820 if (LHS.getOpcode() != ISD::FNEG)
4821 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4822 else
4823 LHS = LHS.getOperand(0);
4824
4825 if (RHS.getOpcode() != ISD::FNEG)
4826 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4827 else
4828 RHS = RHS.getOperand(0);
4829
4830 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4831 if (Res.getOpcode() != ISD::FADD)
4832 return SDValue(); // Op got folded away.
4833 if (!N0.hasOneUse())
4834 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4835 return Res;
4836 }
4837 case ISD::FMUL:
4839 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4840 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4841 SDValue LHS = N0.getOperand(0);
4842 SDValue RHS = N0.getOperand(1);
4843
// For a multiply, negating exactly one operand negates the product, so only
// one fneg needs to be introduced (or an existing one stripped).
4844 if (LHS.getOpcode() == ISD::FNEG)
4845 LHS = LHS.getOperand(0);
4846 else if (RHS.getOpcode() == ISD::FNEG)
4847 RHS = RHS.getOperand(0);
4848 else
4849 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4850
4851 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4852 if (Res.getOpcode() != Opc)
4853 return SDValue(); // Op got folded away.
4854 if (!N0.hasOneUse())
4855 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4856 return Res;
4857 }
4858 case ISD::FMA:
4859 case ISD::FMAD: {
4860 // TODO: handle llvm.amdgcn.fma.legacy
4861 if (!mayIgnoreSignedZero(N0))
4862 return SDValue();
4863
4864 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4865 SDValue LHS = N0.getOperand(0);
4866 SDValue MHS = N0.getOperand(1);
4867 SDValue RHS = N0.getOperand(2);
4868
4869 if (LHS.getOpcode() == ISD::FNEG)
4870 LHS = LHS.getOperand(0);
4871 else if (MHS.getOpcode() == ISD::FNEG)
4872 MHS = MHS.getOperand(0);
4873 else
4874 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4875
4876 if (RHS.getOpcode() != ISD::FNEG)
4877 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4878 else
4879 RHS = RHS.getOperand(0);
4880
4881 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4882 if (Res.getOpcode() != Opc)
4883 return SDValue(); // Op got folded away.
4884 if (!N0.hasOneUse())
4885 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4886 return Res;
4887 }
4888 case ISD::FMAXNUM:
4889 case ISD::FMINNUM:
4890 case ISD::FMAXNUM_IEEE:
4891 case ISD::FMINNUM_IEEE:
4892 case ISD::FMINIMUM:
4893 case ISD::FMAXIMUM:
// Negating a min/max flips it to the opposite operation with both operands
// negated (see inverseMinMax above).
4896 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4897 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4898 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4899 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4900
4901 SDValue LHS = N0.getOperand(0);
4902 SDValue RHS = N0.getOperand(1);
4903
4904 // 0 doesn't have a negated inline immediate.
4905 // TODO: This constant check should be generalized to other operations.
4907 return SDValue();
4908
4909 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4910 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4911 unsigned Opposite = inverseMinMax(Opc);
4912
4913 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4914 if (Res.getOpcode() != Opposite)
4915 return SDValue(); // Op got folded away.
4916 if (!N0.hasOneUse())
4917 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4918 return Res;
4919 }
4920 case AMDGPUISD::FMED3: {
// med3 is order-preserving, so negating all three inputs negates the result.
4921 SDValue Ops[3];
4922 for (unsigned I = 0; I < 3; ++I)
4923 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4924
4925 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4926 if (Res.getOpcode() != AMDGPUISD::FMED3)
4927 return SDValue(); // Op got folded away.
4928
4929 if (!N0.hasOneUse()) {
4930 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4931 DAG.ReplaceAllUsesWith(N0, Neg);
4932
4933 for (SDNode *U : Neg->users())
4934 DCI.AddToWorklist(U);
4935 }
4936
4937 return Res;
4938 }
4939 case ISD::FP_EXTEND:
4940 case ISD::FTRUNC:
4941 case ISD::FRINT:
4942 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4943 case ISD::FROUNDEVEN:
4944 case ISD::FSIN:
4945 case ISD::FCANONICALIZE:
4946 case AMDGPUISD::RCP:
4949 case AMDGPUISD::SIN_HW: {
// Unary ops that commute with negation: hoist or cancel the inner fneg.
4950 SDValue CvtSrc = N0.getOperand(0);
4951 if (CvtSrc.getOpcode() == ISD::FNEG) {
4952 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4953 // (fneg (rcp (fneg x))) -> (rcp x)
4954 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4955 }
4956
4957 if (!N0.hasOneUse())
4958 return SDValue();
4959
4960 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4961 // (fneg (rcp x)) -> (rcp (fneg x))
4962 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4963 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4964 }
4965 case ISD::FP_ROUND: {
4966 SDValue CvtSrc = N0.getOperand(0);
4967
4968 if (CvtSrc.getOpcode() == ISD::FNEG) {
4969 // (fneg (fp_round (fneg x))) -> (fp_round x)
4970 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4971 CvtSrc.getOperand(0), N0.getOperand(1));
4972 }
4973
4974 if (!N0.hasOneUse())
4975 return SDValue();
4976
4977 // (fneg (fp_round x)) -> (fp_round (fneg x))
4978 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4979 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4980 }
4981 case ISD::FP16_TO_FP: {
4982 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4983 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4984 // Put the fneg back as a legal source operation that can be matched later.
4985 SDLoc SL(N);
4986
4987 SDValue Src = N0.getOperand(0);
4988 EVT SrcVT = Src.getValueType();
4989
4990 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4991 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4992 DAG.getConstant(0x8000, SL, SrcVT));
4993 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4994 }
4995 case ISD::SELECT: {
4996 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4997 // TODO: Invert conditions of foldFreeOpFromSelect
4998 return SDValue();
4999 }
5000 case ISD::BITCAST: {
5001 SDLoc SL(N);
5002 SDValue BCSrc = N0.getOperand(0);
5003 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5004 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5005 if (HighBits.getValueType().getSizeInBits() != 32 ||
5006 !fnegFoldsIntoOp(HighBits.getNode()))
5007 return SDValue();
5008
5009 // f64 fneg only really needs to operate on the high half of of the
5010 // register, so try to force it to an f32 operation to help make use of
5011 // source modifiers.
5012 //
5013 //
5014 // fneg (f64 (bitcast (build_vector x, y))) ->
5015 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5016 // (fneg (bitcast i32:y to f32)))
5017
5018 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5019 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5020 SDValue CastBack =
5021 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5022
5023 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5024 Ops.back() = CastBack;
5025 DCI.AddToWorklist(NegHi.getNode());
5026 SDValue Build =
5027 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5028 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5029
5030 if (!N0.hasOneUse())
5031 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5032 return Result;
5033 }
5034
5035 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5036 BCSrc.hasOneUse()) {
5037 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5038 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5039
5040 // TODO: Cast back result for multiple uses is beneficial in some cases.
5041
5042 SDValue LHS =
5043 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5044 SDValue RHS =
5045 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5046
5047 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5048 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5049
5050 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5051 NegRHS);
5052 }
5053
5054 return SDValue();
5055 }
5056 default:
5057 return SDValue();
5058 }
5059}
5060
// AMDGPUTargetLowering::performFAbsCombine — folds a single-use fabs of
// FP16_TO_FP into an integer mask of the f16 sign bit, keeping the abs as an
// operation that can later be matched as a source modifier.
// NOTE(review): doxygen-extracted listing — the signature line (5061) is
// absent here.
5062 DAGCombinerInfo &DCI) const {
5063 SelectionDAG &DAG = DCI.DAG;
5064 SDValue N0 = N->getOperand(0);
5065
5066 if (!N0.hasOneUse())
5067 return SDValue();
5068
5069 switch (N0.getOpcode()) {
5070 case ISD::FP16_TO_FP: {
5071 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5072 SDLoc SL(N);
5073 SDValue Src = N0.getOperand(0);
5074 EVT SrcVT = Src.getValueType();
5075
5076 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5077 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5078 DAG.getConstant(0x7fff, SL, SrcVT));
5079 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5080 }
5081 default:
5082 return SDValue();
5083 }
5084}
5085
// AMDGPUTargetLowering::performRcpCombine — constant-folds rcp(k) for a
// constant FP operand into the exact APFloat quotient 1.0/k.
// NOTE(review): doxygen-extracted listing — the signature line (5086) is
// absent here.
5087 DAGCombinerInfo &DCI) const {
5088 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5089 if (!CFP)
5090 return SDValue();
5091
5092 // XXX - Should this flush denormals?
5093 const APFloat &Val = CFP->getValueAPF();
5094 APFloat One(Val.getSemantics(), "1.0");
5095 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5096}
5097
5099 DAGCombinerInfo &DCI) const {
5100 SelectionDAG &DAG = DCI.DAG;
5101 SDLoc DL(N);
5102
5103 switch(N->getOpcode()) {
5104 default:
5105 break;
5106 case ISD::BITCAST: {
5107 EVT DestVT = N->getValueType(0);
5108
5109 // Push casts through vector builds. This helps avoid emitting a large
5110 // number of copies when materializing floating point vector constants.
5111 //
5112 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5113 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5114 if (DestVT.isVector()) {
5115 SDValue Src = N->getOperand(0);
5116 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5119 EVT SrcVT = Src.getValueType();
5120 unsigned NElts = DestVT.getVectorNumElements();
5121
5122 if (SrcVT.getVectorNumElements() == NElts) {
5123 EVT DestEltVT = DestVT.getVectorElementType();
5124
5125 SmallVector<SDValue, 8> CastedElts;
5126 SDLoc SL(N);
5127 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5128 SDValue Elt = Src.getOperand(I);
5129 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5130 }
5131
5132 return DAG.getBuildVector(DestVT, SL, CastedElts);
5133 }
5134 }
5135 }
5136
5137 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5138 break;
5139
5140 // Fold bitcasts of constants.
5141 //
5142 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5143 // TODO: Generalize and move to DAGCombiner
5144 SDValue Src = N->getOperand(0);
5145 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5146 SDLoc SL(N);
5147 uint64_t CVal = C->getZExtValue();
5148 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5149 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5150 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5151 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5152 }
5153
5154 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5155 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5156 SDLoc SL(N);
5157 uint64_t CVal = Val.getZExtValue();
5158 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5159 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5160 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5161
5162 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5163 }
5164
5165 break;
5166 }
5167 case ISD::SHL: {
5169 break;
5170
5171 return performShlCombine(N, DCI);
5172 }
5173 case ISD::SRL: {
5175 break;
5176
5177 return performSrlCombine(N, DCI);
5178 }
5179 case ISD::SRA: {
5181 break;
5182
5183 return performSraCombine(N, DCI);
5184 }
5185 case ISD::TRUNCATE:
5186 return performTruncateCombine(N, DCI);
5187 case ISD::MUL:
5188 return performMulCombine(N, DCI);
5189 case AMDGPUISD::MUL_U24:
5190 case AMDGPUISD::MUL_I24: {
5191 if (SDValue Simplified = simplifyMul24(N, DCI))
5192 return Simplified;
5193 break;
5194 }
5197 return simplifyMul24(N, DCI);
5198 case ISD::SMUL_LOHI:
5199 case ISD::UMUL_LOHI:
5200 return performMulLoHiCombine(N, DCI);
5201 case ISD::MULHS:
5202 return performMulhsCombine(N, DCI);
5203 case ISD::MULHU:
5204 return performMulhuCombine(N, DCI);
5205 case ISD::SELECT:
5206 return performSelectCombine(N, DCI);
5207 case ISD::FNEG:
5208 return performFNegCombine(N, DCI);
5209 case ISD::FABS:
5210 return performFAbsCombine(N, DCI);
5211 case AMDGPUISD::BFE_I32:
5212 case AMDGPUISD::BFE_U32: {
5213 assert(!N->getValueType(0).isVector() &&
5214 "Vector handling of BFE not implemented");
5215 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5216 if (!Width)
5217 break;
5218
5219 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5220 if (WidthVal == 0)
5221 return DAG.getConstant(0, DL, MVT::i32);
5222
5223 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5224 if (!Offset)
5225 break;
5226
5227 SDValue BitsFrom = N->getOperand(0);
5228 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5229
5230 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5231
5232 if (OffsetVal == 0) {
5233 // This is already sign / zero extended, so try to fold away extra BFEs.
5234 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5235
5236 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5237 if (OpSignBits >= SignBits)
5238 return BitsFrom;
5239
5240 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5241 if (Signed) {
5242 // This is a sign_extend_inreg. Replace it to take advantage of existing
5243 // DAG Combines. If not eliminated, we will match back to BFE during
5244 // selection.
5245
5246 // TODO: The sext_inreg of extended types ends, although we can could
5247 // handle them in a single BFE.
5248 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5249 DAG.getValueType(SmallVT));
5250 }
5251
5252 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5253 }
5254
5255 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5256 if (Signed) {
5257 return constantFoldBFE<int32_t>(DAG,
5258 CVal->getSExtValue(),
5259 OffsetVal,
5260 WidthVal,
5261 DL);
5262 }
5263
5264 return constantFoldBFE<uint32_t>(DAG,
5265 CVal->getZExtValue(),
5266 OffsetVal,
5267 WidthVal,
5268 DL);
5269 }
5270
5271 if ((OffsetVal + WidthVal) >= 32 &&
5272 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5273 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5274 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5275 BitsFrom, ShiftVal);
5276 }
5277
5278 if (BitsFrom.hasOneUse()) {
5279 APInt Demanded = APInt::getBitsSet(32,
5280 OffsetVal,
5281 OffsetVal + WidthVal);
5282
5283 KnownBits Known;
5285 !DCI.isBeforeLegalizeOps());
5286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5287 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5288 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5289 DCI.CommitTargetLoweringOpt(TLO);
5290 }
5291 }
5292
5293 break;
5294 }
5295 case ISD::LOAD:
5296 return performLoadCombine(N, DCI);
5297 case ISD::STORE:
5298 return performStoreCombine(N, DCI);
5299 case AMDGPUISD::RCP:
5301 return performRcpCombine(N, DCI);
5302 case ISD::AssertZext:
5303 case ISD::AssertSext:
5304 return performAssertSZExtCombine(N, DCI);
5306 return performIntrinsicWOChainCombine(N, DCI);
5307 case AMDGPUISD::FMAD_FTZ: {
5308 SDValue N0 = N->getOperand(0);
5309 SDValue N1 = N->getOperand(1);
5310 SDValue N2 = N->getOperand(2);
5311 EVT VT = N->getValueType(0);
5312
5313 // FMAD_FTZ is a FMAD + flush denormals to zero.
5314 // We flush the inputs, the intermediate step, and the output.
5315 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5316 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5317 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5318 if (N0CFP && N1CFP && N2CFP) {
5319 const auto FTZ = [](const APFloat &V) {
5320 if (V.isDenormal()) {
5321 APFloat Zero(V.getSemantics(), 0);
5322 return V.isNegative() ? -Zero : Zero;
5323 }
5324 return V;
5325 };
5326
5327 APFloat V0 = FTZ(N0CFP->getValueAPF());
5328 APFloat V1 = FTZ(N1CFP->getValueAPF());
5329 APFloat V2 = FTZ(N2CFP->getValueAPF());
5331 V0 = FTZ(V0);
5333 return DAG.getConstantFP(FTZ(V0), DL, VT);
5334 }
5335 break;
5336 }
5337 }
5338 return SDValue();
5339}
5340
5341//===----------------------------------------------------------------------===//
5342// Helper functions
5343//===----------------------------------------------------------------------===//
5344
// AMDGPUTargetLowering::CreateLiveInRegister — marks a physical register Reg
// as a function live-in (creating the virtual register on first request) and
// returns it either as a raw register node (RawReg) or as a CopyFromReg off
// the entry token.
// NOTE(review): doxygen-extracted listing — the signature line (5345) and
// the MF/MRI declarations (5350-5351) are absent here.
5346 const TargetRegisterClass *RC,
5347 Register Reg, EVT VT,
5348 const SDLoc &SL,
5349 bool RawReg) const {
5352 Register VReg;
5353
5354 if (!MRI.isLiveIn(Reg)) {
5355 VReg = MRI.createVirtualRegister(RC);
5356 MRI.addLiveIn(Reg, VReg);
5357 } else {
// Already registered as a live-in: reuse the existing virtual register.
5358 VReg = MRI.getLiveInVirtReg(Reg);
5359 }
5360
5361 if (RawReg)
5362 return DAG.getRegister(VReg, VT);
5363
5364 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5365}
5366
5367// This may be called multiple times, and nothing prevents creating multiple
5368// objects at the same offset. See if we already defined this object.
// Scans existing fixed objects (negative frame indices) for one at Offset and
// returns it; otherwise creates a new immutable fixed object of Size bytes.
// NOTE(review): doxygen-extracted listing — the signature line (5369) is
// absent here.
5370 int64_t Offset) {
5371 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5372 if (MFI.getObjectOffset(I) == Offset) {
// Reusing an object at this offset only makes sense if the size matches.
5373 assert(MFI.getObjectSize(I) == Size);
5374 return I;
5375 }
5376 }
5377
5378 return MFI.CreateFixedObject(Size, Offset, true);
5379}
5380
// AMDGPUTargetLowering::loadStackInputValue — loads a value of type VT from a
// fixed stack object at Offset (created on demand), chained off the entry
// token.
// NOTE(review): doxygen-extracted listing — the signature line (5381), the MF
// declaration (5385), and the trailing getLoad arguments (5393-5394) are
// absent here.
5382 EVT VT,
5383 const SDLoc &SL,
5384 int64_t Offset) const {
5386 MachineFrameInfo &MFI = MF.getFrameInfo();
5387 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5388
5389 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5390 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5391
5392 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5395}
5396
// Presumably AMDGPUTargetLowering::storeStackInputValue — stores ArgVal into
// the argument stack area at Offset, addressed relative to the function's
// stack pointer register.
// NOTE(review): doxygen-extracted listing — the signature line (5397) and the
// Info/DstInfo declarations (5402-5404) are absent here.
5398 const SDLoc &SL,
5399 SDValue Chain,
5400 SDValue ArgVal,
5401 int64_t Offset) const {
5405
5406 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5407 // Stores to the argument stack area are relative to the stack pointer.
5408 SDValue SP =
5409 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5410 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5411 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5413 return Store;
5414}
5415
// Presumably AMDGPUTargetLowering::loadInputValue — materializes an input
// argument described by Arg: either a live-in register copy or a stack load,
// then (if the descriptor is masked) shifts and masks out the relevant
// bitfield.
// NOTE(review): doxygen-extracted listing — the signature line (5416) is
// absent here.
5417 const TargetRegisterClass *RC,
5418 EVT VT, const SDLoc &SL,
5419 const ArgDescriptor &Arg) const {
5420 assert(Arg && "Attempting to load missing argument");
5421
5422 SDValue V = Arg.isRegister() ?
5423 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5424 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5425
5426 if (!Arg.isMasked())
5427 return V;
5428
// Extract the field selected by the mask: shift its low bit down to bit 0,
// then AND with the down-shifted mask.
5429 unsigned Mask = Arg.getMask();
5430 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5431 V = DAG.getNode(ISD::SRL, SL, VT, V,
5432 DAG.getShiftAmountConstant(Shift, VT, SL));
5433 return DAG.getNode(ISD::AND, SL, VT, V,
5434 DAG.getConstant(Mask >> Shift, SL, VT));
5435}
5436
// Presumably AMDGPUTargetLowering::getImplicitParameterOffset — computes the
// byte offset of an implicit kernel parameter: the explicit kernarg segment,
// aligned for the implicit-arg pointer, plus a per-parameter offset constant.
// NOTE(review): doxygen-extracted listing — the signature line (5437) and the
// PRIVATE_BASE return line (5447) are absent here.
5438 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5439 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5440 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5441 uint64_t ArgOffset =
5442 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5443 switch (Param) {
5444 case FIRST_IMPLICIT:
5445 return ArgOffset;
5446 case PRIVATE_BASE:
5448 case SHARED_BASE:
5449 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5450 case QUEUE_PTR:
5451 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5452 }
5453 llvm_unreachable("unexpected implicit parameter type");
5454}
5455
// MachineFunction-based overload of getImplicitParameterOffset.
// NOTE(review): doxygen-extracted listing — the signature line (5456) and the
// body (5458-5459) are absent here; only the parameter list and closing brace
// survived extraction.
5457 const MachineFunction &MF, const ImplicitParameter Param) const {
5460}
5461
5462#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5463
// Maps an AMDGPUISD opcode to its printable name via the NODE_NAME_CASE
// macro defined above; returns nullptr for opcodes outside the enum.
// NOTE(review): doxygen-extracted listing — one NODE_NAME_CASE line (5559,
// between PC_ADD_REL_OFFSET and DUMMY_CHAIN) is absent here.
5464const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5465 switch ((AMDGPUISD::NodeType)Opcode) {
5466 case AMDGPUISD::FIRST_NUMBER: break;
5467 // AMDIL DAG nodes
5468 NODE_NAME_CASE(BRANCH_COND);
5469
5470 // AMDGPU DAG nodes
5471 NODE_NAME_CASE(IF)
5472 NODE_NAME_CASE(ELSE)
5473 NODE_NAME_CASE(LOOP)
5474 NODE_NAME_CASE(CALL)
5475 NODE_NAME_CASE(TC_RETURN)
5476 NODE_NAME_CASE(TC_RETURN_GFX)
5477 NODE_NAME_CASE(TC_RETURN_CHAIN)
5478 NODE_NAME_CASE(TRAP)
5479 NODE_NAME_CASE(RET_GLUE)
5480 NODE_NAME_CASE(WAVE_ADDRESS)
5481 NODE_NAME_CASE(RETURN_TO_EPILOG)
5482 NODE_NAME_CASE(ENDPGM)
5483 NODE_NAME_CASE(ENDPGM_TRAP)
5484 NODE_NAME_CASE(SIMULATED_TRAP)
5485 NODE_NAME_CASE(DWORDADDR)
5486 NODE_NAME_CASE(FRACT)
5487 NODE_NAME_CASE(SETCC)
5488 NODE_NAME_CASE(DENORM_MODE)
5489 NODE_NAME_CASE(FMA_W_CHAIN)
5490 NODE_NAME_CASE(FMUL_W_CHAIN)
5491 NODE_NAME_CASE(CLAMP)
5492 NODE_NAME_CASE(COS_HW)
5493 NODE_NAME_CASE(SIN_HW)
5494 NODE_NAME_CASE(FMAX_LEGACY)
5495 NODE_NAME_CASE(FMIN_LEGACY)
5496 NODE_NAME_CASE(FMAX3)
5497 NODE_NAME_CASE(SMAX3)
5498 NODE_NAME_CASE(UMAX3)
5499 NODE_NAME_CASE(FMIN3)
5500 NODE_NAME_CASE(SMIN3)
5501 NODE_NAME_CASE(UMIN3)
5502 NODE_NAME_CASE(FMED3)
5503 NODE_NAME_CASE(SMED3)
5504 NODE_NAME_CASE(UMED3)
5505 NODE_NAME_CASE(FMAXIMUM3)
5506 NODE_NAME_CASE(FMINIMUM3)
5507 NODE_NAME_CASE(FDOT2)
5508 NODE_NAME_CASE(URECIP)
5509 NODE_NAME_CASE(DIV_SCALE)
5510 NODE_NAME_CASE(DIV_FMAS)
5511 NODE_NAME_CASE(DIV_FIXUP)
5512 NODE_NAME_CASE(FMAD_FTZ)
5513 NODE_NAME_CASE(RCP)
5514 NODE_NAME_CASE(RSQ)
5515 NODE_NAME_CASE(RCP_LEGACY)
5516 NODE_NAME_CASE(RCP_IFLAG)
5517 NODE_NAME_CASE(LOG)
5518 NODE_NAME_CASE(EXP)
5519 NODE_NAME_CASE(FMUL_LEGACY)
5520 NODE_NAME_CASE(RSQ_CLAMP)
5521 NODE_NAME_CASE(FP_CLASS)
5522 NODE_NAME_CASE(DOT4)
5523 NODE_NAME_CASE(CARRY)
5524 NODE_NAME_CASE(BORROW)
5525 NODE_NAME_CASE(BFE_U32)
5526 NODE_NAME_CASE(BFE_I32)
5527 NODE_NAME_CASE(BFI)
5528 NODE_NAME_CASE(BFM)
5529 NODE_NAME_CASE(FFBH_U32)
5530 NODE_NAME_CASE(FFBH_I32)
5531 NODE_NAME_CASE(FFBL_B32)
5532 NODE_NAME_CASE(MUL_U24)
5533 NODE_NAME_CASE(MUL_I24)
5534 NODE_NAME_CASE(MULHI_U24)
5535 NODE_NAME_CASE(MULHI_I24)
5536 NODE_NAME_CASE(MAD_U24)
5537 NODE_NAME_CASE(MAD_I24)
5538 NODE_NAME_CASE(MAD_I64_I32)
5539 NODE_NAME_CASE(MAD_U64_U32)
5540 NODE_NAME_CASE(PERM)
5541 NODE_NAME_CASE(TEXTURE_FETCH)
5542 NODE_NAME_CASE(R600_EXPORT)
5543 NODE_NAME_CASE(CONST_ADDRESS)
5544 NODE_NAME_CASE(REGISTER_LOAD)
5545 NODE_NAME_CASE(REGISTER_STORE)
5546 NODE_NAME_CASE(CVT_F32_UBYTE0)
5547 NODE_NAME_CASE(CVT_F32_UBYTE1)
5548 NODE_NAME_CASE(CVT_F32_UBYTE2)
5549 NODE_NAME_CASE(CVT_F32_UBYTE3)
5550 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5551 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5552 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5553 NODE_NAME_CASE(CVT_PK_I16_I32)
5554 NODE_NAME_CASE(CVT_PK_U16_U32)
5555 NODE_NAME_CASE(FP_TO_FP16)
5556 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5557 NODE_NAME_CASE(CONST_DATA_PTR)
5558 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5560 NODE_NAME_CASE(DUMMY_CHAIN)
5561 NODE_NAME_CASE(LOAD_D16_HI)
5562 NODE_NAME_CASE(LOAD_D16_LO)
5563 NODE_NAME_CASE(LOAD_D16_HI_I8)
5564 NODE_NAME_CASE(LOAD_D16_HI_U8)
5565 NODE_NAME_CASE(LOAD_D16_LO_I8)
5566 NODE_NAME_CASE(LOAD_D16_LO_U8)
5567 NODE_NAME_CASE(STORE_MSKOR)
5568 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5569 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5570 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5571 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5572 NODE_NAME_CASE(DS_ORDERED_COUNT)
5573 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5574 NODE_NAME_CASE(BUFFER_LOAD)
5575 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5576 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5577 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5578 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5579 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5580 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5581 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5582 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5583 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5584 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5585 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5586 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5587 NODE_NAME_CASE(SBUFFER_LOAD)
5588 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5589 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5590 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5591 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5592 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5593 NODE_NAME_CASE(BUFFER_STORE)
5594 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5595 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5596 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5597 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5598 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5599 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5600 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5601 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5602 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5603 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5604 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5605 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5606 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5607 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5608 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5609 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5610 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5611 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5612 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5613 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5614 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5615 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5616 }
5617 return nullptr;
5618}
5619
// Presumably AMDGPUTargetLowering::getSqrtEstimate — for f32, returns the
// hardware reciprocal-square-root node (AMDGPUISD::RSQ) with zero refinement
// steps; other types decline and fall back to the generic expansion.
// NOTE(review): doxygen-extracted listing — the signature line (5620) is
// absent here.
5621 SelectionDAG &DAG, int Enabled,
5622 int &RefinementSteps,
5623 bool &UseOneConstNR,
5624 bool Reciprocal) const {
5625 EVT VT = Operand.getValueType();
5626
5627 if (VT == MVT::f32) {
5628 RefinementSteps = 0;
5629 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5630 }
5631
5632 // TODO: There is also f64 rsq instruction, but the documentation is less
5633 // clear on its precision.
5634
5635 return SDValue();
5636}
5637
// Presumably AMDGPUTargetLowering::getRecipEstimate — for f32, returns the
// hardware reciprocal node (AMDGPUISD::RCP) with zero refinement steps;
// other types decline.
// NOTE(review): doxygen-extracted listing — the signature line (5638) is
// absent here.
5639 SelectionDAG &DAG, int Enabled,
5640 int &RefinementSteps) const {
5641 EVT VT = Operand.getValueType();
5642
5643 if (VT == MVT::f32) {
5644 // Reciprocal, < 1 ulp error.
5645 //
5646 // This reciprocal approximation converges to < 0.5 ulp error with one
5647 // newton rhapson performed with two fused multiple adds (FMAs).
5648
5649 RefinementSteps = 0;
5650 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5651 }
5652
5653 // TODO: There is also f64 rcp instruction, but the documentation is less
5654 // clear on its precision.
5655
5656 return SDValue();
5657}
5658
5659static unsigned workitemIntrinsicDim(unsigned ID) {
5660 switch (ID) {
5661 case Intrinsic::amdgcn_workitem_id_x:
5662 return 0;
5663 case Intrinsic::amdgcn_workitem_id_y:
5664 return 1;
5665 case Intrinsic::amdgcn_workitem_id_z:
5666 return 2;
5667 default:
5668 llvm_unreachable("not a workitem intrinsic");
5669 }
5670}
5671
// Presumably AMDGPUTargetLowering::computeKnownBitsForTargetNode — reports
// known zero/one bits for AMDGPU-specific nodes (carry/borrow flags, BFE,
// 24-bit multiplies, PERM byte select, LDS addresses, med3/min3/max3, and
// workitem-id intrinsics).
// NOTE(review): doxygen-extracted listing — the signature line (5672) and a
// few interior lines (e.g. 5706, 5780, 5784, 5819, 5826) are absent here.
5673 const SDValue Op, KnownBits &Known,
5674 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5675
5676 Known.resetAll(); // Don't know anything.
5677
5678 unsigned Opc = Op.getOpcode();
5679
5680 switch (Opc) {
5681 default:
5682 break;
5683 case AMDGPUISD::CARRY:
5684 case AMDGPUISD::BORROW: {
// Carry/borrow results are 0 or 1: all but the lowest bit are known zero.
5685 Known.Zero = APInt::getHighBitsSet(32, 31);
5686 break;
5687 }
5688
5689 case AMDGPUISD::BFE_I32:
5690 case AMDGPUISD::BFE_U32: {
5691 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5692 if (!CWidth)
5693 return;
5694
5695 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5696
5697 if (Opc == AMDGPUISD::BFE_U32)
5698 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5699
5700 break;
5701 }
5702 case AMDGPUISD::FP_TO_FP16: {
5703 unsigned BitWidth = Known.getBitWidth();
5704
5705 // High bits are zero.
5707 break;
5708 }
5709 case AMDGPUISD::MUL_U24:
5710 case AMDGPUISD::MUL_I24: {
5711 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5712 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5713 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5714 RHSKnown.countMinTrailingZeros();
5715 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5716 // Skip extra check if all bits are known zeros.
5717 if (TrailZ >= 32)
5718 break;
5719
5720 // Truncate to 24 bits.
5721 LHSKnown = LHSKnown.trunc(24);
5722 RHSKnown = RHSKnown.trunc(24);
5723
5724 if (Opc == AMDGPUISD::MUL_I24) {
5725 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5726 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5727 unsigned MaxValBits = LHSValBits + RHSValBits;
5728 if (MaxValBits > 32)
5729 break;
5730 unsigned SignBits = 32 - MaxValBits + 1;
5731 bool LHSNegative = LHSKnown.isNegative();
5732 bool LHSNonNegative = LHSKnown.isNonNegative();
5733 bool LHSPositive = LHSKnown.isStrictlyPositive();
5734 bool RHSNegative = RHSKnown.isNegative();
5735 bool RHSNonNegative = RHSKnown.isNonNegative();
5736 bool RHSPositive = RHSKnown.isStrictlyPositive();
5737
// Sign of the product is known when both operand signs are known.
5738 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5739 Known.Zero.setHighBits(SignBits);
5740 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5741 Known.One.setHighBits(SignBits);
5742 } else {
5743 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5744 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5745 unsigned MaxValBits = LHSValBits + RHSValBits;
5746 if (MaxValBits >= 32)
5747 break;
5748 Known.Zero.setBitsFrom(MaxValBits);
5749 }
5750 break;
5751 }
5752 case AMDGPUISD::PERM: {
5753 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5754 if (!CMask)
5755 return;
5756
5757 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5758 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5759 unsigned Sel = CMask->getZExtValue();
5760
// Each selector byte picks a source byte (0-3 from RHS, 4-6 from LHS) or a
// constant 0x00/0xff; propagate the chosen byte's known bits.
5761 for (unsigned I = 0; I < 32; I += 8) {
5762 unsigned SelBits = Sel & 0xff;
5763 if (SelBits < 4) {
5764 SelBits *= 8;
5765 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5766 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5767 } else if (SelBits < 7) {
5768 SelBits = (SelBits & 3) * 8;
5769 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5770 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5771 } else if (SelBits == 0x0c) {
5772 Known.Zero |= 0xFFull << I;
5773 } else if (SelBits > 0x0c) {
5774 Known.One |= 0xFFull << I;
5775 }
5776 Sel >>= 8;
5777 }
5778 break;
5779 }
5781 Known.Zero.setHighBits(24);
5782 break;
5783 }
5785 Known.Zero.setHighBits(16);
5786 break;
5787 }
5788 case AMDGPUISD::LDS: {
5789 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5790 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5791
5792 Known.Zero.setHighBits(16);
5793 Known.Zero.setLowBits(Log2(Alignment));
5794 break;
5795 }
5796 case AMDGPUISD::SMIN3:
5797 case AMDGPUISD::SMAX3:
5798 case AMDGPUISD::SMED3:
5799 case AMDGPUISD::UMIN3:
5800 case AMDGPUISD::UMAX3:
5801 case AMDGPUISD::UMED3: {
// The result is one of the three operands, so only bits known in common
// across all three can be reported; bail early on any unknown operand.
5802 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5803 if (Known2.isUnknown())
5804 break;
5805
5806 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5807 if (Known1.isUnknown())
5808 break;
5809
5810 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5811 if (Known0.isUnknown())
5812 break;
5813
5814 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5815 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5816 Known.One = Known0.One & Known1.One & Known2.One;
5817 break;
5818 }
5820 unsigned IID = Op.getConstantOperandVal(0);
5821 switch (IID) {
5822 case Intrinsic::amdgcn_workitem_id_x:
5823 case Intrinsic::amdgcn_workitem_id_y:
5824 case Intrinsic::amdgcn_workitem_id_z: {
5825 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5827 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5828 break;
5829 }
5830 default:
5831 break;
5832 }
5833 }
5834 }
5835}
5836
// Presumably AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode — sign-bit
// counts for AMDGPU nodes: BFE (derived from the constant width), carry/
// borrow (31), sign/zero-extending buffer loads, and med3/min3/max3 (min of
// the three operands).
// NOTE(review): doxygen-extracted listing — the signature line (5837) and the
// case labels for the load nodes returning 25/17/24/16/16 (lines 5863, 5865,
// 5867, 5869, 5871) are absent here.
5838 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5839 unsigned Depth) const {
5840 switch (Op.getOpcode()) {
5841 case AMDGPUISD::BFE_I32: {
5842 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5843 if (!Width)
5844 return 1;
5845
5846 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5847 if (!isNullConstant(Op.getOperand(1)))
5848 return SignBits;
5849
5850 // TODO: Could probably figure something out with non-0 offsets.
5851 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5852 return std::max(SignBits, Op0SignBits);
5853 }
5854
5855 case AMDGPUISD::BFE_U32: {
5856 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5857 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5858 }
5859
5860 case AMDGPUISD::CARRY:
5861 case AMDGPUISD::BORROW:
5862 return 31;
5864 return 25;
5866 return 17;
5868 return 24;
5870 return 16;
5872 return 16;
5873 case AMDGPUISD::SMIN3:
5874 case AMDGPUISD::SMAX3:
5875 case AMDGPUISD::SMED3:
5876 case AMDGPUISD::UMIN3:
5877 case AMDGPUISD::UMAX3:
5878 case AMDGPUISD::UMED3: {
5879 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5880 if (Tmp2 == 1)
5881 return 1; // Early out.
5882
5883 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5884 if (Tmp1 == 1)
5885 return 1; // Early out.
5886
5887 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5888 if (Tmp0 == 1)
5889 return 1; // Early out.
5890
5891 return std::min({Tmp0, Tmp1, Tmp2});
5892 }
5893 default:
5894 return 1;
5895 }
5896}
5897
// GlobalISel counterpart of ComputeNumSignBitsForTargetNode: reports sign-bit
// lower bounds for AMDGPU generic machine instructions defining register R.
// NOTE(review): the signature's opening lines (orig. 5898-5899) were elided
// by the extraction of this chunk; code is kept byte-for-byte.
5900 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5901 unsigned Depth) const {
5902 const MachineInstr *MI = MRI.getVRegDef(R);
5903 if (!MI)
5904 return 1;
5905
5906 // TODO: Check range metadata on MMO.
5907 switch (MI->getOpcode()) {
5908 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
// Sign-extending byte load: 32 - 8 + 1 = 25 known sign bits.
5909 return 25;
5910 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
// Sign-extending short load: 32 - 16 + 1 = 17 known sign bits.
5911 return 17;
5912 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
// Zero-extending byte load: top 24 bits are zero.
5913 return 24;
5914 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
// Zero-extending short load: top 16 bits are zero.
5915 return 16;
5916 case AMDGPU::G_AMDGPU_SMED3:
5917 case AMDGPU::G_AMDGPU_UMED3: {
// med3: the result's bound is the minimum of the three sources' bounds;
// early out as soon as one source's bound degenerates to 1.
5918 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5919 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5920 if (Tmp2 == 1)
5921 return 1;
5922 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5923 if (Tmp1 == 1)
5924 return 1;
5925 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5926 if (Tmp0 == 1)
5927 return 1;
5928 return std::min({Tmp0, Tmp1, Tmp2});
5929 }
5930 default:
// Conservative default: at least one sign bit is always known.
5931 return 1;
5932 }
5933}
5934
// If SNaN is false, returns true when Op is known never to be any NaN; if
// SNaN is true, returns true when Op is known never to be a signaling NaN
// (per the TargetLowering hook contract).
// NOTE(review): this chunk is an extraction of the upstream file; the
// function's opening line and several `case` label lines were elided.
// Code is kept byte-for-byte; elision points are marked below.
5936 const SelectionDAG &DAG,
5937 bool SNaN,
5938 unsigned Depth) const {
5939 unsigned Opcode = Op.getOpcode();
5940 switch (Opcode) {
// NOTE(review): case labels elided here (orig. lines 5941-5942).
5943 if (SNaN)
5944 return true;
5945
5946 // TODO: Can check no nans on one of the operands for each one, but which
5947 // one?
5948 return false;
5949 }
// NOTE(review): case labels elided here (orig. lines 5950-5951).
5952 if (SNaN)
5953 return true;
// Binary case: NaN-free iff both operands are NaN-free.
5954 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5955 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5956 }
5957 case AMDGPUISD::FMED3:
5958 case AMDGPUISD::FMIN3:
5959 case AMDGPUISD::FMAX3:
// NOTE(review): two more case labels elided here (orig. lines 5960-5961).
5962 case AMDGPUISD::FMAD_FTZ: {
// Ternary case: NaN-free iff all three operands are NaN-free.
5963 if (SNaN)
5964 return true;
5965 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5966 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5967 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5968 }
// NOTE(review): case labels elided here (orig. lines 5969-5972); the nodes
// they name are treated as unconditionally NaN-free.
5973 return true;
5974
5975 case AMDGPUISD::RCP:
5976 case AMDGPUISD::RSQ:
// NOTE(review): one case label elided here (orig. line 5977).
5978 case AMDGPUISD::RSQ_CLAMP: {
5979 if (SNaN)
5980 return true;
5981
// A quiet-NaN guarantee for rcp/rsq would need to know the input is
// positive; without that check, conservatively answer false.
5982 // TODO: Need is known positive check.
5983 return false;
5984 }
5985 case ISD::FLDEXP:
5986 case AMDGPUISD::FRACT: {
// Unary case: NaN-free iff the single value operand is NaN-free.
5987 if (SNaN)
5988 return true;
5989 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5990 }
// NOTE(review): case labels elided here (orig. lines 5991-5993).
5994 // TODO: Refine on operands.
5995 return SNaN;
5996 case AMDGPUISD::SIN_HW:
5997 case AMDGPUISD::COS_HW: {
5998 // TODO: Need check for infinity
5999 return SNaN;
6000 }
// NOTE(review): the case label was elided here (orig. line 6001); operand 0
// below is read as an intrinsic ID, so this arm presumably handles
// ISD::INTRINSIC_WO_CHAIN -- confirm against upstream.
6002 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6003 // TODO: Handle more intrinsics
6004 switch (IntrinsicID) {
6005 case Intrinsic::amdgcn_cubeid:
// cubeid is treated as never producing a NaN.
6006 return true;
6007
6008 case Intrinsic::amdgcn_frexp_mant: {
// Operand 0 is the intrinsic ID; operand 1 is the value input.
6009 if (SNaN)
6010 return true;
6011 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6012 }
6013 case Intrinsic::amdgcn_cvt_pkrtz: {
6014 if (SNaN)
6015 return true;
6016 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6017 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6018 }
6019 case Intrinsic::amdgcn_rcp:
6020 case Intrinsic::amdgcn_rsq:
6021 case Intrinsic::amdgcn_rcp_legacy:
6022 case Intrinsic::amdgcn_rsq_legacy:
6023 case Intrinsic::amdgcn_rsq_clamp: {
// Same caveat as the RCP/RSQ node cases above.
6024 if (SNaN)
6025 return true;
6026
6027 // TODO: Need is known positive check.
6028 return false;
6029 }
6030 case Intrinsic::amdgcn_trig_preop:
6031 case Intrinsic::amdgcn_fdot2:
6032 // TODO: Refine on operand
6033 return SNaN;
6034 case Intrinsic::amdgcn_fma_legacy:
6035 if (SNaN)
6036 return true;
6037 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6038 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6039 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6040 default:
// Unknown intrinsics are conservatively assumed able to produce NaN.
6041 return false;
6042 }
6043 }
6044 default:
// Unknown opcodes are conservatively assumed able to produce NaN.
6045 return false;
6046 }
6047}
6048
// GlobalISel reassociation-profitability hook: returns true only when N0 has
// exactly one non-debug use, so the combine does not duplicate computation.
// NOTE(review): the signature's opening line (orig. 6049) was elided by the
// extraction of this chunk; code is kept byte-for-byte.
6050 Register N0, Register N1) const {
6051 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6052}
unsigned const MachineRegisterInfo * MRI
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:299
#define LLVM_READONLY
Definition: Compiler.h:306
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1410
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1183
const fltSemantics & getSemantics() const
Definition: APFloat.h:1453
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1201
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1160
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1100
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:898
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:71
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:81
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1320
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1110
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1131
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1135
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1316
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1120
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1176
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1287
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1173
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1618
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1598
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
MaybeAlign getAlign(const Function &F, unsigned Index)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
static cl::opt< int > CostThreshold("sbvec-cost-threshold", cl::init(0), cl::Hidden, cl::desc("Vectorization cost threshold."))
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1540
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:257
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:258
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:472
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:414
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:142
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:303
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:100
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:234
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:288
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:137
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:106
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:97
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:261
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...