// Doc-site extraction header (was: "LLVM 22.0.0git / AMDGPUISelLowering.cpp /
// Go to the documentation of this file.") — not part of the original source
// file; commented out so this text does not read as C++ code.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
53}
54
56 // In order for this to be a signed 24-bit value, bit 23, must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather then generating calls to memset, mempcy or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
373
374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
378 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
380 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
381
382 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
383 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
384
386
387 // For R600, this is totally unsupported, just custom lower to produce an
388 // error.
390
391 // Library functions. These default to Expand, but we have instructions
392 // for them.
395 {MVT::f16, MVT::f32}, Legal);
397
399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
401 {MVT::f16, MVT::f32, MVT::f64}, Expand);
402
405 Custom);
406
407 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
408
409 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
410
411 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
412 Expand);
413
414 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
415
416 if (Subtarget->has16BitInsts()) {
417 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
419 } else {
420 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
422 }
423
425 Custom);
426
427 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
428 if (Subtarget->has16BitInsts()) {
430 }
431
432 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
433 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
434 // default unless marked custom/legal.
436 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
437 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
438 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
439 MVT::v16f64},
440 Custom);
441
442 if (isTypeLegal(MVT::f16))
444 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
445 Custom);
446
447 // Expand to fneg + fadd.
449
451 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
452 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
453 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
454 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
455 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
456 Custom);
457
460 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
461 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
462 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
463 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
464 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
465 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
466 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
467 Custom);
468
470 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
471
472 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
473 for (MVT VT : ScalarIntVTs) {
474 // These should use [SU]DIVREM, so set them to expand
476 Expand);
477
478 // GPU does not have divrem function for signed or unsigned.
480
481 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
483
485
486 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
488 }
489
490 // The hardware supports 32-bit FSHR, but not FSHL.
492
493 // The hardware supports 32-bit ROTR, but not ROTL.
494 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
496
498
502 MVT::i64, Custom);
504
506 Legal);
507
510 MVT::i64, Custom);
511
512 for (auto VT : {MVT::i8, MVT::i16})
514
515 static const MVT::SimpleValueType VectorIntTypes[] = {
516 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
517 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
518
519 for (MVT VT : VectorIntTypes) {
520 // Expand the following operations for the current type by default.
533 VT, Expand);
534 }
535
536 static const MVT::SimpleValueType FloatVectorTypes[] = {
537 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
538 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
539
540 for (MVT VT : FloatVectorTypes) {
553 VT, Expand);
554 }
555
556 // This causes using an unrolled select operation rather than expansion with
557 // bit operations. This is in general better, but the alternative using BFI
558 // instructions may be better if the select sources are SGPRs.
560 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
561
563 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
564
566 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
567
569 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
570
572 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
582
584 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
585
587 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
588
590 setJumpIsExpensive(true);
591
594
596
597 // We want to find all load dependencies for long chains of stores to enable
598 // merging into very wide vectors. The problem is with vectors with > 4
599 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
600 // vectors are a legal type, even though we have to split the loads
601 // usually. When we can more precisely specify load legality per address
602 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
603 // smarter so that they can figure out what to do in 2 iterations without all
604 // N > 4 stores on the same chain.
606
607 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
608 // about these during lowering.
609 MaxStoresPerMemcpy = 0xffffffff;
610 MaxStoresPerMemmove = 0xffffffff;
611 MaxStoresPerMemset = 0xffffffff;
612
613 // The expansion for 64-bit division is enormous.
615 addBypassSlowDiv(64, 32);
616
627
631}
632
// NOTE(review): the signature line (original line 633) was dropped by the
// doc-site extraction; this appears to be
// AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const — confirm
// against upstream. Returns true when -0.0 may be treated as +0.0 for this
// op: either globally (NoSignedZerosFPMath) or via the node's fast-math flag.
634 if (getTargetMachine().Options.NoSignedZerosFPMath)
635 return true;
636
637 const auto Flags = Op.getNode()->getFlags();
638 if (Flags.hasNoSignedZeros())
639 return true;
640
641 return false;
642}
643
644//===----------------------------------------------------------------------===//
645// Target Information
646//===----------------------------------------------------------------------===//
647
// Returns true when an fneg can be folded into a node with this opcode as a
// source modifier (the listed FP ops accept neg modifiers on AMDGPU).
// NOTE(review): the doc-site extraction dropped original lines 658-659 and
// 670, 672-677 (additional IEEE min/max and AMDGPUISD::* cases) from this
// switch — restore them from upstream before relying on this listing.
649static bool fnegFoldsIntoOpcode(unsigned Opc) {
650 switch (Opc) {
651 case ISD::FADD:
652 case ISD::FSUB:
653 case ISD::FMUL:
654 case ISD::FMA:
655 case ISD::FMAD:
656 case ISD::FMINNUM:
657 case ISD::FMAXNUM:
660 case ISD::FMINIMUM:
661 case ISD::FMAXIMUM:
662 case ISD::FMINIMUMNUM:
663 case ISD::FMAXIMUMNUM:
664 case ISD::SELECT:
665 case ISD::FSIN:
666 case ISD::FTRUNC:
667 case ISD::FRINT:
668 case ISD::FNEARBYINT:
669 case ISD::FROUNDEVEN:
671 case AMDGPUISD::RCP:
678 case AMDGPUISD::FMED3:
679 // TODO: handle llvm.amdgcn.fma.legacy
680 return true;
681 case ISD::BITCAST:
// BITCAST is handled separately in fnegFoldsIntoOp(); reaching here is a bug.
682 llvm_unreachable("bitcast is special cased");
683 default:
684 return false;
685 }
686}
687
688static bool fnegFoldsIntoOp(const SDNode *N) {
689 unsigned Opc = N->getOpcode();
690 if (Opc == ISD::BITCAST) {
691 // TODO: Is there a benefit to checking the conditions performFNegCombine
692 // does? We don't for the other cases.
693 SDValue BCSrc = N->getOperand(0);
694 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
695 return BCSrc.getNumOperands() == 2 &&
696 BCSrc.getOperand(1).getValueSizeInBits() == 32;
697 }
698
699 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
700 }
701
702 return fnegFoldsIntoOpcode(Opc);
703}
704
705/// \p returns true if the operation will definitely need to use a 64-bit
706/// encoding, and thus will use a VOP3 encoding regardless of the source
707/// modifiers.
709static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
710 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
711 VT == MVT::f64;
712}
713
714/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
715/// type for ISD::SELECT.
717static bool selectSupportsSourceMods(const SDNode *N) {
718 // TODO: Only applies if select will be vector
719 return N->getValueType(0) == MVT::f32;
720}
721
722// Most FP instructions support source modifiers, but this could be refined
723// slightly.
// Most FP instructions support fabs/fneg source modifiers; this lists the
// exceptions that do NOT.
// NOTE(review): the doc-site extraction dropped original lines 734-736
// (likely the ISD::INLINEASM_BR case and its `return false;`), line 743
// (likely `case ISD::INTRINSIC_WO_CHAIN: {`), and line 756 (likely
// `return selectSupportsSourceMods(N);`) — restore from upstream.
725static bool hasSourceMods(const SDNode *N) {
// Memory operations never consume source modifiers.
726 if (isa<MemSDNode>(N))
727 return false;
728
729 switch (N->getOpcode()) {
730 case ISD::CopyToReg:
731 case ISD::FDIV:
732 case ISD::FREM:
733 case ISD::INLINEASM:
737
738 // TODO: Should really be looking at the users of the bitcast. These are
739 // problematic because bitcasts are used to legalize all stores to integer
740 // types.
741 case ISD::BITCAST:
742 return false;
744 switch (N->getConstantOperandVal(0)) {
// The interp intrinsics do not accept source modifiers on their inputs.
745 case Intrinsic::amdgcn_interp_p1:
746 case Intrinsic::amdgcn_interp_p2:
747 case Intrinsic::amdgcn_interp_mov:
748 case Intrinsic::amdgcn_interp_p1_f16:
749 case Intrinsic::amdgcn_interp_p2_f16:
750 return false;
751 default:
752 return true;
753 }
754 }
755 case ISD::SELECT:
757 default:
758 return true;
759 }
760}
761
// NOTE(review): the first line of the signature (original line 762) was
// dropped by the doc-site extraction; this appears to be
// allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold) — confirm
// against upstream. Returns true when folding a modifier into every user of
// \p N is free, i.e. at most CostThreshold users would grow to VOP3 encoding.
763 unsigned CostThreshold) {
764 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
765 // it is truly free to use a source modifier in all cases. If there are
766 // multiple users but for each one will necessitate using VOP3, there will be
767 // a code size increase. Try to avoid increasing code size unless we know it
768 // will save on the instruction count.
769 unsigned NumMayIncreaseSize = 0;
770 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
771
772 assert(!N->use_empty());
773
774 // XXX - Should this limit number of uses to check?
775 for (const SDNode *U : N->users()) {
// Any single user that cannot take source modifiers defeats the fold.
776 if (!hasSourceMods(U))
777 return false;
778
// Users not already forced into VOP3 would grow by the modifier; count them.
779 if (!opMustUseVOP3Encoding(U, VT)) {
780 if (++NumMayIncreaseSize > CostThreshold)
781 return false;
782 }
783 }
784
785 return true;
786}
787
// NOTE(review): the first line of the signature (original line 788) was
// dropped by the doc-site extraction; this appears to be
// AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, ...)
// — confirm against upstream. Rounds a scalar return type up to i32 or a
// multiple-of-32-bit integer type.
789 ISD::NodeType ExtendKind) const {
790 assert(!VT.isVector() && "only scalar expected");
791
792 // Round to the next multiple of 32-bits.
793 unsigned Size = VT.getSizeInBits();
794 if (Size <= 32)
795 return MVT::i32;
796 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
797}
798
800 return 32;
801}
802
804 return true;
805}
806
807// The backend supports 32 and 64 bit floating point immediates.
808// FIXME: Why are we reporting vectors of FP immediates as legal?
// NOTE(review): the first line of the signature (original line 809) was
// dropped by the doc-site extraction; presumably
// AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, ...) —
// confirm against upstream. f16 is only legal with 16-bit instructions.
810 bool ForCodeSize) const {
811 EVT ScalarVT = VT.getScalarType();
812 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
813 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
814}
815
816// We don't want to shrink f64 / f32 constants.
818 EVT ScalarVT = VT.getScalarType();
819 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
820}
821
// NOTE(review): the doc-site extraction dropped the first signature line
// (original line 822, presumably
// `bool AMDGPUTargetLowering::shouldReduceLoadWidth(`), plus lines 845-846
// and 849 inside the scalar-load condition below — the condition as shown is
// incomplete; restore those lines from upstream before editing this logic.
823 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
824 std::optional<unsigned> ByteOffset) const {
825 // TODO: This may be worth removing. Check regression tests for diffs.
826 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
827 return false;
828
829 unsigned NewSize = NewVT.getStoreSizeInBits();
830
831 // If we are reducing to a 32-bit load or a smaller multi-dword load,
832 // this is always better.
833 if (NewSize >= 32)
834 return true;
835
836 EVT OldVT = N->getValueType(0);
837 unsigned OldSize = OldVT.getStoreSizeInBits();
838
839 MemSDNode *MN = cast<MemSDNode>(N);
840 unsigned AS = MN->getAddressSpace();
841 // Do not shrink an aligned scalar load to sub-dword.
842 // Scalar engine cannot do sub-dword loads.
843 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
844 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
847 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
848 MN->isInvariant())) &&
850 return false;
851
852 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
853 // extloads, so doing one requires using a buffer_load. In cases where we
854 // still couldn't use a scalar load, using the wider load shouldn't really
855 // hurt anything.
856
857 // If the old size already had to be an extload, there's no harm in continuing
858 // to reduce the width.
859 return (OldSize < 32);
860}
861
// NOTE(review): the doc-site extraction dropped the first signature line
// (original line 862) and line 878 (presumably the start of the
// `return allowsMisalignedMemoryAccesses(` call whose continuation appears
// below) — restore from upstream. Decides whether folding a bitcast into a
// load of LoadTy-as-CastTy is profitable.
863 const SelectionDAG &DAG,
864 const MachineMemOperand &MMO) const {
865
866 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
867
// i32 scalar loads are already in the canonical form; nothing to gain.
868 if (LoadTy.getScalarType() == MVT::i32)
869 return false;
870
871 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
872 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
873
// Don't cast down to sub-dword element types.
874 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
875 return false;
876
877 unsigned Fast = 0;
879 CastTy, MMO, &Fast) &&
880 Fast;
881}
882
883// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
884// profitable with the expansion for 64-bit since it's generally good to
885// speculate things.
887 return true;
888}
889
891 return true;
892}
893
// NOTE(review): the doc-site extraction dropped the signature (original line
// 894, presumably `bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const
// SDNode *N) const {`) and the interior lines 899, 901, 903, 905, 909 — the
// two intrinsic cases and the constant-address-space check below are missing
// their opening/consequent lines; restore from upstream before editing.
895 switch (N->getOpcode()) {
896 case ISD::EntryToken:
897 case ISD::TokenFactor:
898 return true;
900 unsigned IntrID = N->getConstantOperandVal(0);
902 }
904 unsigned IntrID = N->getConstantOperandVal(1);
906 }
907 case ISD::LOAD:
908 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
910 return true;
911 return false;
912 case AMDGPUISD::SETCC: // ballot-style instruction
913 return true;
914 }
915 return false;
916}
917
// NOTE(review): the first line of the signature (original line 918,
// presumably `SDValue AMDGPUTargetLowering::getNegatedExpression(`) was
// dropped by the doc-site extraction — confirm against upstream.
// Produces a negated form of Op when profitable; otherwise defers to the
// generic TargetLowering implementation.
919 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
920 NegatibleCost &Cost, unsigned Depth) const {
921
922 switch (Op.getOpcode()) {
923 case ISD::FMA:
924 case ISD::FMAD: {
925 // Negating a fma is not free if it has users without source mods.
926 if (!allUsesHaveSourceMods(Op.getNode()))
927 return SDValue();
928 break;
929 }
930 case AMDGPUISD::RCP: {
// rcp(-x) == -rcp(x): push the negation through to the source operand.
931 SDValue Src = Op.getOperand(0);
932 EVT VT = Op.getValueType();
933 SDLoc SL(Op);
934
935 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
936 ForCodeSize, Cost, Depth + 1);
937 if (NegSrc)
938 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
939 return SDValue();
940 }
941 default:
942 break;
943 }
944
945 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
946 ForCodeSize, Cost, Depth);
947}
948
949//===---------------------------------------------------------------------===//
950// Target Properties
951//===---------------------------------------------------------------------===//
952
955
956 // Packed operations do not have a fabs modifier.
957 return VT == MVT::f32 || VT == MVT::f64 ||
958 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
959}
960
963 // Report this based on the end legalized type.
964 VT = VT.getScalarType();
965 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
966}
967
969 unsigned NumElem,
970 unsigned AS) const {
971 return true;
972}
973
975 // There are few operations which truly have vector input operands. Any vector
976 // operation is going to involve operations on each component, and a
977 // build_vector will be a copy per element, so it always makes sense to use a
978 // build_vector input in place of the extracted element to avoid a copy into a
979 // super register.
980 //
981 // We should probably only do this if all users are extracts only, but this
982 // should be the common case.
983 return true;
984}
985
// NOTE(review): the signature (original line 986, presumably
// `bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {`)
// was dropped by the doc-site extraction — confirm against upstream.
987 // Truncate is just accessing a subregister.
988
989 unsigned SrcSize = Source.getSizeInBits();
990 unsigned DestSize = Dest.getSizeInBits();
991
// Free when the destination is a whole 32-bit subregister of the source.
992 return DestSize < SrcSize && DestSize % 32 == 0 ;
993}
994
// NOTE(review): the signature (original line 995, presumably the IR-type
// overload `bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type
// *Dest) const {`) was dropped by the doc-site extraction — confirm upstream.
996 // Truncate is just accessing a subregister.
997
998 unsigned SrcSize = Source->getScalarSizeInBits();
999 unsigned DestSize = Dest->getScalarSizeInBits();
1000
// With 16-bit instructions, truncating a >=32-bit value to 16 bits is free.
1001 if (DestSize== 16 && Subtarget->has16BitInsts())
1002 return SrcSize >= 32;
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1006
// NOTE(review): the signature (original line 1007, presumably
// `bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {`)
// was dropped by the doc-site extraction — confirm against upstream.
1008 unsigned SrcSize = Src->getScalarSizeInBits();
1009 unsigned DestSize = Dest->getScalarSizeInBits();
1010
// With 16-bit instructions, extending i16 to a 32-bit-or-wider type is free.
1011 if (SrcSize == 16 && Subtarget->has16BitInsts())
1012 return DestSize >= 32;
1013
// i32 -> i64 is free: the high half is just a zeroed second register.
1014 return SrcSize == 32 && DestSize == 64;
1015}
1016
// NOTE(review): the signature (original line 1017, presumably the EVT
// overload `bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {`)
// was dropped by the doc-site extraction — confirm against upstream.
1018 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1019 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1020 // this will enable reducing 64-bit operations the 32-bit, which is always
1021 // good.
1022
1023 if (Src == MVT::i16)
1024 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1025
1026 return Src == MVT::i32 && Dest == MVT::i64;
1027}
1028
// NOTE(review): the first line of the signature (original line 1029,
// presumably `bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N,
// EVT SrcVT,`) was dropped by the doc-site extraction — confirm upstream.
// Decides whether shrinking the result type of \p N from SrcVT to DestVT is
// worthwhile.
1030 EVT DestVT) const {
1031 switch (N->getOpcode()) {
1032 case ISD::ADD:
1033 case ISD::SUB:
1034 case ISD::SHL:
1035 case ISD::SRL:
1036 case ISD::SRA:
1037 case ISD::AND:
1038 case ISD::OR:
1039 case ISD::XOR:
1040 case ISD::MUL:
1041 case ISD::SETCC:
1042 case ISD::SELECT:
1043 case ISD::SMIN:
1044 case ISD::SMAX:
1045 case ISD::UMIN:
1046 case ISD::UMAX:
1047 if (Subtarget->has16BitInsts() &&
1048 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1049 // Don't narrow back down to i16 if promoted to i32 already.
1050 if (!N->isDivergent() && DestVT.isInteger() &&
1051 DestVT.getScalarSizeInBits() > 1 &&
1052 DestVT.getScalarSizeInBits() <= 16 &&
1053 SrcVT.getScalarSizeInBits() > 16) {
1054 return false;
1055 }
1056 }
1057 return true;
1058 default:
1059 break;
1060 }
1061
1062 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1063 // limited number of native 64-bit operations. Shrinking an operation to fit
1064 // in a single 32-bit register should always be helpful. As currently used,
1065 // this is much less general than the name suggests, and is only used in
1066 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1067 // not profitable, and may actually be harmful.
1068 if (isa<LoadSDNode>(N))
1069 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1070
1071 return true;
1072}
1073
// NOTE(review): the doc-site extraction dropped the first signature line
// (original line 1074) and line 1090 — the condition below at line 1091 is
// missing its opening `if (...` line (likely a CombineLevel check); restore
// from upstream before editing this logic.
1075 const SDNode* N, CombineLevel Level) const {
1076 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1077 N->getOpcode() == ISD::SRL) &&
1078 "Expected shift op");
1079
1080 SDValue ShiftLHS = N->getOperand(0);
1081 if (!ShiftLHS->hasOneUse())
1082 return false;
1083
1084 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1085 !ShiftLHS.getOperand(0)->hasOneUse())
1086 return false;
1087
1088 // Always commute pre-type legalization and right shifts.
1089 // We're looking for shl(or(x,y),z) patterns.
1091 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1092 return true;
1093
1094 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1095 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1096 (N->user_begin()->getOpcode() == ISD::SRA ||
1097 N->user_begin()->getOpcode() == ISD::SRL))
1098 return false;
1099
1100 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1101 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1102 if (LHS.getOpcode() != ISD::SHL)
1103 return false;
1104 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1105 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1106 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1107 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1108 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1109 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1110 };
// Check the OR's operands in both orders, since the pattern is symmetric.
1111 SDValue LHS = N->getOperand(0).getOperand(0);
1112 SDValue RHS = N->getOperand(0).getOperand(1);
1113 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1114}
1115
1116//===---------------------------------------------------------------------===//
1117// TargetLowering Callbacks
1118//===---------------------------------------------------------------------===//
1119
// Maps a calling convention to the CC analysis function used for calls.
// NOTE(review): the doc-site extraction dropped the first signature line
// (original line 1120) and the case labels at original lines 1123-1129,
// 1131-1132, and 1138-1139 — each `return` below is missing its preceding
// `case CallingConv::...:` labels; restore from upstream before editing.
1121 bool IsVarArg) {
1122 switch (CC) {
1130 return CC_AMDGPU;
1133 return CC_AMDGPU_CS_CHAIN;
1134 case CallingConv::C:
1135 case CallingConv::Fast:
1136 case CallingConv::Cold:
1137 return CC_AMDGPU_Func;
1140 return CC_SI_Gfx;
1143 default:
1144 reportFatalUsageError("unsupported calling convention for call");
1145 }
1146}
1147
// Maps a calling convention to the CC analysis function used for returns.
// NOTE(review): the doc-site extraction dropped the first signature line
// (original line 1148) and the case labels at original lines 1151-1152,
// 1154-1162, and 1164-1165 — each `return`/`llvm_unreachable` below is
// missing its preceding `case CallingConv::...:` labels; restore upstream.
1149 bool IsVarArg) {
1150 switch (CC) {
1153 llvm_unreachable("kernels should not be handled here");
1163 return RetCC_SI_Shader;
1166 return RetCC_SI_Gfx;
1167 case CallingConv::C:
1168 case CallingConv::Fast:
1169 case CallingConv::Cold:
1170 return RetCC_AMDGPU_Func;
1171 default:
1172 reportFatalUsageError("unsupported calling convention");
1173 }
1174}
1175
1176/// The SelectionDAGBuilder will automatically promote function arguments
1177/// with illegal types. However, this does not work for the AMDGPU targets
1178/// since the function arguments are stored in memory as these illegal types.
1179/// In order to handle this properly we need to get the original types sizes
1180 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1181/// passing them to AnalyzeFormalArguments()
1182
1183/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1184/// input values across multiple registers. Each item in the Ins array
1185/// represents a single value that will be stored in registers. Ins[x].VT is
1186/// the value type of the value that will be stored in the register, so
1187/// whatever SDNode we lower the argument to needs to be this type.
1188///
1189/// In order to correctly lower the arguments we need to know the size of each
1190/// argument. Since Ins[x].VT gives us the size of the register that will
1191/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1192/// for the original function argument so that we can deduce the correct memory
1193/// type to use for Ins[x]. In most cases the correct memory type will be
1194/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1195/// we have a kernel argument of type v8i8, this argument will be split into
1196/// 8 parts and each part will be represented by its own item in the Ins array.
1197/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1198/// the argument before it was split. From this, we deduce that the memory type
1199/// for each individual part is i8. We pass the memory type as LocVT to the
1200/// calling convention analysis function and the register type (Ins[x].VT) as
1201/// the ValVT.
// Recompute in-memory offsets and memory types for each formal argument and
// record them as custom-mem locations in `State`. See the long comment above
// for why Ins[x].PartOffset cannot be trusted here.
1203 CCState &State,
1204 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1205 const MachineFunction &MF = State.getMachineFunction();
1206 const Function &Fn = MF.getFunction();
1207 LLVMContext &Ctx = Fn.getParent()->getContext();
1208 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1209 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1211
1212 Align MaxAlign = Align(1);
1213 uint64_t ExplicitArgOffset = 0;
1214 const DataLayout &DL = Fn.getDataLayout();
1215
// Running index into the flattened Ins/location list across all arguments.
1216 unsigned InIndex = 0;
1217
1218 for (const Argument &Arg : Fn.args()) {
// byref kernel arguments are laid out with the pointee type, not the pointer.
1219 const bool IsByRef = Arg.hasByRefAttr();
1220 Type *BaseArgTy = Arg.getType();
1221 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1222 Align Alignment = DL.getValueOrABITypeAlignment(
1223 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1224 MaxAlign = std::max(Alignment, MaxAlign);
1225 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1226
// ArgOffset is where this argument starts; ExplicitArgOffset advances past it.
1227 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1228 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1229
1230 // We're basically throwing away everything passed into us and starting over
1231 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1232 // to us as computed in Ins.
1233 //
1234 // We also need to figure out what type legalization is trying to do to get
1235 // the correct memory offsets.
1236
1237 SmallVector<EVT, 16> ValueVTs;
1239 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1240
1241 for (unsigned Value = 0, NumValues = ValueVTs.size();
1242 Value != NumValues; ++Value) {
1243 uint64_t BasePartOffset = Offsets[Value];
1244
1245 EVT ArgVT = ValueVTs[Value];
1246 EVT MemVT = ArgVT;
1247 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1248 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1249
// Deduce the memory type (MemVT) from how legalization splits ArgVT.
1250 if (NumRegs == 1) {
1251 // This argument is not split, so the IR type is the memory type.
1252 if (ArgVT.isExtended()) {
1253 // We have an extended type, like i24, so we should just use the
1254 // register type.
1255 MemVT = RegisterVT;
1256 } else {
1257 MemVT = ArgVT;
1258 }
1259 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1260 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1261 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1262 // We have a vector value which has been split into a vector with
1263 // the same scalar type, but fewer elements. This should handle
1264 // all the floating-point vector types.
1265 MemVT = RegisterVT;
1266 } else if (ArgVT.isVector() &&
1267 ArgVT.getVectorNumElements() == NumRegs) {
1268 // This arg has been split so that each element is stored in a separate
1269 // register.
1270 MemVT = ArgVT.getScalarType();
1271 } else if (ArgVT.isExtended()) {
1272 // We have an extended type, like i65.
1273 MemVT = RegisterVT;
1274 } else {
1275 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1276 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1277 if (RegisterVT.isInteger()) {
1278 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1279 } else if (RegisterVT.isVector()) {
1280 assert(!RegisterVT.getScalarType().isFloatingPoint());
1281 unsigned NumElements = RegisterVT.getVectorNumElements();
1282 assert(MemoryBits % NumElements == 0);
1283 // This vector type has been split into another vector type with
1284 // a different element size.
1285 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1286 MemoryBits / NumElements);
1287 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1288 } else {
1289 llvm_unreachable("cannot deduce memory type.");
1290 }
1291 }
1292
1293 // Convert one element vectors to scalar.
1294 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1295 MemVT = MemVT.getScalarType();
1296
1297 // Round up vec3/vec5 argument.
1298 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1299 MemVT = MemVT.getPow2VectorType(State.getContext());
1300 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1301 MemVT = MemVT.getRoundIntegerType(State.getContext());
1302 }
1303
// Emit one location per register part, each offset by the part's store size.
1304 unsigned PartOffset = 0;
1305 for (unsigned i = 0; i != NumRegs; ++i) {
1306 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1307 BasePartOffset + PartOffset,
1308 MemVT.getSimpleVT(),
1310 PartOffset += MemVT.getStoreSize();
1311 }
1312 }
1313 }
1314}
1315
1317 SDValue Chain, CallingConv::ID CallConv,
1318 bool isVarArg,
1320 const SmallVectorImpl<SDValue> &OutVals,
1321 const SDLoc &DL, SelectionDAG &DAG) const {
1322 // FIXME: Fails for r600 tests
1323 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1324 // "wave terminate should not have return values");
1325 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1326}
1327
1328//===---------------------------------------------------------------------===//
1329// Target specific lowering
1330//===---------------------------------------------------------------------===//
1331
/// Selects the correct CCAssignFn for a given CallingConvention value.
// Thin delegation to the shared GlobalISel/SDAG implementation.
1334 bool IsVarArg) {
1335 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1336}
1337
// Return-value counterpart of CCAssignFnForCall; also shared with GlobalISel.
1339 bool IsVarArg) {
1340 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1341}
1342
// Build a TokenFactor chaining in every stack-argument load whose frame slot
// overlaps the byte range of ClobberedFI, so those loads are ordered before
// the store that clobbers the slot (used when lowering tail calls).
1344 SelectionDAG &DAG,
1345 MachineFrameInfo &MFI,
1346 int ClobberedFI) const {
1347 SmallVector<SDValue, 8> ArgChains;
1348 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1349 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1350
1351 // Include the original chain at the beginning of the list. When this is
1352 // used by target LowerCall hooks, this helps legalize find the
1353 // CALLSEQ_BEGIN node.
1354 ArgChains.push_back(Chain);
1355
1356 // Add a chain value for each stack argument corresponding
// Only fixed (negative-index) frame objects are incoming-argument slots.
1357 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1358 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1359 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1360 if (FI->getIndex() < 0) {
1361 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1362 int64_t InLastByte = InFirstByte;
1363 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1364
// Interval-overlap test between [InFirstByte, InLastByte] and
// [FirstByte, LastByte]; chain in the load's output chain (value #1).
1365 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1366 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1367 ArgChains.push_back(SDValue(L, 1));
1368 }
1369 }
1370 }
1371 }
1372
1373 // Build a tokenfactor for all the chains.
1374 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1375}
1376
// Diagnose a call the target cannot lower: emit a DiagnosticInfoUnsupported
// naming the callee, fill InVals with poison so lowering can proceed, and
// return the entry chain.
1379 StringRef Reason) const {
1380 SDValue Callee = CLI.Callee;
1381 SelectionDAG &DAG = CLI.DAG;
1382
1383 const Function &Fn = DAG.getMachineFunction().getFunction();
1384
1385 StringRef FuncName("<unknown>");
1386
// Recover a printable callee name from either symbol or global-address nodes.
1387 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1388 FuncName = G->getSymbol();
1389 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1390 FuncName = G->getGlobal()->getName();
1391
1392 DAG.getContext()->diagnose(
1393 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1394
// Tail calls produce no values in this function, so only fill Ins otherwise.
1395 if (!CLI.IsTailCall) {
1396 for (ISD::InputArg &Arg : CLI.Ins)
1397 InVals.push_back(DAG.getPOISON(Arg.VT));
1398 }
1399
1400 return DAG.getEntryNode();
1401}
1402
// Base-class LowerCall: calls are unsupported here; subtargets that support
// calls override this.
1404 SmallVectorImpl<SDValue> &InVals) const {
1405 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1406}
1407
// Dynamic stack allocation is unsupported: diagnose and return {0, chain} so
// legalization can continue.
1409 SelectionDAG &DAG) const {
1410 const Function &Fn = DAG.getMachineFunction().getFunction();
1411
1413 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
// Result 0 is a zero pointer value; result 1 forwards the input chain.
1414 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1415 return DAG.getMergeValues(Ops, SDLoc());
1416}
1417
// Central custom-lowering dispatch for operations this common AMDGPU
// lowering handles; unknown opcodes are a hard error.
1419 SelectionDAG &DAG) const {
1420 switch (Op.getOpcode()) {
1421 default:
1422 Op->print(errs(), &DAG);
1423 llvm_unreachable("Custom lowering code for this "
1424 "instruction is not implemented yet!");
1425 break;
1427 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1429 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1430 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1431 case ISD::FREM: return LowerFREM(Op, DAG);
1432 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1433 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1434 case ISD::FRINT: return LowerFRINT(Op, DAG);
1435 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1436 case ISD::FROUNDEVEN:
1437 return LowerFROUNDEVEN(Op, DAG);
1438 case ISD::FROUND: return LowerFROUND(Op, DAG);
1439 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1440 case ISD::FLOG2:
1441 return LowerFLOG2(Op, DAG);
1442 case ISD::FLOG:
1443 case ISD::FLOG10:
1444 return LowerFLOGCommon(Op, DAG);
1445 case ISD::FEXP:
1446 case ISD::FEXP10:
1447 return lowerFEXP(Op, DAG);
1448 case ISD::FEXP2:
1449 return lowerFEXP2(Op, DAG);
1450 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1451 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1452 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1453 case ISD::FP_TO_SINT:
1454 case ISD::FP_TO_UINT:
1455 return LowerFP_TO_INT(Op, DAG);
1456 case ISD::CTTZ:
1458 case ISD::CTLZ:
1460 return LowerCTLZ_CTTZ(Op, DAG);
1462 }
// Unreachable in practice: every case above returns or aborts.
1463 return Op;
1464}
1465
// Replace results of nodes with illegal result types. Each case pushes the
// lowered value onto Results only when the helper succeeds; doing nothing
// lets default legalization handle the node.
1468 SelectionDAG &DAG) const {
1469 switch (N->getOpcode()) {
1471 // Different parts of legalization seem to interpret which type of
1472 // sign_extend_inreg is the one to check for custom lowering. The extended
1473 // from type is what really matters, but some places check for custom
1474 // lowering of the result type. This results in trying to use
1475 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1476 // nothing here and let the illegal result integer be handled normally.
1477 return;
1478 case ISD::FLOG2:
1479 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1480 Results.push_back(Lowered);
1481 return;
1482 case ISD::FLOG:
1483 case ISD::FLOG10:
1484 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1485 Results.push_back(Lowered);
1486 return;
1487 case ISD::FEXP2:
1488 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1489 Results.push_back(Lowered);
1490 return;
1491 case ISD::FEXP:
1492 case ISD::FEXP10:
1493 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1494 Results.push_back(Lowered);
1495 return;
1496 case ISD::CTLZ:
1498 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1499 Results.push_back(Lowered);
1500 return;
1501 default:
1502 return;
1503 }
1504}
1505
// Lower a global address. Handles LDS/region globals (including named
// barriers) by materializing their assigned offset/address as a constant;
// other address spaces return SDValue() for default handling.
1507 SDValue Op,
1508 SelectionDAG &DAG) const {
1509
1510 const DataLayout &DL = DAG.getDataLayout();
1511 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1512 const GlobalValue *GV = G->getGlobal();
1513
// Non-module-entry functions: use the absolute address already assigned to
// the global (named barriers also record their barrier count).
1514 if (!MFI->isModuleEntryFunction()) {
1515 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1516 if (std::optional<uint32_t> Address =
1518 if (IsNamedBarrier) {
// Each named barrier occupies 16 bytes; derive the count from the type size.
1519 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1520 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1521 }
1522 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1523 } else if (IsNamedBarrier) {
1524 llvm_unreachable("named barrier should have an assigned address");
1525 }
1526 }
1527
1528 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1529 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1530 if (!MFI->isModuleEntryFunction() &&
1531 GV->getName() != "llvm.amdgcn.module.lds" &&
1532 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
1533 SDLoc DL(Op);
1534 const Function &Fn = DAG.getMachineFunction().getFunction();
1536 Fn, "local memory global used by non-kernel function",
1537 DL.getDebugLoc(), DS_Warning));
1538
1539 // We currently don't have a way to correctly allocate LDS objects that
1540 // aren't directly associated with a kernel. We do force inlining of
1541 // functions that use local objects. However, if these dead functions are
1542 // not eliminated, we don't want a compile time error. Just emit a warning
1543 // and a trap, since there should be no callable path here.
1544 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1545 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1546 Trap, DAG.getRoot());
1547 DAG.setRoot(OutputChain);
1548 return DAG.getPOISON(Op.getValueType());
1549 }
1550
1551 // XXX: What does the value of G->getOffset() mean?
1552 assert(G->getOffset() == 0 &&
1553 "Do not know what to do with an non-zero offset");
1554
1555 // TODO: We could emit code to handle the initialization somewhere.
1556 // We ignore the initializer for now and legalize it to allow selection.
1557 // The initializer will anyway get errored out during assembly emission.
1558 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1559 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1560 }
1561 return SDValue();
1562}
1563
// Lower CONCAT_VECTORS. For sub-32-bit element vectors whose operands are a
// whole number of 32-bit words, bitcast the operands to i32 (vectors) and
// concatenate those instead; otherwise build the result element-by-element.
1565 SelectionDAG &DAG) const {
1567 SDLoc SL(Op);
1568
1569 EVT VT = Op.getValueType();
1570 if (VT.getVectorElementType().getSizeInBits() < 32) {
1571 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1572 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1573 unsigned NewNumElt = OpBitSize / 32;
1574 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1576 MVT::i32, NewNumElt);
// Collect each operand's 32-bit pieces into Args.
1577 for (const SDUse &U : Op->ops()) {
1578 SDValue In = U.get();
1579 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1580 if (NewNumElt > 1)
1581 DAG.ExtractVectorElements(NewIn, Args);
1582 else
1583 Args.push_back(NewIn);
1584 }
1585
1586 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1587 NewNumElt * Op.getNumOperands());
1588 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1589 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1590 }
1591 }
1592
// Fallback: scalarize all operands and rebuild the vector.
1593 for (const SDUse &U : Op->ops())
1594 DAG.ExtractVectorElements(U.get(), Args);
1595
1596 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1597}
1598
// Lower EXTRACT_SUBVECTOR. For 16-bit elements at an even start index,
// extract whole 32-bit registers via an i32 bitcast; otherwise extract the
// scalar elements directly.
1600 SelectionDAG &DAG) const {
1601 SDLoc SL(Op);
1603 unsigned Start = Op.getConstantOperandVal(1);
1604 EVT VT = Op.getValueType();
1605 EVT SrcVT = Op.getOperand(0).getValueType();
1606
1607 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1608 unsigned NumElt = VT.getVectorNumElements();
1609 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1610 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1611
1612 // Extract 32-bit registers at a time.
1613 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1614 EVT NewVT = NumElt == 2
1615 ? MVT::i32
1616 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1617 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1618
1619 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1620 if (NumElt == 2)
1621 Tmp = Args[0];
1622 else
1623 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1624
1625 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1626 }
1627
1628 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1630
1631 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1632}
1633
// TODO: Handle fabs too
// Strip a single outer fneg, if present, returning its operand.
1636 if (Val.getOpcode() == ISD::FNEG)
1637 return Val.getOperand(0);
1638
1639 return Val;
1640}
1641
// Strip, in order, at most one each of fneg, fabs, and fcopysign to reach
// the underlying magnitude value.
1643 if (Val.getOpcode() == ISD::FNEG)
1644 Val = Val.getOperand(0);
1645 if (Val.getOpcode() == ISD::FABS)
1646 Val = Val.getOperand(0);
1647 if (Val.getOpcode() == ISD::FCOPYSIGN)
1648 Val = Val.getOperand(0);
1649 return Val;
1650}
1651
// Fold select(setcc(lhs, rhs), true, false) into FMIN_LEGACY/FMAX_LEGACY
// when the select operands match the compare operands. Operand order is
// chosen per condition code to reproduce the hardware's NaN behavior.
1653 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1654 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1655 SelectionDAG &DAG = DCI.DAG;
1656 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1657 switch (CCOpcode) {
// Equality-style and ordered/unordered-only compares don't map to min/max.
1658 case ISD::SETOEQ:
1659 case ISD::SETONE:
1660 case ISD::SETUNE:
1661 case ISD::SETNE:
1662 case ISD::SETUEQ:
1663 case ISD::SETEQ:
1664 case ISD::SETFALSE:
1665 case ISD::SETFALSE2:
1666 case ISD::SETTRUE:
1667 case ISD::SETTRUE2:
1668 case ISD::SETUO:
1669 case ISD::SETO:
1670 break;
1671 case ISD::SETULE:
1672 case ISD::SETULT: {
1673 if (LHS == True)
1674 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1675 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1676 }
1677 case ISD::SETOLE:
1678 case ISD::SETOLT:
1679 case ISD::SETLE:
1680 case ISD::SETLT: {
1681 // Ordered. Assume ordered for undefined.
1682
1683 // Only do this after legalization to avoid interfering with other combines
1684 // which might occur.
1686 !DCI.isCalledByLegalizer())
1687 return SDValue();
1688
1689 // We need to permute the operands to get the correct NaN behavior. The
1690 // selected operand is the second one based on the failing compare with NaN,
1691 // so permute it based on the compare type the hardware uses.
1692 if (LHS == True)
1693 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1694 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1695 }
1696 case ISD::SETUGE:
1697 case ISD::SETUGT: {
1698 if (LHS == True)
1699 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1700 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1701 }
1702 case ISD::SETGT:
1703 case ISD::SETGE:
1704 case ISD::SETOGE:
1705 case ISD::SETOGT: {
1707 !DCI.isCalledByLegalizer())
1708 return SDValue();
1709
1710 if (LHS == True)
1711 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1713 }
1714 case ISD::SETCC_INVALID:
1715 llvm_unreachable("Invalid setcc condcode!");
1716 }
1717 return SDValue();
1718}
1719
/// Generate Min/Max node
// Wrapper around combineFMinMaxLegacyImpl: try the direct match first, then
// attempt to undo a pushed-through fneg (see comment below) so the pattern
// can still form, negating the combined result.
1722 SDValue LHS, SDValue RHS,
1723 SDValue True, SDValue False,
1724 SDValue CC,
1725 DAGCombinerInfo &DCI) const {
1726 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1727 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1728
1729 SelectionDAG &DAG = DCI.DAG;
1730
1731 // If we can't directly match this, try to see if we can fold an fneg to
1732 // match.
1733
1734 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1735 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1736 SDValue NegTrue = peekFNeg(True);
1737
1738 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1739 // fmin/fmax.
1740 //
1741 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1742 // -> fneg (fmin_legacy lhs, K)
1743 //
1744 // TODO: Use getNegatedExpression
1745 if (LHS == NegTrue && CFalse && CRHS) {
1746 APFloat NegRHS = neg(CRHS->getValueAPF());
1747 if (NegRHS == CFalse->getValueAPF()) {
1748 SDValue Combined =
1749 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1750 if (Combined)
1751 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1752 return SDValue();
1753 }
1754 }
1755
1756 return SDValue();
1757}
1758
// Split a 64-bit value into its low and high 32-bit halves via a v2i32
// bitcast; returns {Lo, Hi}.
std::pair<SDValue, SDValue>
1761 SDLoc SL(Op);
1762
1763 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1764
1765 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1766 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1767
1768 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1769 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1770
1771 return std::pair(Lo, Hi);
1772}
1773
// Extract only the low 32-bit half of a 64-bit value (element 0 of the
// v2i32 bitcast).
1775 SDLoc SL(Op);
1776
1777 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1778 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1780}
1781
// Extract only the high 32-bit half of a 64-bit value (element 1 of the
// v2i32 bitcast).
1783 SDLoc SL(Op);
1784
1785 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1786 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1787 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1788}
1789
// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
1795 EVT LoVT, HiVT;
1796 EVT EltVT = VT.getVectorElementType();
1797 unsigned NumElts = VT.getVectorNumElements();
// Low half takes the power-of-two ceiling of half the elements.
1798 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1799 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1800 HiVT = NumElts - LoNumElts == 1
1801 ? EltVT
1802 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1803 return std::pair(LoVT, HiVT);
1804}
1805
// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
1810 const EVT &LoVT, const EVT &HiVT,
1811 SelectionDAG &DAG) const {
1812 EVT VT = N.getValueType();
1814 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1815 VT.getVectorNumElements() &&
1816 "More vector elements requested than available!");
1818 DAG.getVectorIdxConstant(0, DL));
1819
1820 unsigned LoNumElts = LoVT.getVectorNumElements();
1821
1822 if (HiVT.isVector()) {
1823 unsigned HiNumElts = HiVT.getVectorNumElements();
1824 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1825 // Avoid creating an extract_subvector with an index that isn't a multiple
1826 // of the result type.
1828 DAG.getConstant(LoNumElts, DL, MVT::i32));
1829 return {Lo, Hi};
1830 }
1831
// Misaligned split: gather the high elements individually and rebuild.
1833 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1834 /*Count=*/HiNumElts);
1835 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1836 return {Lo, Hi};
1837 }
1838
// Scalar high part: extract the single trailing element.
1840 DAG.getVectorIdxConstant(LoNumElts, DL));
1841 return {Lo, Hi};
1842}
1843
// Split a vector load into two loads of the lo/hi sub-types produced by
// getSplitDestVTs, then rejoin the values and chains.
1845 SelectionDAG &DAG) const {
1846 LoadSDNode *Load = cast<LoadSDNode>(Op);
1847 EVT VT = Op.getValueType();
1848 SDLoc SL(Op);
1849
1850
1851 // If this is a 2 element vector, we really want to scalarize and not create
1852 // weird 1 element vectors.
1853 if (VT.getVectorNumElements() == 2) {
1854 SDValue Ops[2];
1855 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1856 return DAG.getMergeValues(Ops, SL);
1857 }
1858
1859 SDValue BasePtr = Load->getBasePtr();
1860 EVT MemVT = Load->getMemoryVT();
1861
1862 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1863
1864 EVT LoVT, HiVT;
1865 EVT LoMemVT, HiMemVT;
1866 SDValue Lo, Hi;
1867
1868 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1869 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1870 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1871
// The hi half's alignment is whatever the base alignment guarantees at
// offset Size.
1872 unsigned Size = LoMemVT.getStoreSize();
1873 Align BaseAlign = Load->getAlign();
1874 Align HiAlign = commonAlignment(BaseAlign, Size);
1875
1876 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1877 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1878 BaseAlign, Load->getMemOperand()->getFlags());
1879 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1880 SDValue HiLoad =
1881 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1882 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1883 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1884
1885 SDValue Join;
1886 if (LoVT == HiVT) {
1887 // This is the case that the vector is power of two so was evenly split.
1888 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1889 } else {
// Uneven split: insert both pieces into a poison vector of the full type.
1890 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1891 DAG.getVectorIdxConstant(0, SL));
1892 Join = DAG.getNode(
1894 VT, Join, HiLoad,
1896 }
1897
1898 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1899 LoLoad.getValue(1), HiLoad.getValue(1))};
1900
1901 return DAG.getMergeValues(Ops, SL);
1902}
1903
// Widen a vec3 load to vec4 when the access is known safe (8-byte aligned,
// or 16 bytes dereferenceable); otherwise fall back to splitting the load.
1905 SelectionDAG &DAG) const {
1906 LoadSDNode *Load = cast<LoadSDNode>(Op);
1907 EVT VT = Op.getValueType();
1908 SDValue BasePtr = Load->getBasePtr();
1909 EVT MemVT = Load->getMemoryVT();
1910 SDLoc SL(Op);
1911 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1912 Align BaseAlign = Load->getAlign();
1913 unsigned NumElements = MemVT.getVectorNumElements();
1914
1915 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1916 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1917 if (NumElements != 3 ||
1918 (BaseAlign < Align(8) &&
1919 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1920 return SplitVectorLoad(Op, DAG);
1921
1922 assert(NumElements == 3);
1923
1924 EVT WideVT =
1926 EVT WideMemVT =
// Load 4 elements, then extract the original 3-element subvector.
1928 SDValue WideLoad = DAG.getExtLoad(
1929 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1930 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1931 return DAG.getMergeValues(
1932 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1933 DAG.getVectorIdxConstant(0, SL)),
1934 WideLoad.getValue(1)},
1935 SL);
1936}
1937
// Split a vector store into two truncating stores of the lo/hi halves,
// mirroring SplitVectorLoad, and token-factor the resulting chains.
1939 SelectionDAG &DAG) const {
1940 StoreSDNode *Store = cast<StoreSDNode>(Op);
1941 SDValue Val = Store->getValue();
1942 EVT VT = Val.getValueType();
1943
1944 // If this is a 2 element vector, we really want to scalarize and not create
1945 // weird 1 element vectors.
1946 if (VT.getVectorNumElements() == 2)
1947 return scalarizeVectorStore(Store, DAG);
1948
1949 EVT MemVT = Store->getMemoryVT();
1950 SDValue Chain = Store->getChain();
1951 SDValue BasePtr = Store->getBasePtr();
1952 SDLoc SL(Op);
1953
1954 EVT LoVT, HiVT;
1955 EVT LoMemVT, HiMemVT;
1956 SDValue Lo, Hi;
1957
1958 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1959 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1960 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1961
1962 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1963
1964 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1965 Align BaseAlign = Store->getAlign();
// Alignment of the hi store is the base alignment reduced at offset Size.
1966 unsigned Size = LoMemVT.getStoreSize();
1967 Align HiAlign = commonAlignment(BaseAlign, Size);
1968
1969 SDValue LoStore =
1970 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1971 Store->getMemOperand()->getFlags());
1972 SDValue HiStore =
1973 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1974 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1975
1976 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1977}
1978
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
// Returns the merged {Div, Rem} values, or SDValue() when either operand may
// exceed 24 significant bits (fewer than 9 sign/zero bits known).
1983 bool Sign) const {
1984 SDLoc DL(Op);
1985 EVT VT = Op.getValueType();
1986 SDValue LHS = Op.getOperand(0);
1987 SDValue RHS = Op.getOperand(1);
1988 MVT IntVT = MVT::i32;
1989 MVT FltVT = MVT::f32;
1990
// Require >= 9 known sign bits on each operand so the value fits in 24 bits.
1991 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1992 if (LHSSignBits < 9)
1993 return SDValue();
1994
1995 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1996 if (RHSSignBits < 9)
1997 return SDValue();
1998
1999 unsigned BitSize = VT.getSizeInBits();
2000 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2001 unsigned DivBits = BitSize - SignBits;
2002 if (Sign)
2003 ++DivBits;
2004
2008 SDValue jq = DAG.getConstant(1, DL, IntVT);
2009
2010 if (Sign) {
2011 // char|short jq = ia ^ ib;
2012 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2013
2014 // jq = jq >> (bitsize - 2)
2015 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2016 DAG.getConstant(BitSize - 2, DL, VT));
2017
2018 // jq = jq | 0x1
2019 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2020 }
2021
2022 // int ia = (int)LHS;
2023 SDValue ia = LHS;
2024
2025 // int ib, (int)RHS;
2026 SDValue ib = RHS;
2027
2028 // float fa = (float)ia;
2029 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2030
2031 // float fb = (float)ib;
2032 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2033
2034 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2035 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2036
2037 // fq = trunc(fq);
2038 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2039
2040 // float fqneg = -fq;
2041 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2042
2045 bool UseFmadFtz = false;
2046 if (Subtarget->isGCN()) {
2048 UseFmadFtz =
2050 }
2051
2052 // float fr = mad(fqneg, fb, fa);
// Pick FMA when MAD/MAC is unavailable; otherwise the FTZ or plain FMAD form
// depending on denormal mode.
2053 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2054 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2056 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2057
2058 // int iq = (int)fq;
2059 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2060
2061 // fr = fabs(fr);
2062 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2063
2064 // fb = fabs(fb);
2065 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2066
2067 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2068
2069 // int cv = fr >= fb;
2070 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2071
2072 // jq = (cv ? jq : 0);
2073 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2074
2075 // dst = iq + jq;
2076 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2077
2078 // Rem needs compensation, it's easier to recompute it
2079 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2080 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2081
2082 // Truncate to number of bits this divide really is.
2083 if (Sign) {
2084 SDValue InRegSize
2085 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2086 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2087 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2088 } else {
2089 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2090 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2091 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2092 }
2093
2094 return DAG.getMergeValues({ Div, Rem }, DL);
2095}
2096
2098 SelectionDAG &DAG,
2100 SDLoc DL(Op);
2101 EVT VT = Op.getValueType();
2102
2103 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2104
2105 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2106
2107 SDValue One = DAG.getConstant(1, DL, HalfVT);
2108 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2109
2110 //HiLo split
2111 SDValue LHS_Lo, LHS_Hi;
2112 SDValue LHS = Op.getOperand(0);
2113 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2114
2115 SDValue RHS_Lo, RHS_Hi;
2116 SDValue RHS = Op.getOperand(1);
2117 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2118
2119 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2121
2122 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2123 LHS_Lo, RHS_Lo);
2124
2125 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2126 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2127
2128 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2129 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2130 return;
2131 }
2132
2133 if (isTypeLegal(MVT::i64)) {
2134 // The algorithm here is based on ideas from "Software Integer Division",
2135 // Tom Rodeheffer, August 2008.
2136
2139
2140 // Compute denominator reciprocal.
2141 unsigned FMAD =
2142 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2145 : (unsigned)AMDGPUISD::FMAD_FTZ;
2146
2147 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2148 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2149 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2150 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2151 Cvt_Lo);
2152 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2153 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2154 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2155 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2156 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2157 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2158 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2159 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2160 Mul1);
2161 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2162 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2163 SDValue Rcp64 = DAG.getBitcast(VT,
2164 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2165
2166 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2167 SDValue One64 = DAG.getConstant(1, DL, VT);
2168 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2169 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2170
2171 // First round of UNR (Unsigned integer Newton-Raphson).
2172 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2173 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2174 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2175 SDValue Mulhi1_Lo, Mulhi1_Hi;
2176 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2177 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2178 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2179 Mulhi1_Lo, Zero1);
2180 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2181 Mulhi1_Hi, Add1_Lo.getValue(1));
2182 SDValue Add1 = DAG.getBitcast(VT,
2183 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2184
2185 // Second round of UNR.
2186 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2187 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2188 SDValue Mulhi2_Lo, Mulhi2_Hi;
2189 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2190 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2191 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2192 Mulhi2_Lo, Zero1);
2193 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2194 Mulhi2_Hi, Add2_Lo.getValue(1));
2195 SDValue Add2 = DAG.getBitcast(VT,
2196 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2197
2198 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2199
2200 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2201
2202 SDValue Mul3_Lo, Mul3_Hi;
2203 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2204 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2205 Mul3_Lo, Zero1);
2206 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2207 Mul3_Hi, Sub1_Lo.getValue(1));
2208 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2209 SDValue Sub1 = DAG.getBitcast(VT,
2210 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2211
2212 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2213 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2214 ISD::SETUGE);
2215 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2216 ISD::SETUGE);
2217 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2218
2219 // TODO: Here and below portions of the code can be enclosed into if/endif.
2220 // Currently control flow is unconditional and we have 4 selects after
2221 // potential endif to substitute PHIs.
2222
2223 // if C3 != 0 ...
2224 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2225 RHS_Lo, Zero1);
2226 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2227 RHS_Hi, Sub1_Lo.getValue(1));
2228 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2229 Zero, Sub2_Lo.getValue(1));
2230 SDValue Sub2 = DAG.getBitcast(VT,
2231 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2232
2233 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2234
2235 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2236 ISD::SETUGE);
2237 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2238 ISD::SETUGE);
2239 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2240
2241 // if (C6 != 0)
2242 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2243
2244 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2245 RHS_Lo, Zero1);
2246 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2247 RHS_Hi, Sub2_Lo.getValue(1));
2248 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2249 Zero, Sub3_Lo.getValue(1));
2250 SDValue Sub3 = DAG.getBitcast(VT,
2251 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2252
2253 // endif C6
2254 // endif C3
2255
2256 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2257 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2258
2259 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2260 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2261
2262 Results.push_back(Div);
2263 Results.push_back(Rem);
2264
2265 return;
2266 }
2267
2268 // r600 expansion.
2269 // Get Speculative values
2270 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2271 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2272
2273 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2274 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2275 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2276
2277 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2278 SDValue DIV_Lo = Zero;
2279
2280 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2281
2282 for (unsigned i = 0; i < halfBitWidth; ++i) {
2283 const unsigned bitPos = halfBitWidth - i - 1;
2284 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2285 // Get value of high bit
2286 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2287 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2288 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2289
2290 // Shift
2291 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2292 // Add LHS high bit
2293 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2294
2295 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2296 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2297
2298 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2299
2300 // Update REM
2301 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2302 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2303 }
2304
2305 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2306 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2307 Results.push_back(DIV);
2308 Results.push_back(REM);
2309}
2310
2312 SelectionDAG &DAG) const {
2313 SDLoc DL(Op);
2314 EVT VT = Op.getValueType();
2315
2316 if (VT == MVT::i64) {
2318 LowerUDIVREM64(Op, DAG, Results);
2319 return DAG.getMergeValues(Results, DL);
2320 }
2321
2322 if (VT == MVT::i32) {
2323 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2324 return Res;
2325 }
2326
2327 SDValue X = Op.getOperand(0);
2328 SDValue Y = Op.getOperand(1);
2329
2330 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2331 // algorithm used here.
2332
2333 // Initial estimate of inv(y).
2334 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2335
2336 // One round of UNR.
2337 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2338 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2339 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2340 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2341
2342 // Quotient/remainder estimate.
2343 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2344 SDValue R =
2345 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2346
2347 // First quotient/remainder refinement.
2348 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2349 SDValue One = DAG.getConstant(1, DL, VT);
2350 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2351 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2352 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2353 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2354 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2355
2356 // Second quotient/remainder refinement.
2357 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2358 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2359 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2360 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2361 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2362
2363 return DAG.getMergeValues({Q, R}, DL);
2364}
2365
2367 SelectionDAG &DAG) const {
2368 SDLoc DL(Op);
2369 EVT VT = Op.getValueType();
2370
2371 SDValue LHS = Op.getOperand(0);
2372 SDValue RHS = Op.getOperand(1);
2373
2374 SDValue Zero = DAG.getConstant(0, DL, VT);
2375 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2376
2377 if (VT == MVT::i32) {
2378 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2379 return Res;
2380 }
2381
2382 if (VT == MVT::i64 &&
2383 DAG.ComputeNumSignBits(LHS) > 32 &&
2384 DAG.ComputeNumSignBits(RHS) > 32) {
2385 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2386
2387 //HiLo split
2388 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2389 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2390 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2391 LHS_Lo, RHS_Lo);
2392 SDValue Res[2] = {
2393 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2394 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2395 };
2396 return DAG.getMergeValues(Res, DL);
2397 }
2398
2399 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2400 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2401 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2402 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2403
2404 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2405 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2406
2407 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2408 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2409
2410 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2411 SDValue Rem = Div.getValue(1);
2412
2413 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2414 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2415
2416 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2417 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2418
2419 SDValue Res[2] = {
2420 Div,
2421 Rem
2422 };
2423 return DAG.getMergeValues(Res, DL);
2424}
2425
2426// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2428 SDLoc SL(Op);
2429 EVT VT = Op.getValueType();
2430 auto Flags = Op->getFlags();
2431 SDValue X = Op.getOperand(0);
2432 SDValue Y = Op.getOperand(1);
2433
2434 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2435 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2436 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2437 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2438 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2439}
2440
2442 SDLoc SL(Op);
2443 SDValue Src = Op.getOperand(0);
2444
2445 // result = trunc(src)
2446 // if (src > 0.0 && src != result)
2447 // result += 1.0
2448
2449 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2450
2451 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2452 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2453
2454 EVT SetCCVT =
2455 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2456
2457 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2458 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2459 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2460
2461 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2462 // TODO: Should this propagate fast-math-flags?
2463 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2464}
2465
2467 SelectionDAG &DAG) {
2468 const unsigned FractBits = 52;
2469 const unsigned ExpBits = 11;
2470
2471 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2472 Hi,
2473 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2474 DAG.getConstant(ExpBits, SL, MVT::i32));
2475 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2476 DAG.getConstant(1023, SL, MVT::i32));
2477
2478 return Exp;
2479}
2480
2482 SDLoc SL(Op);
2483 SDValue Src = Op.getOperand(0);
2484
2485 assert(Op.getValueType() == MVT::f64);
2486
2487 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2488
2489 // Extract the upper half, since this is where we will find the sign and
2490 // exponent.
2491 SDValue Hi = getHiHalf64(Src, DAG);
2492
2493 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2494
2495 const unsigned FractBits = 52;
2496
2497 // Extract the sign bit.
2498 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2499 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2500
2501 // Extend back to 64-bits.
2502 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2503 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2504
2505 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2506 const SDValue FractMask
2507 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2508
2509 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2510 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2511 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2512
2513 EVT SetCCVT =
2514 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2515
2516 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2517
2518 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2519 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2520
2521 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2522 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2523
2524 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2525}
2526
2528 SelectionDAG &DAG) const {
2529 SDLoc SL(Op);
2530 SDValue Src = Op.getOperand(0);
2531
2532 assert(Op.getValueType() == MVT::f64);
2533
2534 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2535 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2536 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2537
2538 // TODO: Should this propagate fast-math-flags?
2539
2540 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2541 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2542
2543 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2544
2545 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2546 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2547
2548 EVT SetCCVT =
2549 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2550 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2551
2552 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2553}
2554
2556 SelectionDAG &DAG) const {
2557 // FNEARBYINT and FRINT are the same, except in their handling of FP
2558 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2559 // rint, so just treat them as equivalent.
2560 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2561 Op.getOperand(0));
2562}
2563
2565 auto VT = Op.getValueType();
2566 auto Arg = Op.getOperand(0u);
2567 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2568}
2569
2570// XXX - May require not supporting f32 denormals?
2571
2572// Don't handle v2f16. The extra instructions to scalarize and repack around the
2573// compare and vselect end up producing worse code than scalarizing the whole
2574// operation.
2576 SDLoc SL(Op);
2577 SDValue X = Op.getOperand(0);
2578 EVT VT = Op.getValueType();
2579
2580 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2581
2582 // TODO: Should this propagate fast-math-flags?
2583
2584 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2585
2586 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2587
2588 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2589 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2590
2591 EVT SetCCVT =
2592 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2593
2594 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2595 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2596 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2597
2598 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2599 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2600}
2601
2603 SDLoc SL(Op);
2604 SDValue Src = Op.getOperand(0);
2605
2606 // result = trunc(src);
2607 // if (src < 0.0 && src != result)
2608 // result += -1.0.
2609
2610 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2611
2612 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2613 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2614
2615 EVT SetCCVT =
2616 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2617
2618 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2619 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2620 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2621
2622 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2623 // TODO: Should this propagate fast-math-flags?
2624 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2625}
2626
2627/// Return true if it's known that \p Src can never be an f32 denormal value.
2629 switch (Src.getOpcode()) {
2630 case ISD::FP_EXTEND:
2631 return Src.getOperand(0).getValueType() == MVT::f16;
2632 case ISD::FP16_TO_FP:
2633 case ISD::FFREXP:
2634 return true;
2636 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2637 switch (IntrinsicID) {
2638 case Intrinsic::amdgcn_frexp_mant:
2639 return true;
2640 default:
2641 return false;
2642 }
2643 }
2644 default:
2645 return false;
2646 }
2647
2648 llvm_unreachable("covered opcode switch");
2649}
2650
2652 SDNodeFlags Flags) {
2653 return Flags.hasApproximateFuncs();
2654}
2655
2657 SDValue Src,
2658 SDNodeFlags Flags) {
2659 return !valueIsKnownNeverF32Denorm(Src) &&
2660 DAG.getMachineFunction()
2663}
2664
2666 SDValue Src,
2667 SDNodeFlags Flags) const {
2668 SDLoc SL(Src);
2669 EVT VT = Src.getValueType();
2670 const fltSemantics &Semantics = VT.getFltSemantics();
2671 SDValue SmallestNormal =
2672 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2673
2674 // Want to scale denormals up, but negatives and 0 work just as well on the
2675 // scaled path.
2676 SDValue IsLtSmallestNormal = DAG.getSetCC(
2677 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2678 SmallestNormal, ISD::SETOLT);
2679
2680 return IsLtSmallestNormal;
2681}
2682
2684 SDNodeFlags Flags) const {
2685 SDLoc SL(Src);
2686 EVT VT = Src.getValueType();
2687 const fltSemantics &Semantics = VT.getFltSemantics();
2688 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2689
2690 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2691 SDValue IsFinite = DAG.getSetCC(
2692 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2693 Inf, ISD::SETOLT);
2694 return IsFinite;
2695}
2696
2697/// If denormal handling is required return the scaled input to FLOG2, and the
2698/// check for denormal range. Otherwise, return null values.
2699std::pair<SDValue, SDValue>
2701 SDValue Src, SDNodeFlags Flags) const {
2702 if (!needsDenormHandlingF32(DAG, Src, Flags))
2703 return {};
2704
2705 MVT VT = MVT::f32;
2706 const fltSemantics &Semantics = APFloat::IEEEsingle();
2707 SDValue SmallestNormal =
2708 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2709
2710 SDValue IsLtSmallestNormal = DAG.getSetCC(
2711 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2712 SmallestNormal, ISD::SETOLT);
2713
2714 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2715 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2716 SDValue ScaleFactor =
2717 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2718
2719 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2720 return {ScaledInput, IsLtSmallestNormal};
2721}
2722
2724 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2725 // If we have to handle denormals, scale up the input and adjust the result.
2726
2727 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2728 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2729
2730 SDLoc SL(Op);
2731 EVT VT = Op.getValueType();
2732 SDValue Src = Op.getOperand(0);
2733 SDNodeFlags Flags = Op->getFlags();
2734
2735 if (VT == MVT::f16) {
2736 // Nothing in half is a denormal when promoted to f32.
2737 assert(!Subtarget->has16BitInsts());
2738 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2739 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2740 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2741 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2742 }
2743
2744 auto [ScaledInput, IsLtSmallestNormal] =
2745 getScaledLogInput(DAG, SL, Src, Flags);
2746 if (!ScaledInput)
2747 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2748
2749 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2750
2751 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2752 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2753 SDValue ResultOffset =
2754 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2755 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2756}
2757
2758static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2759 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2760 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2761 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2762}
2763
2765 SelectionDAG &DAG) const {
2766 SDValue X = Op.getOperand(0);
2767 EVT VT = Op.getValueType();
2768 SDNodeFlags Flags = Op->getFlags();
2769 SDLoc DL(Op);
2770
2771 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2772 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2773
2774 const auto &Options = getTargetMachine().Options;
2775 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2776
2777 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2778 // Log and multiply in f32 is good enough for f16.
2779 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2780 }
2781
2782 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2783 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2784 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2785 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2786 }
2787
2788 return Lowered;
2789 }
2790
2791 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2792 if (ScaledInput)
2793 X = ScaledInput;
2794
2795 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2796
2797 SDValue R;
2798 if (Subtarget->hasFastFMAF32()) {
2799 // c+cc are ln(2)/ln(10) to more than 49 bits
2800 const float c_log10 = 0x1.344134p-2f;
2801 const float cc_log10 = 0x1.09f79ep-26f;
2802
2803 // c + cc is ln(2) to more than 49 bits
2804 const float c_log = 0x1.62e42ep-1f;
2805 const float cc_log = 0x1.efa39ep-25f;
2806
2807 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2808 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2809
2810 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2811 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2812 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2813 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2814 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2815 } else {
2816 // ch+ct is ln(2)/ln(10) to more than 36 bits
2817 const float ch_log10 = 0x1.344000p-2f;
2818 const float ct_log10 = 0x1.3509f6p-18f;
2819
2820 // ch + ct is ln(2) to more than 36 bits
2821 const float ch_log = 0x1.62e000p-1f;
2822 const float ct_log = 0x1.0bfbe8p-15f;
2823
2824 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2825 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2826
2827 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2828 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2829 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2830 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2831 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2832
2833 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2834 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2835 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2836 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2837 }
2838
2839 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2840 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2841
2842 // TODO: Check if known finite from source value.
2843 if (!IsFiniteOnly) {
2844 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2845 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2846 }
2847
2848 if (IsScaled) {
2849 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2850 SDValue ShiftK =
2851 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2852 SDValue Shift =
2853 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2854 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2855 }
2856
2857 return R;
2858}
2859
2861 return LowerFLOGCommon(Op, DAG);
2862}
2863
2864// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2865 // promoted f16 operation.
2867 SelectionDAG &DAG, bool IsLog10,
2868 SDNodeFlags Flags) const {
2869 EVT VT = Src.getValueType();
2870 unsigned LogOp =
2871 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2872
2873 double Log2BaseInverted =
2875
2876 if (VT == MVT::f32) {
2877 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2878 if (ScaledInput) {
2879 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2880 SDValue ScaledResultOffset =
2881 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2882
2883 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2884
2885 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2886 ScaledResultOffset, Zero, Flags);
2887
2888 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2889
2890 if (Subtarget->hasFastFMAF32())
2891 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2892 Flags);
2893 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2894 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2895 }
2896 }
2897
2898 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2899 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2900
2901 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2902 Flags);
2903}
2904
2906 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2907 // If we have to handle denormals, scale up the input and adjust the result.
2908
2909 SDLoc SL(Op);
2910 EVT VT = Op.getValueType();
2911 SDValue Src = Op.getOperand(0);
2912 SDNodeFlags Flags = Op->getFlags();
2913
2914 if (VT == MVT::f16) {
2915 // Nothing in half is a denormal when promoted to f32.
2916 assert(!Subtarget->has16BitInsts());
2917 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2918 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2919 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2920 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2921 }
2922
2923 assert(VT == MVT::f32);
2924
2925 if (!needsDenormHandlingF32(DAG, Src, Flags))
2926 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2927
2928 // bool needs_scaling = x < -0x1.f80000p+6f;
2929 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2930
2931 // -nextafter(128.0, -1)
2932 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2933
2934 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2935
2936 SDValue NeedsScaling =
2937 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2938
2939 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2940 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2941
2942 SDValue AddOffset =
2943 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2944
2945 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2946 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2947
2948 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2949 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2950 SDValue ResultScale =
2951 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2952
2953 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2954}
2955
// lowerFEXPUnsafe: approx-funcs lowering of exp(X) as exp2(X * log2(e)).
// For f32 inputs that may hit flushed denormals, the input is pre-biased by
// +64 below a threshold and the result rescaled by e^-64 afterwards.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction; visible parameters are (X, SL, DAG, Flags).
2957 SelectionDAG &DAG,
2958 SDNodeFlags Flags) const {
2959 EVT VT = X.getValueType();
2960 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2961
2962 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
// Fast path: a single hardware exp2 of X*log2(e). AMDGPUISD::EXP is the
// native f32 instruction; other types use the generic ISD::FEXP2.
2963 // exp2(M_LOG2E_F * f);
2964 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2965 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2966 : (unsigned)ISD::FEXP2,
2967 SL, VT, Mul, Flags);
2968 }
2969
2970 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2971
// Inputs below this threshold (~ -87.3) would produce denormal/flushed
// results, so they take the scaled path below.
2972 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2973 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2974
2975 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2976
2977 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2978
2979 SDValue AdjustedX =
2980 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2981
2982 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2983
2984 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2985
// 0x1.969d48p-93f ~= e^-64: undoes the +64 bias applied to the input, since
// exp(x + 64) == exp(x) * e^64.
2986 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2987 SDValue AdjustedResult =
2988 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2989
2990 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2991 Flags);
2992}
2993
2994/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2995/// handled correctly.
///
/// exp10(x) is computed as exp2(x * log2(10)), with log2(10) split into a
/// high part K0 and a low part K1 (K0 + K1 ~= log2(10)) so the product keeps
/// extra precision: exp2(x*K0) * exp2(x*K1).
/// NOTE(review): the opening signature line (doxygen anchor) is missing from
/// this extraction; visible parameters are (X, SL, DAG, Flags).
2997 SelectionDAG &DAG,
2998 SDNodeFlags Flags) const {
2999 const EVT VT = X.getValueType();
3000 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3001 : static_cast<unsigned>(ISD::FEXP2);
3002
3003 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3004 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3005 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3006 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3007
3008 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3009 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3010 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3011 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
// NOTE(review): Flags is not propagated to this final FMUL, unlike every
// other node in this function — confirm whether that is intentional.
3012 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3013 }
3014
3015 // bool s = x < -0x1.2f7030p+5f;
3016 // x += s ? 0x1.0p+5f : 0.0f;
3017 // exp10 = exp2(x * 0x1.a92000p+1f) *
3018 // exp2(x * 0x1.4f0978p-11f) *
3019 // (s ? 0x1.9f623ep-107f : 1.0f);
3020
3021 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3022
3023 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3024 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3025
// Bias small inputs by +32; exp10(x + 32) == exp10(x) * 10^32, undone by the
// 0x1.9f623ep-107f (~= 10^-32) rescale below.
3026 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3027 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3028 SDValue AdjustedX =
3029 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3030
3031 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3032 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3033
3034 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3035 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3036 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3037 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3038
3039 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3040
3041 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3042 SDValue AdjustedResult =
3043 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3044
3045 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3046 Flags);
3047}
3048
// lowerFEXP: lowers ISD::FEXP / ISD::FEXP10 (see IsExp10 below). f16 is
// promoted through f32; f32 uses either the approx-funcs fast path or the
// precise range-reduction algorithm documented inline.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3050 EVT VT = Op.getValueType();
3051 SDLoc SL(Op);
3052 SDValue X = Op.getOperand(0);
3053 SDNodeFlags Flags = Op->getFlags();
3054 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3055
3056 if (VT.getScalarType() == MVT::f16) {
3057 // v_exp_f16 (fmul x, log2e)
3058 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3059 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3060
3061 if (VT.isVector())
3062 return SDValue();
3063
3064 // exp(f16 x) ->
3065 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3066
3067 // Nothing in half is a denormal when promoted to f32.
3068 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3069 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3070 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3071 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3072 }
3073
3074 assert(VT == MVT::f32);
3075
3076 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3077 // library behavior. Also, is known-not-daz source sufficient?
3078 if (allowApproxFunc(DAG, Flags)) {
3079 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3080 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3081 }
3082
3083 // Algorithm:
3084 //
3085 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3086 //
3087 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3088 // n = 64*m + j, 0 <= j < 64
3089 //
3090 // e^x = 2^((64*m + j + f)/64)
3091 // = (2^m) * (2^(j/64)) * 2^(f/64)
3092 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3093 //
3094 // f = x*(64/ln(2)) - n
3095 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3096 //
3097 // e^x = (2^m) * (2^(j/64)) * e^r
3098 //
3099 // (2^(j/64)) is precomputed
3100 //
3101 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3102 // e^r = 1 + q
3103 //
3104 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3105 //
3106 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3107 SDNodeFlags FlagsNoContract = Flags;
3108 FlagsNoContract.setAllowContract(false);
3109
// PH/PL: high and low parts of X * log2(e) (or X * log2(10) for exp10),
// carrying extra precision beyond a single f32 product.
3110 SDValue PH, PL;
3111 if (Subtarget->hasFastFMAF32()) {
3112 const float c_exp = numbers::log2ef;
3113 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3114 const float c_exp10 = 0x1.a934f0p+1f;
3115 const float cc_exp10 = 0x1.2f346ep-24f;
3116
3117 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3118 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3119
// PL = fma(X, CC, fma(X, C, -PH)) recovers the rounding error of PH = X*C.
3120 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3121 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3122 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3123 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3124 } else {
3125 const float ch_exp = 0x1.714000p+0f;
3126 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3127
3128 const float ch_exp10 = 0x1.a92000p+1f;
3129 const float cl_exp10 = 0x1.4f0978p-11f;
3130
3131 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3132 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3133
// Without fast FMA, split X itself: XH keeps the top mantissa bits (mask off
// the low 12), XL is the exact remainder, so the partial products are exact.
3134 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3135 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3136 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3137 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3138 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3139
3140 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3141
3142 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3143 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3144 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3145 }
3146
3147 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3148
3149 // It is unsafe to contract this fsub into the PH multiply.
3150 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3151
3152 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3153 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3154 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3155
// Recombine: result = exp2(fractional part) * 2^E via ldexp.
3156 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3157
3158 SDValue UnderflowCheckConst =
3159 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3160
3161 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3162 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3163 SDValue Underflow =
3164 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3165
3166 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3167 const auto &Options = getTargetMachine().Options;
3168
3169 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3170 SDValue OverflowCheckConst =
3171 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3172 SDValue Overflow =
3173 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
// NOTE(review): the line materializing the +Inf constant (original line
// 3175) was dropped by the doc extraction; Inf is presumably
// getConstantFP(infinity) — confirm against the real source.
3174 SDValue Inf =
3176 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3177 }
3178
3179 return R;
3180}
3181
3182static bool isCtlzOpc(unsigned Opc) {
3183 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3184}
3185
3186static bool isCttzOpc(unsigned Opc) {
3187 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3188}
3189
// lowerCTLZResults: promote an i8/i16 count-leading-zeros to a 32-bit op.
// NOTE(review): the opening signature line and original line 3200 were
// dropped by the doc extraction; 3200 is presumably an assert that Opc is a
// ctlz opcode — confirm against the real source.
3191 SelectionDAG &DAG) const {
3192 auto SL = SDLoc(Op);
3193 auto Opc = Op.getOpcode();
3194 auto Arg = Op.getOperand(0u);
3195 auto ResultVT = Op.getValueType();
3196
3197 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3198 return {};
3199
3201 assert(ResultVT == Arg.getValueType());
3202
3203 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3204 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3205 SDValue NewOp;
3206
3207 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
// Zero-undef: shift the value into the top bits so the 32-bit clz directly
// yields the narrow-type count; the freshly shifted-in zeros don't matter.
3208 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3209 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3210 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3211 } else {
// Plain CTLZ: zero-extend (so a zero input gives clz == 32) and subtract the
// 32-NumBits extra leading zeros contributed by the extension.
3212 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3213 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3214 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3215 }
3216
3217 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3218}
3219
// LowerCTLZ_CTTZ: lower 32/64-bit ctlz/cttz (and *_ZERO_UNDEF) to the
// AMDGPU ffbh/ffbl nodes, splitting divergent 64-bit values into halves.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3221 SDLoc SL(Op);
3222 SDValue Src = Op.getOperand(0);
3223
3224 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3225 bool Ctlz = isCtlzOpc(Op.getOpcode());
3226 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3227
3228 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3229 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
// Uniform 64-bit values can use the scalar 64-bit instructions directly.
3230 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3231
3232 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3233 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3234 // (cttz hi:lo) -> (umin (ffbl src), 32)
3235 // (ctlz_zero_undef src) -> (ffbh src)
3236 // (cttz_zero_undef src) -> (ffbl src)
3237
3238 // 64-bit scalar version produce 32-bit result
3239 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3240 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3241 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3242 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3243 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3244 if (!ZeroUndef) {
// ffbh/ffbl return -1 for zero input; clamp to the bit width to match the
// defined ctlz/cttz semantics.
3245 const SDValue ConstVal = DAG.getConstant(
3246 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3247 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3248 }
3249 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3250 }
3251
3252 SDValue Lo, Hi;
3253 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3254
3255 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3256 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3257
3258 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3259 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3260 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3261 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3262
// UADDSAT keeps the -1 "all zero" sentinel saturated at UINT_MAX so the
// final UMIN against 64 still works; zero-undef can use a plain ADD.
3263 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3264 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3265 if (Ctlz)
3266 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3267 else
3268 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3269
3270 SDValue NewOpr;
3271 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3272 if (!ZeroUndef) {
3273 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3274 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3275 }
3276
3277 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3278}
3279
// LowerINT_TO_FP32 (called with (Op, DAG, Signed) below): convert an i64 to
// f32 by normalizing into 32 bits, converting natively, then scaling back.
// The algorithm is fully described in the comment block that follows.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3281 bool Signed) const {
3282 // The regular method converting a 64-bit integer to float roughly consists of
3283 // 2 steps: normalization and rounding. In fact, after normalization, the
3284 // conversion from a 64-bit integer to a float is essentially the same as the
3285 // one from a 32-bit integer. The only difference is that it has more
3286 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3287 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3288 // converted into the correct float number. The basic steps for the unsigned
3289 // conversion are illustrated in the following pseudo code:
3290 //
3291 // f32 uitofp(i64 u) {
3292 // i32 hi, lo = split(u);
3293 // // Only count the leading zeros in hi as we have native support of the
3294 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3295 // // reduced to a 32-bit one automatically.
3296 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3297 // u <<= shamt;
3298 // hi, lo = split(u);
3299 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3300 // // convert it as a 32-bit integer and scale the result back.
3301 // return uitofp(hi) * 2^(32 - shamt);
3302 // }
3303 //
3304 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3305 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3306 // converted instead followed by negation based its sign bit.
3307
3308 SDLoc SL(Op);
3309 SDValue Src = Op.getOperand(0);
3310
3311 SDValue Lo, Hi;
3312 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3313 SDValue Sign;
3314 SDValue ShAmt;
3315 if (Signed && Subtarget->isGCN()) {
3316 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3317 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3318 // account. That is, the maximal shift is
3319 // - 32 if Lo and Hi have opposite signs;
3320 // - 33 if Lo and Hi have the same sign.
3321 //
3322 // Or, MaxShAmt = 33 + OppositeSign, where
3323 //
3324 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3325 // - -1 if Lo and Hi have opposite signs; and
3326 // - 0 otherwise.
3327 //
3328 // All in all, ShAmt is calculated as
3329 //
3330 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3331 //
3332 // or
3333 //
3334 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3335 //
3336 // to reduce the critical path.
3337 SDValue OppositeSign = DAG.getNode(
3338 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3339 DAG.getConstant(31, SL, MVT::i32));
3340 SDValue MaxShAmt =
3341 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3342 OppositeSign);
3343 // Count the leading sign bits.
3344 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3345 // Different from unsigned conversion, the shift should be one bit less to
3346 // preserve the sign bit.
3347 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3348 DAG.getConstant(1, SL, MVT::i32));
3349 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3350 } else {
3351 if (Signed) {
3352 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3353 // absolute value first.
3354 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3355 DAG.getConstant(63, SL, MVT::i64));
3356 SDValue Abs =
3357 DAG.getNode(ISD::XOR, SL, MVT::i64,
3358 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3359 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3360 }
3361 // Count the leading zeros.
3362 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3363 // The shift amount for signed integers is [0, 32].
3364 }
3365 // Normalize the given 64-bit integer.
3366 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3367 // Split it again.
3368 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3369 // Calculate the adjust bit for rounding.
3370 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3371 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3372 DAG.getConstant(1, SL, MVT::i32), Lo);
3373 // Get the 32-bit normalized integer.
3374 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3375 // Convert the normalized 32-bit integer into f32.
3376 unsigned Opc =
3377 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3378 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3379
3380 // Finally, need to scale back the converted floating number as the original
3381 // 64-bit integer is converted as a 32-bit one.
3382 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3383 ShAmt);
3384 // On GCN, use LDEXP directly.
3385 if (Subtarget->isGCN())
3386 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3387
3388 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3389 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3390 // exponent is enough to avoid overflowing into the sign bit.
3391 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3392 DAG.getConstant(23, SL, MVT::i32));
3393 SDValue IVal =
3394 DAG.getNode(ISD::ADD, SL, MVT::i32,
3395 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3396 if (Signed) {
3397 // Set the sign bit.
3398 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3399 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3400 DAG.getConstant(31, SL, MVT::i32));
3401 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3402 }
3403 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3404}
3405
// LowerINT_TO_FP64 (called with (Op, DAG, Signed) below): i64 -> f64 as
// cvt(Hi) * 2^32 + cvt(Lo); exact because f64 has enough precision to hold
// both 32-bit halves.
// NOTE(review): the opening signature line and original line 3414 (the start
// of the CvtHi node — presumably Signed ? SINT_TO_FP : UINT_TO_FP of Hi)
// were dropped by the doc extraction; confirm against the real source.
3407 bool Signed) const {
3408 SDLoc SL(Op);
3409 SDValue Src = Op.getOperand(0);
3410
3411 SDValue Lo, Hi;
3412 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3413
3415 SL, MVT::f64, Hi);
3416
3417 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3418
// Scale the converted high half by 2^32 before adding the low half.
3419 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3420 DAG.getConstant(32, SL, MVT::i32));
3421 // TODO: Should this propagate fast-math-flags?
3422 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3423}
3424
// LowerUINT_TO_FP: custom lowering of ISD::UINT_TO_FP — promotes i16
// sources, rounds through f32 for bf16/f16 destinations, and dispatches i64
// sources to LowerINT_TO_FP32/LowerINT_TO_FP64.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3426 SelectionDAG &DAG) const {
3427 // TODO: Factor out code common with LowerSINT_TO_FP.
3428 EVT DestVT = Op.getValueType();
3429 SDValue Src = Op.getOperand(0);
3430 EVT SrcVT = Src.getValueType();
3431
3432 if (SrcVT == MVT::i16) {
3433 if (DestVT == MVT::f16)
3434 return Op;
3435 SDLoc DL(Op);
3436
3437 // Promote src to i32
3438 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3439 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3440 }
3441
3442 if (DestVT == MVT::bf16) {
// No direct int->bf16 conversion: go through f32 and round.
3443 SDLoc SL(Op);
3444 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3445 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3446 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3447 }
3448
3449 if (SrcVT != MVT::i64)
3450 return Op;
3451
3452 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3453 SDLoc DL(Op);
3454
3455 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3456 SDValue FPRoundFlag =
3457 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3458 SDValue FPRound =
3459 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3460
3461 return FPRound;
3462 }
3463
3464 if (DestVT == MVT::f32)
3465 return LowerINT_TO_FP32(Op, DAG, false);
3466
3467 assert(DestVT == MVT::f64);
3468 return LowerINT_TO_FP64(Op, DAG, false);
3469}
3470
// LowerSINT_TO_FP: custom lowering of ISD::SINT_TO_FP — the signed
// counterpart of LowerUINT_TO_FP above (sign-extends i16, rounds through f32
// for bf16/f16, dispatches i64 with Signed=true).
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3472 SelectionDAG &DAG) const {
3473 EVT DestVT = Op.getValueType();
3474
3475 SDValue Src = Op.getOperand(0);
3476 EVT SrcVT = Src.getValueType();
3477
3478 if (SrcVT == MVT::i16) {
3479 if (DestVT == MVT::f16)
3480 return Op;
3481
3482 SDLoc DL(Op);
3483 // Promote src to i32
3484 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3485 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3486 }
3487
3488 if (DestVT == MVT::bf16) {
3489 SDLoc SL(Op);
3490 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3491 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3492 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3493 }
3494
3495 if (SrcVT != MVT::i64)
3496 return Op;
3497
3498 // TODO: Factor out code common with LowerUINT_TO_FP.
3499
3500 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3501 SDLoc DL(Op);
3502 SDValue Src = Op.getOperand(0);
3503
3504 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3505 SDValue FPRoundFlag =
3506 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3507 SDValue FPRound =
3508 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3509
3510 return FPRound;
3511 }
3512
3513 if (DestVT == MVT::f32)
3514 return LowerINT_TO_FP32(Op, DAG, true);
3515
3516 assert(DestVT == MVT::f64);
3517 return LowerINT_TO_FP64(Op, DAG, true);
3518}
3519
// LowerFP_TO_INT64 (called with (Op, DAG, Signed) below): f32/f64 -> i64 by
// splitting the truncated value into two 32-bit halves, per the pseudo code
// in the comment block.
// NOTE(review): the opening signature line and original line 3574 (the
// false-arm of the Hi conversion ternary — presumably ISD::FP_TO_UINT) were
// dropped by the doc extraction; confirm against the real source.
3521 bool Signed) const {
3522 SDLoc SL(Op);
3523
3524 SDValue Src = Op.getOperand(0);
3525 EVT SrcVT = Src.getValueType();
3526
3527 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3528
3529 // The basic idea of converting a floating point number into a pair of 32-bit
3530 // integers is illustrated as follows:
3531 //
3532 // tf := trunc(val);
3533 // hif := floor(tf * 2^-32);
3534 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3535 // hi := fptoi(hif);
3536 // lo := fptoi(lof);
3537 //
3538 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3539 SDValue Sign;
3540 if (Signed && SrcVT == MVT::f32) {
3541 // However, a 32-bit floating point number has only 23 bits mantissa and
3542 // it's not enough to hold all the significant bits of `lof` if val is
3543 // negative. To avoid the loss of precision, We need to take the absolute
3544 // value after truncating and flip the result back based on the original
3545 // signedness.
3546 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3547 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3548 DAG.getConstant(31, SL, MVT::i32));
3549 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3550 }
3551
// K0 = 2^-32 and K1 = -2^32 in the source type, spelled as exact bit
// patterns to avoid any rounding in the constants themselves.
3552 SDValue K0, K1;
3553 if (SrcVT == MVT::f64) {
3554 K0 = DAG.getConstantFP(
3555 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3556 SrcVT);
3557 K1 = DAG.getConstantFP(
3558 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3559 SrcVT);
3560 } else {
3561 K0 = DAG.getConstantFP(
3562 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3563 K1 = DAG.getConstantFP(
3564 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3565 }
3566 // TODO: Should this propagate fast-math-flags?
3567 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3568
3569 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3570
// Fma = Trunc - FloorMul * 2^32, i.e. the (always non-negative) low half.
3571 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3572
3573 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3575 SL, MVT::i32, FloorMul);
3576 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3577
3578 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3579 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3580
3581 if (Signed && SrcVT == MVT::f32) {
3582 assert(Sign);
3583 // Flip the result based on the signedness, which is either all 0s or 1s.
3584 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3585 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3586 // r := xor(r, sign) - sign;
3587 Result =
3588 DAG.getNode(ISD::SUB, SL, MVT::i64,
3589 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3590 }
3591
3592 return Result;
3593}
3594
// LowerFP_TO_FP16: f32 sources use the target FP_TO_FP16 node (better known
// bits); with approx-funcs the generic expansion is used; otherwise fall
// back to the correctly-rounded f64 path.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3596 SDLoc DL(Op);
3597 SDValue N0 = Op.getOperand(0);
3598
3599 // Convert to target node to get known bits
3600 if (N0.getValueType() == MVT::f32)
3601 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3602
3603 if (Op->getFlags().hasApproximateFuncs()) {
3604 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3605 return SDValue();
3606 }
3607
3608 return LowerF64ToF16Safe(N0, DL, DAG);
3609}
3610
3611// return node in i32
// LowerF64ToF16Safe: software f64 -> f16 conversion with round-to-nearest-
// even, built from integer bit manipulation (exponent rebias, sticky-bit
// collection, denormal shift, rounding, overflow/NaN selection).
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction; visible parameters are (Src, DL, DAG).
3613 SelectionDAG &DAG) const {
3614 assert(Src.getSimpleValueType() == MVT::f64);
3615
3616 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3617 // TODO: We can generate better code for True16.
3618 const unsigned ExpMask = 0x7ff;
3619 const unsigned ExpBiasf64 = 1023;
3620 const unsigned ExpBiasf16 = 15;
3621 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3622 SDValue One = DAG.getConstant(1, DL, MVT::i32);
// U = raw f64 bits; UH = high 32 bits (sign, exponent, top mantissa).
3623 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3624 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3625 DAG.getConstant(32, DL, MVT::i64));
3626 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3627 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3628 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3629 DAG.getConstant(20, DL, MVT::i64));
3630 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3631 DAG.getConstant(ExpMask, DL, MVT::i32));
3632 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3633 // add the f16 bias (15) to get the biased exponent for the f16 format.
3634 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3635 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3636
// M = mantissa bits that land in the f16 significand (plus a round bit).
3637 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3638 DAG.getConstant(8, DL, MVT::i32));
3639 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3640 DAG.getConstant(0xffe, DL, MVT::i32));
3641
// MaskedSig = all mantissa bits discarded below the round bit; nonzero
// means the "sticky" bit must be set for correct nearest-even rounding.
3642 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3643 DAG.getConstant(0x1ff, DL, MVT::i32));
3644 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3645
3646 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3647 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3648
3649 // (M != 0 ? 0x0200 : 0) | 0x7c00;
// I = the f16 payload for Inf/NaN inputs: quiet-NaN bit if the mantissa is
// nonzero, otherwise plain infinity.
3650 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3651 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3652 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3653
3654 // N = M | (E << 12);
3655 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3656 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3657 DAG.getConstant(12, DL, MVT::i32)));
3658
3659 // B = clamp(1-E, 0, 13);
3660 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3661 One, E);
3662 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3663 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3664 DAG.getConstant(13, DL, MVT::i32));
3665
3666 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3667 DAG.getConstant(0x1000, DL, MVT::i32));
3668
// D = denormal significand: shift right by B, setting a sticky bit if any
// ones were shifted out (detected by shifting back and comparing).
3669 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3670 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3671 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3672 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3673
// V = denormal (E < 1) or normal encoding, then round to nearest even using
// the low 3 bits (round + sticky).
3674 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3675 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3676 DAG.getConstant(0x7, DL, MVT::i32));
3677 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3678 DAG.getConstant(2, DL, MVT::i32));
3679 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3680 One, Zero, ISD::SETEQ);
3681 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3682 One, Zero, ISD::SETGT);
3683 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3684 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3685
// Overflow (E > 30) becomes infinity; E == 1039 (f64 Inf/NaN exponent after
// rebias) selects the Inf/NaN payload I computed above.
3686 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3687 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3688 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3689 I, V, ISD::SETEQ);
3690
3691 // Extract the sign bit.
3692 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3693 DAG.getConstant(16, DL, MVT::i32));
3694 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3695 DAG.getConstant(0x8000, DL, MVT::i32));
3696
3697 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3698}
3699
// LowerFP_TO_INT: common custom lowering for ISD::FP_TO_SINT/FP_TO_UINT —
// promotes bf16 via f32, narrows i16 results through i32, and routes i64
// results to LowerFP_TO_INT64.
// NOTE(review): the opening signature line and original line 3734 (the
// extend-opcode selection — presumably SIGN_EXTEND for FP_TO_SINT else
// ZERO_EXTEND) were dropped by the doc extraction; confirm against the real
// source.
3701 SelectionDAG &DAG) const {
3702 SDValue Src = Op.getOperand(0);
3703 unsigned OpOpcode = Op.getOpcode();
3704 EVT SrcVT = Src.getValueType();
3705 EVT DestVT = Op.getValueType();
3706
3707 // Will be selected natively
3708 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3709 return Op;
3710
3711 if (SrcVT == MVT::bf16) {
3712 SDLoc DL(Op);
3713 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3714 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3715 }
3716
3717 // Promote i16 to i32
3718 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3719 SDLoc DL(Op);
3720
3721 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3722 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3723 }
3724
3725 if (DestVT != MVT::i64)
3726 return Op;
3727
// f16 (or an f32 freshly extended from f16) can't exceed i32 range, so
// convert to i32 and extend.
3728 if (SrcVT == MVT::f16 ||
3729 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3730 SDLoc DL(Op);
3731
3732 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3733 unsigned Ext =
3735 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3736 }
3737
3738 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3739 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3740
3741 return SDValue();
3742}
3743
// LowerSIGN_EXTEND_INREG (vector form): scalarize the vector and apply
// SIGN_EXTEND_INREG element-wise, then rebuild the vector.
// NOTE(review): the opening signature line and original line 3757 (the
// declaration of Args — presumably a SmallVector<SDValue>) were dropped by
// the doc extraction; confirm against the real source.
3745 SelectionDAG &DAG) const {
3746 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3747 MVT VT = Op.getSimpleValueType();
3748 MVT ScalarVT = VT.getScalarType();
3749
3750 assert(VT.isVector());
3751
3752 SDValue Src = Op.getOperand(0);
3753 SDLoc DL(Op);
3754
3755 // TODO: Don't scalarize on Evergreen?
3756 unsigned NElts = VT.getVectorNumElements();
3758 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3759
3760 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3761 for (unsigned I = 0; I < NElts; ++I)
3762 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3763
3764 return DAG.getBuildVector(VT, DL, Args);
3765}
3766
3767//===----------------------------------------------------------------------===//
3768// Custom DAG optimizations
3769//===----------------------------------------------------------------------===//
3770
3771static bool isU24(SDValue Op, SelectionDAG &DAG) {
3772 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3773}
3774
// isI24: true if Op is known to fit in a signed 24-bit value.
// NOTE(review): original line 3779 (the second conjunct — presumably a
// numBitsSigned(Op, DAG) <= 24 check) was dropped by the doc extraction;
// confirm against the real source.
3775static bool isI24(SDValue Op, SelectionDAG &DAG) {
3776 EVT VT = Op.getValueType();
3777 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3778 // as unsigned 24-bit values.
3780}
3781
// simplifyMul24: simplify the operands of a 24-bit multiply node (either an
// AMDGPUISD::MUL*_24/MULHI*_24 node or the corresponding amdgcn intrinsic)
// using the fact that only the low 24 bits of each operand are demanded.
// NOTE(review): the opening signature lines (doxygen anchors) are missing
// from this extraction; the node parameter is named Node24.
3784 SelectionDAG &DAG = DCI.DAG;
3785 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3786 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3787
// Intrinsic nodes carry the intrinsic ID as operand 0, shifting the real
// operands by one.
3788 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3789 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3790 unsigned NewOpcode = Node24->getOpcode();
3791 if (IsIntrin) {
3792 unsigned IID = Node24->getConstantOperandVal(0);
3793 switch (IID) {
3794 case Intrinsic::amdgcn_mul_i24:
3795 NewOpcode = AMDGPUISD::MUL_I24;
3796 break;
3797 case Intrinsic::amdgcn_mul_u24:
3798 NewOpcode = AMDGPUISD::MUL_U24;
3799 break;
3800 case Intrinsic::amdgcn_mulhi_i24:
3801 NewOpcode = AMDGPUISD::MULHI_I24;
3802 break;
3803 case Intrinsic::amdgcn_mulhi_u24:
3804 NewOpcode = AMDGPUISD::MULHI_U24;
3805 break;
3806 default:
3807 llvm_unreachable("Expected 24-bit mul intrinsic");
3808 }
3809 }
3810
3811 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3812
3813 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3814 // the operands to have other uses, but will only perform simplifications that
3815 // involve bypassing some nodes for this user.
3816 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3817 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3818 if (DemandedLHS || DemandedRHS)
3819 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3820 DemandedLHS ? DemandedLHS : LHS,
3821 DemandedRHS ? DemandedRHS : RHS);
3822
3823 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3824 // operands if this node is the only user.
3825 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3826 return SDValue(Node24, 0);
3827 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3828 return SDValue(Node24, 0);
3829
3830 return SDValue();
3831}
3832
3833template <typename IntTy>
// constantFoldBFE: constant-fold a bitfield extract of Src0 at [Offset,
// Offset+Width) to an i32 constant; IntTy selects signed vs unsigned
// extraction semantics.
// NOTE(review): original line 3834 (the function signature — presumably
// static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
// uint32_t Offset, ...) — was dropped by the doc extraction.
3835 uint32_t Width, const SDLoc &DL) {
3836 if (Width + Offset < 32) {
// Extract by shifting the field to the top, then arithmetic (signed IntTy)
// or logical (unsigned IntTy) shift back down.
3837 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3838 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3839 if constexpr (std::is_signed_v<IntTy>) {
3840 return DAG.getSignedConstant(Result, DL, MVT::i32);
3841 } else {
3842 return DAG.getConstant(Result, DL, MVT::i32);
3843 }
3844 }
3845
// Field reaches the top bit: a plain right shift suffices.
3846 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3847}
3848
3849static bool hasVolatileUser(SDNode *Val) {
3850 for (SDNode *U : Val->users()) {
3851 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3852 if (M->isVolatile())
3853 return true;
3854 }
3855 }
3856
3857 return false;
3858}
3859
// shouldCombineMemoryType: true if a load/store of VT should be rewritten to
// an equivalent i32-based type (see getEquivalentMemType) — i.e. VT is
// byte-sized, not already legal/canonical, and its size is a sane multiple.
// NOTE(review): the opening signature line (doxygen anchor) is missing from
// this extraction.
3861 // i32 vectors are the canonical memory type.
3862 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3863 return false;
3864
3865 if (!VT.isByteSized())
3866 return false;
3867
3868 unsigned Size = VT.getStoreSize();
3869
// Scalar 1/2/4-byte accesses are already fine as-is.
3870 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3871 return false;
3872
// 3-byte and non-dword-multiple sizes cannot be expressed as i32 vectors.
3873 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3874 return false;
3875
3876 return true;
3877}
3878
3879 // Replace load of an illegal type with a bitcast from a load of a friendlier
3880 // type.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
3882 DAGCombinerInfo &DCI) const {
// Only run before legalization; afterwards the type system constrains us.
3883 if (!DCI.isBeforeLegalize())
3884 return SDValue();
3885
// Skip atomic/volatile/extending loads, and loads feeding volatile memory
// ops (rewriting those could perturb ordering the user relies on).
3886 LoadSDNode *LN = cast<LoadSDNode>(N);
3887 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3888 return SDValue();
3889
3890 SDLoc SL(N);
3891 SelectionDAG &DAG = DCI.DAG;
3892 EVT VT = LN->getMemoryVT();
3893
3894 unsigned Size = VT.getStoreSize();
3895 Align Alignment = LN->getAlign();
3896 if (Alignment < Size && isTypeLegal(VT)) {
3897 unsigned IsFast;
3898 unsigned AS = LN->getAddressSpace();
3899
3900 // Expand unaligned loads earlier than legalization. Due to visitation order
3901 // problems during legalization, the emitted instructions to pack and unpack
3902 // the bytes again are not eliminated in the case of an unaligned copy.
// NOTE(review): the call head on the preceding line (presumably
// allowsMisalignedMemoryAccesses) is missing from this excerpt.
3904 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3905 if (VT.isVector())
3906 return SplitVectorLoad(SDValue(LN, 0), DAG);
3907
// expandUnalignedLoad yields {value, chain}; merge both results.
3908 SDValue Ops[2];
3909 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3910
3911 return DAG.getMergeValues(Ops, SDLoc(N));
3912 }
3913
3914 if (!IsFast)
3915 return SDValue();
3916 }
3917
3918 if (!shouldCombineMemoryType(VT))
3919 return SDValue();
3920
// Load in the equivalent i32-based type and bitcast back to the original.
3921 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3922
3923 SDValue NewLoad
3924 = DAG.getLoad(NewVT, SL, LN->getChain(),
3925 LN->getBasePtr(), LN->getMemOperand());
3926
3927 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
// CombineTo replaces both the value and the chain result of the old load.
3928 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3929 return SDValue(N, 0);
3930 }
3931
3932 // Replace store of an illegal type with a store of a bitcast to a friendlier
3933 // type.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
3935 DAGCombinerInfo &DCI) const {
3936 if (!DCI.isBeforeLegalize())
3937 return SDValue();
3938
// Skip atomic/volatile and truncating stores.
3939 StoreSDNode *SN = cast<StoreSDNode>(N);
3940 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3941 return SDValue();
3942
3943 EVT VT = SN->getMemoryVT();
3944 unsigned Size = VT.getStoreSize();
3945
3946 SDLoc SL(N);
3947 SelectionDAG &DAG = DCI.DAG;
3948 Align Alignment = SN->getAlign();
3949 if (Alignment < Size && isTypeLegal(VT)) {
3950 unsigned IsFast;
3951 unsigned AS = SN->getAddressSpace();
3952
3953 // Expand unaligned stores earlier than legalization. Due to visitation
3954 // order problems during legalization, the emitted instructions to pack and
3955 // unpack the bytes again are not eliminated in the case of an unaligned
3956 // copy.
// NOTE(review): the call head on the preceding line (presumably
// allowsMisalignedMemoryAccesses) is missing from this excerpt.
3958 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3959 if (VT.isVector())
3960 return SplitVectorStore(SDValue(SN, 0), DAG);
3961
3962 return expandUnalignedStore(SN, DAG);
3963 }
3964
3965 if (!IsFast)
3966 return SDValue();
3967 }
3968
3969 if (!shouldCombineMemoryType(VT))
3970 return SDValue();
3971
3972 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3973 SDValue Val = SN->getValue();
3974
// If the stored value has other uses, replace them with a cast-back of the
// cast value so the pair of bitcasts can fold away together later.
3977 bool OtherUses = !Val.hasOneUse();
3978 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3979 if (OtherUses) {
3980 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3981 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3982 }
3983
3984 return DAG.getStore(SN->getChain(), SL, CastVal,
3985 SN->getBasePtr(), SN->getMemOperand());
3986 }
3987
3988 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3989 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3990 // issues.
// Push an Assert[SZ]Ext through a truncate so the assertion applies to the
// wider source value; the truncate is then re-created on the outside.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
3992 DAGCombinerInfo &DCI) const {
3993 SelectionDAG &DAG = DCI.DAG;
3994 SDValue N0 = N->getOperand(0);
3995
3996 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3997 // (vt2 (truncate (assertzext vt0:x, vt1)))
3998 if (N0.getOpcode() == ISD::TRUNCATE) {
3999 SDValue N1 = N->getOperand(1);
4000 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4001 SDLoc SL(N);
4002
4003 SDValue Src = N0.getOperand(0);
4004 EVT SrcVT = Src.getValueType();
// Only valid when the source is at least as wide as the asserted type.
4005 if (SrcVT.bitsGE(ExtVT)) {
4006 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4007 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4008 }
4009 }
4010
4011 return SDValue();
4012 }
4013
// Combine for INTRINSIC_WO_CHAIN nodes: dispatch on the intrinsic ID
// (operand 0) and apply intrinsic-specific simplifications.
// NOTE(review): the first line of this method's signature is missing from
// this excerpt.
4015 SDNode *N, DAGCombinerInfo &DCI) const {
4016 unsigned IID = N->getConstantOperandVal(0);
4017 switch (IID) {
// 24-bit multiplies: narrow the operands to their demanded low 24 bits.
4018 case Intrinsic::amdgcn_mul_i24:
4019 case Intrinsic::amdgcn_mul_u24:
4020 case Intrinsic::amdgcn_mulhi_i24:
4021 case Intrinsic::amdgcn_mulhi_u24:
4022 return simplifyMul24(N, DCI);
// Unary math intrinsics: fold to undef when the source is undef.
4023 case Intrinsic::amdgcn_fract:
4024 case Intrinsic::amdgcn_rsq:
4025 case Intrinsic::amdgcn_rcp_legacy:
4026 case Intrinsic::amdgcn_rsq_legacy:
4027 case Intrinsic::amdgcn_rsq_clamp:
4028 case Intrinsic::amdgcn_tanh:
4029 case Intrinsic::amdgcn_prng_b32: {
4030 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4031 SDValue Src = N->getOperand(1);
4032 return Src.isUndef() ? Src : SDValue();
4033 }
4034 case Intrinsic::amdgcn_frexp_exp: {
4035 // frexp_exp (fneg x) -> frexp_exp x
4036 // frexp_exp (fabs x) -> frexp_exp x
4037 // frexp_exp (fneg (fabs x)) -> frexp_exp x
// The exponent is independent of the sign, so strip fneg/fabs wrappers.
4038 SDValue Src = N->getOperand(1);
4039 SDValue PeekSign = peekFPSignOps(Src);
4040 if (PeekSign == Src)
4041 return SDValue();
// Update in place rather than rebuilding the node.
4042 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4043 0);
4044 }
4045 default:
4046 return SDValue();
4047 }
4048 }
4049
4050 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4051 /// binary operation \p Opc to it with the corresponding constant operands.
/// \p ValLo / \p ValHi are the low/high 32-bit halves of the constant RHS.
/// Returns the 64-bit result rebuilt via a v2i32 build_vector + bitcast.
// NOTE(review): the first line of this method's signature is missing from
// this excerpt.
4053 DAGCombinerInfo &DCI, const SDLoc &SL,
4054 unsigned Opc, SDValue LHS,
4055 uint32_t ValLo, uint32_t ValHi) const {
4056 SelectionDAG &DAG = DCI.DAG;
4057 SDValue Lo, Hi;
4058 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4059
4060 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4061 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4062
// Apply the operation independently to each half.
4063 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4064 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4065
4066 // Re-visit the ands. It's possible we eliminated one of them and it could
4067 // simplify the vector.
4068 DCI.AddToWorklist(Lo.getNode());
4069 DCI.AddToWorklist(Hi.getNode());
4070
4071 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4072 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4073 }
4074
// SHL combine: canonicalize shl-of-extend into build_vector/zext forms, and
// split wide (i64-element) shifts by >= 32 into a cheaper 32-bit shift whose
// result lands in the high half.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4076 DAGCombinerInfo &DCI) const {
4077 EVT VT = N->getValueType(0);
4078 SDValue LHS = N->getOperand(0);
4079 SDValue RHS = N->getOperand(1);
4080 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4081 SDLoc SL(N);
4082 SelectionDAG &DAG = DCI.DAG;
4083
4084 unsigned RHSVal;
4085 if (CRHS) {
4086 RHSVal = CRHS->getZExtValue();
// Shift by zero is the identity.
4087 if (!RHSVal)
4088 return LHS;
4089
4090 switch (LHS->getOpcode()) {
4091 default:
4092 break;
4093 case ISD::ZERO_EXTEND:
4094 case ISD::SIGN_EXTEND:
4095 case ISD::ANY_EXTEND: {
4096 SDValue X = LHS->getOperand(0);
4097
4098 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4099 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4100 // Prefer build_vector as the canonical form if packed types are legal.
4101 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4102 SDValue Vec = DAG.getBuildVector(
4103 MVT::v2i16, SL,
4104 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4105 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4106 }
4107
4108 // shl (ext x) => zext (shl x), if shift does not overflow int
4109 if (VT != MVT::i64)
4110 break;
// Safe only when at least RHSVal leading bits of x are known zero, so
// the narrow shift cannot lose bits.
4111 KnownBits Known = DAG.computeKnownBits(X);
4112 unsigned LZ = Known.countMinLeadingZeros();
4113 if (LZ < RHSVal)
4114 break;
4115 EVT XVT = X.getValueType();
4116 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4117 return DAG.getZExtOrTrunc(Shl, SL, VT);
4118 }
4119 }
4120 }
4121
4122 if (VT.getScalarType() != MVT::i64)
4123 return SDValue();
4124
4125 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4126
4127 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4128 // common case, splitting this into a move and a 32-bit shift is faster and
4129 // the same code size.
4130 KnownBits Known = DAG.computeKnownBits(RHS);
4131
4132 EVT ElementType = VT.getScalarType();
4133 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4134 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4135 : TargetScalarType;
4136
// Only worthwhile when the shift amount is provably >= 32, so the low half
// of the result is entirely zero.
4137 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4138 return SDValue();
4139 SDValue ShiftAmt;
4140
4141 if (CRHS) {
4142 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4143 TargetType);
4144 } else {
4145 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4146 const SDValue ShiftMask =
4147 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4148 // This AND instruction will clamp out of bounds shift values.
4149 // It will also be removed during later instruction selection.
4150 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4151 }
4152
// The 32-bit shift of the low half produces what becomes the high half.
4153 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4154 SDValue NewShift =
4155 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4156
4157 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4158 SDValue Vec;
4159
4160 if (VT.isVector()) {
// Interleave zero (low halves) with the shifted elements (high halves).
4161 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4162 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): a SmallVector declaration for HiOps on the preceding line
// is missing from this excerpt.
4164 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4165
4166 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4167 for (unsigned I = 0; I != NElts; ++I)
4168 HiAndLoOps[2 * I + 1] = HiOps[I];
4169 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4170 } else {
4171 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4172 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4173 }
4174 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4175 }
4176
// SRA combine: split wide (i64-element) arithmetic right shifts by >= 32
// into a 32-bit shift of the high half, with the sign replicated into the
// new high half.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4178 DAGCombinerInfo &DCI) const {
4179 SDValue RHS = N->getOperand(1);
4180 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4181 EVT VT = N->getValueType(0);
4182 SDValue LHS = N->getOperand(0);
4183 SelectionDAG &DAG = DCI.DAG;
4184 SDLoc SL(N);
4185
4186 if (VT.getScalarType() != MVT::i64)
4187 return SDValue();
4188
4189 // For C >= 32
4190 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4191
4192 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4193 // common case, splitting this into a move and a 32-bit shift is faster and
4194 // the same code size.
4195 KnownBits Known = DAG.computeKnownBits(RHS);
4196
4197 EVT ElementType = VT.getScalarType();
4198 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4199 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4200 : TargetScalarType;
4201
// Only valid when the shift amount is provably >= 32.
4202 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4203 return SDValue();
4204
4205 SDValue ShiftFullAmt =
4206 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4207 SDValue ShiftAmt;
4208 if (CRHS) {
4209 unsigned RHSVal = CRHS->getZExtValue();
4210 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4211 TargetType);
// Shift by exactly 63: both result halves are just the sign bits.
4212 } else if (Known.getMinValue().getZExtValue() ==
4213 (ElementType.getSizeInBits() - 1)) {
4214 ShiftAmt = ShiftFullAmt;
4215 } else {
4216 SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4217 const SDValue ShiftMask =
4218 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4219 // This AND instruction will clamp out of bounds shift values.
4220 // It will also be removed during later instruction selection.
4221 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
4222 }
4223
4224 EVT ConcatType;
4225 SDValue Hi;
4226 SDLoc LHSSL(LHS);
4227 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4228 if (VT.isVector()) {
4229 unsigned NElts = TargetType.getVectorNumElements();
4230 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4231 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4232 SmallVector<SDValue, 8> HiOps(NElts);
4233 SmallVector<SDValue, 16> HiAndLoOps;
4234
// Odd indices of the split vector hold the high halves (little endian).
4235 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4236 for (unsigned I = 0; I != NElts; ++I) {
4237 HiOps[I] = HiAndLoOps[2 * I + 1];
4238 }
4239 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4240 } else {
4241 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4242 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4243 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4244 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4245 }
4246
// The new high half is all sign bits: either known all-ones, or computed
// as (sra Hi, 31). Hi is frozen so both uses observe the same value.
4247 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4248 SDValue HiShift;
4249 if (KnownLHS.isNegative()) {
4250 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4251 } else {
4252 Hi = DAG.getFreeze(Hi);
4253 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4254 }
4255 SDValue NewShift =
4256 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4257
4258 SDValue Vec;
4259 if (VT.isVector()) {
4260 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): SmallVector declarations for HiOps/LoOps on the preceding
// lines are missing from this excerpt.
4263 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4264
4265 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4266 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4267 for (unsigned I = 0; I != NElts; ++I) {
4268 HiAndLoOps[2 * I + 1] = HiOps[I];
4269 HiAndLoOps[2 * I] = LoOps[I];
4270 }
4271 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4272 } else {
4273 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4274 }
4275 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4276 }
4277
// SRL combine: fold srl-of-shifted-mask into and-of-srl to expose BFE
// patterns, and split wide (i64-element) logical right shifts by >= 32 into
// a 32-bit shift of the high half with a zero high result.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4279 DAGCombinerInfo &DCI) const {
4280 SDValue RHS = N->getOperand(1);
4281 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4282 EVT VT = N->getValueType(0);
4283 SDValue LHS = N->getOperand(0);
4284 SelectionDAG &DAG = DCI.DAG;
4285 SDLoc SL(N);
4286 unsigned RHSVal;
4287
4288 if (CRHS) {
4289 RHSVal = CRHS->getZExtValue();
4290
4291 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4292 // this improves the ability to match BFE patterns in isel.
4293 if (LHS.getOpcode() == ISD::AND) {
4294 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4295 unsigned MaskIdx, MaskLen;
// Only when the mask is a contiguous run starting exactly at the shift
// amount, so the shifted mask has no lost bits.
4296 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4297 MaskIdx == RHSVal) {
4298 return DAG.getNode(ISD::AND, SL, VT,
4299 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4300 N->getOperand(1)),
4301 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4302 N->getOperand(1)));
4303 }
4304 }
4305 }
4306 }
4307
4308 if (VT.getScalarType() != MVT::i64)
4309 return SDValue();
4310
4311 // for C >= 32
4312 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4313
4314 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4315 // common case, splitting this into a move and a 32-bit shift is faster and
4316 // the same code size.
4317 KnownBits Known = DAG.computeKnownBits(RHS);
4318
4319 EVT ElementType = VT.getScalarType();
4320 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4321 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4322 : TargetScalarType;
4323
// Only valid when the shift amount is provably >= 32.
4324 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4325 return SDValue();
4326
4327 SDValue ShiftAmt;
4328 if (CRHS) {
4329 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4330 TargetType);
4331 } else {
4332 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4333 const SDValue ShiftMask =
4334 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4335 // This AND instruction will clamp out of bounds shift values.
4336 // It will also be removed during later instruction selection.
4337 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4338 }
4339
4340 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4341 EVT ConcatType;
4342 SDValue Hi;
4343 SDLoc LHSSL(LHS);
4344 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4345 if (VT.isVector()) {
4346 unsigned NElts = TargetType.getVectorNumElements();
4347 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4348 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4349 SmallVector<SDValue, 8> HiOps(NElts);
4350 SmallVector<SDValue, 16> HiAndLoOps;
4351
// Odd indices of the split vector hold the high halves (little endian).
4352 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4353 for (unsigned I = 0; I != NElts; ++I)
4354 HiOps[I] = HiAndLoOps[2 * I + 1];
4355 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4356 } else {
4357 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4358 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4359 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4360 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4361 }
4362
4363 SDValue NewShift =
4364 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4365
4366 SDValue Vec;
4367 if (VT.isVector()) {
4368 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): a SmallVector declaration for LoOps on the preceding line
// is missing from this excerpt.
4370 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4371
4372 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4373 for (unsigned I = 0; I != NElts; ++I)
4374 HiAndLoOps[2 * I] = LoOps[I];
4375 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4376 } else {
4377 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4378 }
4379 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4380 }
4381
// TRUNCATE combine: peek through bitcasts of build_vectors to grab the
// element actually selected by the truncate, and shrink 64-bit shifts whose
// truncated result fits in 32 bits.
// NOTE(review): the first line of this method's signature is missing from
// this excerpt.
4383 SDNode *N, DAGCombinerInfo &DCI) const {
4384 SDLoc SL(N);
4385 SelectionDAG &DAG = DCI.DAG;
4386 EVT VT = N->getValueType(0);
4387 SDValue Src = N->getOperand(0);
4388
4389 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4390 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4391 SDValue Vec = Src.getOperand(0);
4392 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4393 SDValue Elt0 = Vec.getOperand(0);
4394 EVT EltVT = Elt0.getValueType();
// Only if the truncated result fits entirely within element 0.
4395 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4396 if (EltVT.isFloatingPoint()) {
4397 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4398 EltVT.changeTypeToInteger(), Elt0);
4399 }
4400
4401 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4402 }
4403 }
4404 }
4405
4406 // Equivalent of above for accessing the high element of a vector as an
4407 // integer operation.
4408 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4409 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4410 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4411 SDValue BV = stripBitcast(Src.getOperand(0));
4412 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4413 EVT SrcEltVT = BV.getOperand(0).getValueType();
4414 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4415 unsigned BitIndex = K->getZExtValue();
4416 unsigned PartIndex = BitIndex / SrcEltSize;
4417
// The shift amount must land exactly on an element boundary.
4418 if (PartIndex * SrcEltSize == BitIndex &&
4419 PartIndex < BV.getNumOperands()) {
4420 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4421 SDValue SrcElt =
4422 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4423 BV.getOperand(PartIndex));
4424 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4425 }
4426 }
4427 }
4428 }
4429 }
4430
4431 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4432 //
4433 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4434 // i16 (trunc (srl (i32 (trunc x), K)))
4435 if (VT.getScalarSizeInBits() < 32) {
4436 EVT SrcVT = Src.getValueType();
4437 if (SrcVT.getScalarSizeInBits() > 32 &&
4438 (Src.getOpcode() == ISD::SRL ||
4439 Src.getOpcode() == ISD::SRA ||
4440 Src.getOpcode() == ISD::SHL)) {
4441 SDValue Amt = Src.getOperand(1);
4442 KnownBits Known = DAG.computeKnownBits(Amt);
4443
4444 // - For left shifts, do the transform as long as the shift
4445 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4446 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4447 // losing information stored in the high bits when truncating.
4448 const unsigned MaxCstSize =
4449 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4450 if (Known.getMaxValue().ule(MaxCstSize)) {
4451 EVT MidVT = VT.isVector() ?
4452 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4453 VT.getVectorNumElements()) : MVT::i32;
4454
4455 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4456 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4457 Src.getOperand(0));
4458 DCI.AddToWorklist(Trunc.getNode());
4459
// The shift-amount operand must use the legal shift-amount type.
4460 if (Amt.getValueType() != NewShiftVT) {
4461 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4462 DCI.AddToWorklist(Amt.getNode());
4463 }
4464
4465 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4466 Trunc, Amt);
4467 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4468 }
4469 }
4470 }
4471
4472 return SDValue();
4473 }
4474
4475// We need to specifically handle i64 mul here to avoid unnecessary conversion
4476// instructions. If we only match on the legalized i64 mul expansion,
4477// SimplifyDemandedBits will be unable to remove them because there will be
4478// multiple uses due to the separate mul + mulh[su].
4479static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4480 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4481 if (Size <= 32) {
4482 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4483 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4484 }
4485
4486 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4487 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4488
4489 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4490 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4491
4492 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4493}
4494
4495/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4496/// return SDValue().
4497static SDValue getAddOneOp(const SDNode *V) {
4498 if (V->getOpcode() != ISD::ADD)
4499 return SDValue();
4500
4501 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4502}
4503
// MUL combine: undo InstCombine's x*(y+1) canonicalization to enable mad
// matching, and form 24-bit multiplies when both operands fit in 24 bits.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4505 DAGCombinerInfo &DCI) const {
4506 assert(N->getOpcode() == ISD::MUL);
4507 EVT VT = N->getValueType(0);
4508
4509 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4510 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4511 // unnecessarily). isDivergent() is used as an approximation of whether the
4512 // value is in an SGPR.
4513 if (!N->isDivergent())
4514 return SDValue();
4515
4516 unsigned Size = VT.getSizeInBits();
4517 if (VT.isVector() || Size > 64)
4518 return SDValue();
4519
4520 SelectionDAG &DAG = DCI.DAG;
4521 SDLoc DL(N);
4522
4523 SDValue N0 = N->getOperand(0);
4524 SDValue N1 = N->getOperand(1);
4525
4526 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4527 // matching.
4528
4529 // mul x, (add y, 1) -> add (mul x, y), x
// Only fold when the add's value is consumed exclusively by multiplies,
// so the add is not kept alive for another user.
4530 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4531 SDValue AddOp = getAddOneOp(V.getNode());
4532 if (!AddOp)
4533 return SDValue();
4534
4535 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4536 return U->getOpcode() == ISD::MUL;
4537 }))
4538 return AddOp;
4539
4540 return SDValue();
4541 };
4542
4543 // FIXME: The selection pattern is not properly checking for commuted
4544 // operands, so we have to place the mul in the LHS
4545 if (SDValue MulOper = IsFoldableAdd(N0)) {
4546 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4547 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4548 }
4549
4550 if (SDValue MulOper = IsFoldableAdd(N1)) {
4551 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4552 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4553 }
4554
4555 // There are i16 integer mul/mad.
4556 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4557 return SDValue();
4558
4559 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4560 // in the source into any_extends if the result of the mul is truncated. Since
4561 // we can assume the high bits are whatever we want, use the underlying value
4562 // to avoid the unknown high bits from interfering.
4563 if (N0.getOpcode() == ISD::ANY_EXTEND)
4564 N0 = N0.getOperand(0);
4565
4566 if (N1.getOpcode() == ISD::ANY_EXTEND)
4567 N1 = N1.getOperand(0);
4568
4569 SDValue Mul;
4570
// Prefer the unsigned form when both operands fit in 24 unsigned bits.
4571 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4572 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4573 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4574 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4575 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4576 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4577 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4578 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4579 } else {
4580 return SDValue();
4581 }
4582
4583 // We need to use sext even for MUL_U24, because MUL_U24 is used
4584 // for signed multiply of 8 and 16-bit types.
4585 return DAG.getSExtOrTrunc(Mul, DL, VT);
4586 }
4587
// [SU]MUL_LOHI combine: replace one slow extending multiply with two fast
// 24-bit multiplies (low and high halves) when both operands fit in 24 bits.
// NOTE(review): the middle line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4588 SDValue
4590 DAGCombinerInfo &DCI) const {
4591 if (N->getValueType(0) != MVT::i32)
4592 return SDValue();
4593
4594 SelectionDAG &DAG = DCI.DAG;
4595 SDLoc DL(N);
4596
4597 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4598 SDValue N0 = N->getOperand(0);
4599 SDValue N1 = N->getOperand(1);
4600
4601 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4602 // in the source into any_extends if the result of the mul is truncated. Since
4603 // we can assume the high bits are whatever we want, use the underlying value
4604 // to avoid the unknown high bits from interfering.
4605 if (N0.getOpcode() == ISD::ANY_EXTEND)
4606 N0 = N0.getOperand(0);
4607 if (N1.getOpcode() == ISD::ANY_EXTEND)
4608 N1 = N1.getOperand(0);
4609
4610 // Try to use two fast 24-bit multiplies (one for each half of the result)
4611 // instead of one slow extending multiply.
4612 unsigned LoOpcode = 0;
4613 unsigned HiOpcode = 0;
4614 if (Signed) {
4615 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4616 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4617 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4618 LoOpcode = AMDGPUISD::MUL_I24;
4619 HiOpcode = AMDGPUISD::MULHI_I24;
4620 }
4621 } else {
4622 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4623 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4624 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4625 LoOpcode = AMDGPUISD::MUL_U24;
4626 HiOpcode = AMDGPUISD::MULHI_U24;
4627 }
4628 }
// LoOpcode == 0 means neither 24-bit form applied.
4629 if (!LoOpcode)
4630 return SDValue();
4631
// Replace both results of the MUL_LOHI node at once.
4632 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4633 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4634 DCI.CombineTo(N, Lo, Hi);
4635 return SDValue(N, 0);
4636 }
4637
// MULHS combine: use MULHI_I24 when both operands fit in 24 signed bits.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4639 DAGCombinerInfo &DCI) const {
4640 EVT VT = N->getValueType(0);
4641
4642 if (!Subtarget->hasMulI24() || VT.isVector())
4643 return SDValue();
4644
4645 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4646 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4647 // unnecessarily). isDivergent() is used as an approximation of whether the
4648 // value is in an SGPR.
4649 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4650 // valu op anyway)
4651 if (Subtarget->hasSMulHi() && !N->isDivergent())
4652 return SDValue();
4653
4654 SelectionDAG &DAG = DCI.DAG;
4655 SDLoc DL(N);
4656
4657 SDValue N0 = N->getOperand(0);
4658 SDValue N1 = N->getOperand(1);
4659
4660 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4661 return SDValue();
4662
4663 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4664 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4665
4666 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4667 DCI.AddToWorklist(Mulhi.getNode());
// Result is sign-extended/truncated back to the original type.
4668 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4669 }
4670
// MULHU combine: use MULHI_U24 when both operands fit in 24 unsigned bits.
// NOTE(review): the first line of this method's signature (SDNode *N param)
// is missing from this excerpt.
4672 DAGCombinerInfo &DCI) const {
4673 EVT VT = N->getValueType(0);
4674
4675 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4676 return SDValue();
4677
4678 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4679 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4680 // unnecessarily). isDivergent() is used as an approximation of whether the
4681 // value is in an SGPR.
4682 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4683 // valu op anyway)
4684 if (Subtarget->hasSMulHi() && !N->isDivergent())
4685 return SDValue();
4686
4687 SelectionDAG &DAG = DCI.DAG;
4688 SDLoc DL(N);
4689
4690 SDValue N0 = N->getOperand(0);
4691 SDValue N1 = N->getOperand(1);
4692
4693 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4694 return SDValue();
4695
4696 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4697 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4698
4699 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4700 DCI.AddToWorklist(Mulhi.getNode());
// Result is zero-extended/truncated back to the original type.
4701 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4702 }
4703
4704SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4705 SDValue Op,
4706 const SDLoc &DL,
4707 unsigned Opc) const {
4708 EVT VT = Op.getValueType();
4709 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4710 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4711 LegalVT != MVT::i16))
4712 return SDValue();
4713
4714 if (VT != MVT::i32)
4715 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4716
4717 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4718 if (VT != MVT::i32)
4719 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4720
4721 return FFBX;
4722}
4723
4724 // The native instructions return -1 on 0 input. Optimize out a select that
4725 // produces -1 on 0.
4726 //
4727 // TODO: If zero is not undef, we could also do this if the output is compared
4728 // against the bitwidth.
4729 //
4730 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
// NOTE(review): the first line of this method's signature (SDLoc and Cond
// parameters) is missing from this excerpt.
4732 SDValue LHS, SDValue RHS,
4733 DAGCombinerInfo &DCI) const {
// Cond is expected to be a setcc against zero.
4734 if (!isNullConstant(Cond.getOperand(1)))
4735 return SDValue();
4736
4737 SelectionDAG &DAG = DCI.DAG;
4738 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4739 SDValue CmpLHS = Cond.getOperand(0);
4740
4741 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4742 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4743 if (CCOpcode == ISD::SETEQ &&
4744 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4745 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
// NOTE(review): the opcode-selecting expression (FFBH_U32 vs FFBL_U32) on
// the following line is missing from this excerpt.
4746 unsigned Opc =
4748 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4749 }
4750
4751 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4752 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4753 if (CCOpcode == ISD::SETNE &&
4754 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4755 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
// NOTE(review): the opcode-selecting expression on the following line is
// missing from this excerpt.
4756 unsigned Opc =
4758
4759 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4760 }
4761
4762 return SDValue();
4763 }
4764
// Build (Op (select Cond, N1.op0, N2.op0)): both select arms are assumed to
// be the same unary operation \p Op, which is hoisted out of the select.
// NOTE(review): the first line of this function's signature (the DCI
// parameter) is missing from this excerpt.
4766 unsigned Op,
4767 const SDLoc &SL,
4768 SDValue Cond,
4769 SDValue N1,
4770 SDValue N2) {
4771 SelectionDAG &DAG = DCI.DAG;
4772 EVT VT = N1.getValueType();
4773
// Select over the operands of the two arms, then re-apply Op once.
4774 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4775 N1.getOperand(0), N2.getOperand(0));
// Queue the new select so further combines can run on it.
4776 DCI.AddToWorklist(NewSelect.getNode());
4777 return DAG.getNode(Op, SL, VT, NewSelect);
4778 }
4779
4780// Pull a free FP operation out of a select so it may fold into uses.
4781//
4782// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4783// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4784//
4785// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4786// select c, (fabs x), +k -> fabs (select c, x, k)
4787SDValue
// NOTE(review): the line naming the method and its DCI parameter (4788) was
// lost in extraction; body preserved verbatim.
4789 SDValue N) const {
4790 SelectionDAG &DAG = DCI.DAG;
4791 SDValue Cond = N.getOperand(0);
4792 SDValue LHS = N.getOperand(1);
4793 SDValue RHS = N.getOperand(2);
4794
4795 EVT VT = N.getValueType();
// Both arms wrapped in the same free op: hoist it above the select.
4796 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4797 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
// NOTE(review): the guard condition on line 4798 was lost in extraction.
4799 return SDValue();
4800
4801 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4802 SDLoc(N), Cond, LHS, RHS);
4803 }
4804
// Canonicalize: put the fneg/fabs arm on the LHS, remembering the swap.
4805 bool Inv = false;
4806 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4807 std::swap(LHS, RHS);
4808 Inv = true;
4809 }
4810
4811 // TODO: Support vector constants.
4812 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4813 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4814 !selectSupportsSourceMods(N.getNode())) {
4815 SDLoc SL(N);
4816 // If one side is an fneg/fabs and the other is a constant, we can push the
4817 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4818 SDValue NewLHS = LHS.getOperand(0);
4819 SDValue NewRHS = RHS;
4820
4821 // Careful: if the neg can be folded up, don't try to pull it back down.
4822 bool ShouldFoldNeg = true;
4823
4824 if (NewLHS.hasOneUse()) {
4825 unsigned Opc = NewLHS.getOpcode();
4826 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4827 ShouldFoldNeg = false;
4828 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4829 ShouldFoldNeg = false;
4830 }
4831
4832 if (ShouldFoldNeg) {
// fabs of a negative constant arm cannot be represented this way; bail.
4833 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4834 return SDValue();
4835
4836 // We're going to be forced to use a source modifier anyway, there's no
4837 // point to pulling the negate out unless we can get a size reduction by
4838 // negating the constant.
4839 //
4840 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4841 // about cheaper constants.
4842 if (NewLHS.getOpcode() == ISD::FABS &&
// NOTE(review): the second conjunct (line 4843) was lost in extraction.
4844 return SDValue();
4845
// NOTE(review): an additional guard condition (line 4846) was lost in
// extraction.
4847 return SDValue();
4848
// For fneg the constant arm must be negated to keep the select's value.
4849 if (LHS.getOpcode() == ISD::FNEG)
4850 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4851
// Undo the earlier canonicalization swap before rebuilding the select.
4852 if (Inv)
4853 std::swap(NewLHS, NewRHS);
4854
4855 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4856 Cond, NewLHS, NewRHS);
4857 DCI.AddToWorklist(NewSelect.getNode());
4858 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4859 }
4860 }
4861
4862 return SDValue();
4863}
4864
// Combine an ISD::SELECT node: hoist free fneg/fabs ops, canonicalize
// constant arms to the false input, form legacy min/max, and fold
// ctlz/cttz selects into the native FFBH/FFBL instructions.
// NOTE(review): the signature line (4865) was lost in extraction.
4866 DAGCombinerInfo &DCI) const {
4867 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4868 return Folded;
4869
4870 SDValue Cond = N->getOperand(0);
4871 if (Cond.getOpcode() != ISD::SETCC)
4872 return SDValue();
4873
4874 EVT VT = N->getValueType(0);
4875 SDValue LHS = Cond.getOperand(0);
4876 SDValue RHS = Cond.getOperand(1);
4877 SDValue CC = Cond.getOperand(2);
4878
4879 SDValue True = N->getOperand(1);
4880 SDValue False = N->getOperand(2);
4881
4882 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4883 SelectionDAG &DAG = DCI.DAG;
4884 if (DAG.isConstantValueOfAnyType(True) &&
4885 !DAG.isConstantValueOfAnyType(False)) {
4886 // Swap cmp + select pair to move constant to false input.
4887 // This will allow using VOPC cndmasks more often.
4888 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4889
4890 SDLoc SL(N);
4891 ISD::CondCode NewCC =
4892 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4893
4894 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4895 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4896 }
4897
4898 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
// NOTE(review): the declaration introducing MinMax (line 4899) was lost in
// extraction.
4900 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4901 // Revisit this node so we can catch min3/max3/med3 patterns.
4902 //DCI.AddToWorklist(MinMax.getNode());
4903 return MinMax;
4904 }
4905 }
4906
4907 // There's no reason to not do this if the condition has other uses.
4908 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4909}
4910
4911static bool isInv2Pi(const APFloat &APF) {
4912 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4913 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4914 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4915
4916 return APF.bitwiseIsEqual(KF16) ||
4917 APF.bitwiseIsEqual(KF32) ||
4918 APF.bitwiseIsEqual(KF64);
4919}
4920
4921// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4922// additional cost to negate them.
// NOTE(review): the signature lines (4923-4924, taking the ConstantFPSDNode
// being negated) and the final default-cost return (line 4931) were lost in
// extraction; the visible body is preserved verbatim.
4925 if (C->isZero())
4926 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4927
// 1/(2*pi) only behaves like an inline immediate on subtargets with the
// inv2pi inline-immediate feature.
4928 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4929 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4930
4932}
4933
// NOTE(review): tail of a predicate whose signature and preceding body lines
// (4934-4936) were lost in extraction; only its `return false` survives here.
4937 return false;
4938}
4939
// NOTE(review): tail of another predicate whose signature and preceding body
// lines (4939-4942) were lost in extraction; only its `return false` remains.
4943 return false;
4944}
4945
// Map a floating-point min/max opcode to its dual (min <-> max) within the
// same NaN-semantics family. Unreachable for any non-min/max opcode.
4946static unsigned inverseMinMax(unsigned Opc) {
4947 switch (Opc) {
4948 case ISD::FMAXNUM:
4949 return ISD::FMINNUM;
4950 case ISD::FMINNUM:
4951 return ISD::FMAXNUM;
4952 case ISD::FMAXNUM_IEEE:
4953 return ISD::FMINNUM_IEEE;
4954 case ISD::FMINNUM_IEEE:
4955 return ISD::FMAXNUM_IEEE;
4956 case ISD::FMAXIMUM:
4957 return ISD::FMINIMUM;
4958 case ISD::FMINIMUM:
4959 return ISD::FMAXIMUM;
4960 case ISD::FMAXIMUMNUM:
4961 return ISD::FMINIMUMNUM;
4962 case ISD::FMINIMUMNUM:
4963 return ISD::FMAXIMUMNUM;
// NOTE(review): the cases for the AMDGPU legacy min/max opcodes (lines
// 4964-4967) were lost in extraction.
4968 default:
4969 llvm_unreachable("invalid min/max opcode");
4970 }
4971}
4972
4973/// \return true if it's profitable to try to push an fneg into its source
4974/// instruction.
// NOTE(review): the signature line (4975, taking the fneg node N and its
// source value N0) was lost in extraction; body preserved verbatim.
4976 // If the input has multiple uses and we can either fold the negate down, or
4977 // the other uses cannot, give up. This both prevents unprofitable
4978 // transformations and infinite loops: we won't repeatedly try to fold around
4979 // a negate that has no 'good' form.
4980 if (N0.hasOneUse()) {
4981 // This may be able to fold into the source, but at a code size cost. Don't
4982 // fold if the fold into the user is free.
4983 if (allUsesHaveSourceMods(N, 0))
4984 return false;
4985 } else {
// NOTE(review): the second conjunct of this condition (line 4987) was lost
// in extraction.
4986 if (fnegFoldsIntoOp(N0.getNode()) &&
4988 return false;
4989 }
4990
4991 return true;
4992}
4993
// Combine (fneg x) by pushing the negate into x's defining operation, where
// source modifiers or opcode inversion make the negation free.
// NOTE(review): the signature line (4994) was lost in extraction.
4995 DAGCombinerInfo &DCI) const {
4996 SelectionDAG &DAG = DCI.DAG;
4997 SDValue N0 = N->getOperand(0);
4998 EVT VT = N->getValueType(0);
4999
5000 unsigned Opc = N0.getOpcode();
5001
5002 if (!shouldFoldFNegIntoSrc(N, N0))
5003 return SDValue();
5004
5005 SDLoc SL(N);
5006 switch (Opc) {
5007 case ISD::FADD: {
5008 if (!mayIgnoreSignedZero(N0))
5009 return SDValue();
5010
5011 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5012 SDValue LHS = N0.getOperand(0);
5013 SDValue RHS = N0.getOperand(1);
5014
// Double negation cancels; otherwise wrap the operand in a new fneg.
5015 if (LHS.getOpcode() != ISD::FNEG)
5016 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5017 else
5018 LHS = LHS.getOperand(0);
5019
5020 if (RHS.getOpcode() != ISD::FNEG)
5021 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5022 else
5023 RHS = RHS.getOperand(0);
5024
5025 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5026 if (Res.getOpcode() != ISD::FADD)
5027 return SDValue(); // Op got folded away.
// Other users of N0 still need the original value: give them fneg(Res).
5028 if (!N0.hasOneUse())
5029 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5030 return Res;
5031 }
5032 case ISD::FMUL:
// NOTE(review): a second case label and opening brace (line 5033, the
// legacy-fmul opcode) were lost in extraction.
5034 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5035 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5036 SDValue LHS = N0.getOperand(0);
5037 SDValue RHS = N0.getOperand(1);
5038
// Negating exactly one factor flips the product's sign.
5039 if (LHS.getOpcode() == ISD::FNEG)
5040 LHS = LHS.getOperand(0);
5041 else if (RHS.getOpcode() == ISD::FNEG)
5042 RHS = RHS.getOperand(0);
5043 else
5044 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5045
5046 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5047 if (Res.getOpcode() != Opc)
5048 return SDValue(); // Op got folded away.
5049 if (!N0.hasOneUse())
5050 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5051 return Res;
5052 }
5053 case ISD::FMA:
5054 case ISD::FMAD: {
5055 // TODO: handle llvm.amdgcn.fma.legacy
5056 if (!mayIgnoreSignedZero(N0))
5057 return SDValue();
5058
5059 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5060 SDValue LHS = N0.getOperand(0);
5061 SDValue MHS = N0.getOperand(1);
5062 SDValue RHS = N0.getOperand(2);
5063
5064 if (LHS.getOpcode() == ISD::FNEG)
5065 LHS = LHS.getOperand(0);
5066 else if (MHS.getOpcode() == ISD::FNEG)
5067 MHS = MHS.getOperand(0);
5068 else
5069 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5070
// The addend must always be negated (double negation cancels).
5071 if (RHS.getOpcode() != ISD::FNEG)
5072 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5073 else
5074 RHS = RHS.getOperand(0);
5075
5076 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5077 if (Res.getOpcode() != Opc)
5078 return SDValue(); // Op got folded away.
5079 if (!N0.hasOneUse())
5080 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5081 return Res;
5082 }
5083 case ISD::FMAXNUM:
5084 case ISD::FMINNUM:
5085 case ISD::FMAXNUM_IEEE:
5086 case ISD::FMINNUM_IEEE:
5087 case ISD::FMINIMUM:
5088 case ISD::FMAXIMUM:
5089 case ISD::FMINIMUMNUM:
5090 case ISD::FMAXIMUMNUM:
// NOTE(review): the legacy min/max case labels and opening brace (lines
// 5091-5092) were lost in extraction.
5093 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5094 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5095 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5096 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5097
5098 SDValue LHS = N0.getOperand(0);
5099 SDValue RHS = N0.getOperand(1);
5100
5101 // 0 doesn't have a negated inline immediate.
5102 // TODO: This constant check should be generalized to other operations.
// NOTE(review): the condition guarding this early return (line 5103) was
// lost in extraction.
5104 return SDValue();
5105
5106 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5107 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
// Negating both operands requires flipping min <-> max.
5108 unsigned Opposite = inverseMinMax(Opc);
5109
5110 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5111 if (Res.getOpcode() != Opposite)
5112 return SDValue(); // Op got folded away.
5113 if (!N0.hasOneUse())
5114 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5115 return Res;
5116 }
5117 case AMDGPUISD::FMED3: {
// fneg distributes over all three med3 operands.
5118 SDValue Ops[3];
5119 for (unsigned I = 0; I < 3; ++I)
5120 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5121
5122 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5123 if (Res.getOpcode() != AMDGPUISD::FMED3)
5124 return SDValue(); // Op got folded away.
5125
5126 if (!N0.hasOneUse()) {
5127 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5128 DAG.ReplaceAllUsesWith(N0, Neg);
5129
5130 for (SDNode *U : Neg->users())
5131 DCI.AddToWorklist(U);
5132 }
5133
5134 return Res;
5135 }
5136 case ISD::FP_EXTEND:
5137 case ISD::FTRUNC:
5138 case ISD::FRINT:
5139 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5140 case ISD::FROUNDEVEN:
5141 case ISD::FSIN:
5142 case ISD::FCANONICALIZE:
5143 case AMDGPUISD::RCP:
// NOTE(review): two additional AMDGPU case labels (lines 5144-5145) were
// lost in extraction.
5146 case AMDGPUISD::SIN_HW: {
5147 SDValue CvtSrc = N0.getOperand(0);
5148 if (CvtSrc.getOpcode() == ISD::FNEG) {
5149 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5150 // (fneg (rcp (fneg x))) -> (rcp x)
5151 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5152 }
5153
5154 if (!N0.hasOneUse())
5155 return SDValue();
5156
5157 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5158 // (fneg (rcp x)) -> (rcp (fneg x))
5159 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5160 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5161 }
5162 case ISD::FP_ROUND: {
5163 SDValue CvtSrc = N0.getOperand(0);
5164
5165 if (CvtSrc.getOpcode() == ISD::FNEG) {
5166 // (fneg (fp_round (fneg x))) -> (fp_round x)
5167 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5168 CvtSrc.getOperand(0), N0.getOperand(1));
5169 }
5170
5171 if (!N0.hasOneUse())
5172 return SDValue();
5173
5174 // (fneg (fp_round x)) -> (fp_round (fneg x))
5175 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5176 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5177 }
5178 case ISD::FP16_TO_FP: {
5179 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5180 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5181 // Put the fneg back as a legal source operation that can be matched later.
5182 SDLoc SL(N);
5183
5184 SDValue Src = N0.getOperand(0);
5185 EVT SrcVT = Src.getValueType();
5186
5187 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5188 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5189 DAG.getConstant(0x8000, SL, SrcVT));
5190 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5191 }
5192 case ISD::SELECT: {
5193 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5194 // TODO: Invert conditions of foldFreeOpFromSelect
5195 return SDValue();
5196 }
5197 case ISD::BITCAST: {
5198 SDLoc SL(N);
5199 SDValue BCSrc = N0.getOperand(0);
5200 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5201 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5202 if (HighBits.getValueType().getSizeInBits() != 32 ||
5203 !fnegFoldsIntoOp(HighBits.getNode()))
5204 return SDValue();
5205
5206 // f64 fneg only really needs to operate on the high half of of the
5207 // register, so try to force it to an f32 operation to help make use of
5208 // source modifiers.
5209 //
5210 //
5211 // fneg (f64 (bitcast (build_vector x, y))) ->
5212 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5213 // (fneg (bitcast i32:y to f32)))
5214
5215 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5216 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5217 SDValue CastBack =
5218 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5219
// Rebuild the vector with only the last (sign-carrying) element replaced.
5220 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5221 Ops.back() = CastBack;
5222 DCI.AddToWorklist(NegHi.getNode());
5223 SDValue Build =
5224 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5225 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5226
5227 if (!N0.hasOneUse())
5228 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5229 return Result;
5230 }
5231
5232 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5233 BCSrc.hasOneUse()) {
5234 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5235 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5236
5237 // TODO: Cast back result for multiple uses is beneficial in some cases.
5238
5239 SDValue LHS =
5240 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5241 SDValue RHS =
5242 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5243
5244 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5245 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5246
5247 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5248 NegRHS);
5249 }
5250
5251 return SDValue();
5252 }
5253 default:
5254 return SDValue();
5255 }
5256}
5257
// Combine (fabs x): for an illegal-f16 fp16_to_fp source, perform the abs as
// an integer mask on the half bits so it can be matched as a source modifier.
// NOTE(review): the signature line (5258) was lost in extraction.
5259 DAGCombinerInfo &DCI) const {
5260 SelectionDAG &DAG = DCI.DAG;
5261 SDValue N0 = N->getOperand(0);
5262
5263 if (!N0.hasOneUse())
5264 return SDValue();
5265
5266 switch (N0.getOpcode()) {
5267 case ISD::FP16_TO_FP: {
5268 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5269 SDLoc SL(N);
5270 SDValue Src = N0.getOperand(0);
5271 EVT SrcVT = Src.getValueType();
5272
5273 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5274 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5275 DAG.getConstant(0x7fff, SL, SrcVT));
5276 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5277 }
5278 default:
5279 return SDValue();
5280 }
5281}
5282
// Constant-fold rcp(k) for a floating-point constant operand into 1/k.
// NOTE(review): the signature line (5283) was lost in extraction.
5284 DAGCombinerInfo &DCI) const {
5285 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5286 if (!CFP)
5287 return SDValue();
5288
5289 // XXX - Should this flush denormals?
5290 const APFloat &Val = CFP->getValueAPF();
5291 APFloat One(Val.getSemantics(), "1.0");
5292 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5293}
5294
// Target DAG-combine dispatcher for the AMDGPU common lowering: routes each
// node opcode to its dedicated perform*Combine helper and handles a few
// combines (bitcast-of-constant, BFE folding, FMAD_FTZ folding) inline.
// NOTE(review): the signature line (5295) was lost in extraction.
5296 DAGCombinerInfo &DCI) const {
5297 SelectionDAG &DAG = DCI.DAG;
5298 SDLoc DL(N);
5299
5300 switch(N->getOpcode()) {
5301 default:
5302 break;
5303 case ISD::BITCAST: {
5304 EVT DestVT = N->getValueType(0);
5305
5306 // Push casts through vector builds. This helps avoid emitting a large
5307 // number of copies when materializing floating point vector constants.
5308 //
5309 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5310 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5311 if (DestVT.isVector()) {
5312 SDValue Src = N->getOperand(0);
5313 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
// NOTE(review): the remaining conjuncts of this condition (lines 5314-5315)
// were lost in extraction.
5316 EVT SrcVT = Src.getValueType();
5317 unsigned NElts = DestVT.getVectorNumElements();
5318
5319 if (SrcVT.getVectorNumElements() == NElts) {
5320 EVT DestEltVT = DestVT.getVectorElementType();
5321
5322 SmallVector<SDValue, 8> CastedElts;
5323 SDLoc SL(N);
5324 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5325 SDValue Elt = Src.getOperand(I);
5326 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5327 }
5328
5329 return DAG.getBuildVector(DestVT, SL, CastedElts);
5330 }
5331 }
5332 }
5333
5334 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5335 break;
5336
5337 // Fold bitcasts of constants.
5338 //
5339 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5340 // TODO: Generalize and move to DAGCombiner
5341 SDValue Src = N->getOperand(0);
5342 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5343 SDLoc SL(N);
5344 uint64_t CVal = C->getZExtValue();
5345 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5346 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5347 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5348 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5349 }
5350
// Same folding for FP constants, going through the raw bit pattern.
5351 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5352 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5353 SDLoc SL(N);
5354 uint64_t CVal = Val.getZExtValue();
5355 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5356 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5357 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5358
5359 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5360 }
5361
5362 break;
5363 }
5364 case ISD::SHL:
5365 case ISD::SRA:
5366 case ISD::SRL: {
5367 // Range metadata can be invalidated when loads are converted to legal types
5368 // (e.g. v2i64 -> v4i32).
5369 // Try to convert vector shl/sra/srl before type legalization so that range
5370 // metadata can be utilized.
5371 if (!(N->getValueType(0).isVector() &&
// NOTE(review): the remaining conjuncts of this condition (lines 5372-5373)
// were lost in extraction.
5374 break;
5375 if (N->getOpcode() == ISD::SHL)
5376 return performShlCombine(N, DCI);
5377 if (N->getOpcode() == ISD::SRA)
5378 return performSraCombine(N, DCI);
5379 return performSrlCombine(N, DCI);
5380 }
5381 case ISD::TRUNCATE:
5382 return performTruncateCombine(N, DCI);
5383 case ISD::MUL:
5384 return performMulCombine(N, DCI);
5385 case AMDGPUISD::MUL_U24:
5386 case AMDGPUISD::MUL_I24: {
5387 if (SDValue Simplified = simplifyMul24(N, DCI))
5388 return Simplified;
5389 break;
5390 }
// NOTE(review): two additional mul24-family case labels (lines 5391-5392)
// were lost in extraction.
5393 return simplifyMul24(N, DCI);
5394 case ISD::SMUL_LOHI:
5395 case ISD::UMUL_LOHI:
5396 return performMulLoHiCombine(N, DCI);
5397 case ISD::MULHS:
5398 return performMulhsCombine(N, DCI);
5399 case ISD::MULHU:
5400 return performMulhuCombine(N, DCI);
5401 case ISD::SELECT:
5402 return performSelectCombine(N, DCI);
5403 case ISD::FNEG:
5404 return performFNegCombine(N, DCI);
5405 case ISD::FABS:
5406 return performFAbsCombine(N, DCI);
5407 case AMDGPUISD::BFE_I32:
5408 case AMDGPUISD::BFE_U32: {
5409 assert(!N->getValueType(0).isVector() &&
5410 "Vector handling of BFE not implemented");
5411 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5412 if (!Width)
5413 break;
5414
// Hardware only honors the low 5 bits of the width/offset operands.
5415 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5416 if (WidthVal == 0)
5417 return DAG.getConstant(0, DL, MVT::i32);
5418
5419 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5420 if (!Offset)
5421 break;
5422
5423 SDValue BitsFrom = N->getOperand(0);
5424 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5425
5426 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5427
5428 if (OffsetVal == 0) {
5429 // This is already sign / zero extended, so try to fold away extra BFEs.
5430 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5431
5432 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5433 if (OpSignBits >= SignBits)
5434 return BitsFrom;
5435
5436 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5437 if (Signed) {
5438 // This is a sign_extend_inreg. Replace it to take advantage of existing
5439 // DAG Combines. If not eliminated, we will match back to BFE during
5440 // selection.
5441
5442 // TODO: The sext_inreg of extended types ends, although we can could
5443 // handle them in a single BFE.
5444 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5445 DAG.getValueType(SmallVT));
5446 }
5447
5448 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5449 }
5450
// Fully constant input: evaluate the bitfield extract at compile time.
5451 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5452 if (Signed) {
5453 return constantFoldBFE<int32_t>(DAG,
5454 CVal->getSExtValue(),
5455 OffsetVal,
5456 WidthVal,
5457 DL);
5458 }
5459
5460 return constantFoldBFE<uint32_t>(DAG,
5461 CVal->getZExtValue(),
5462 OffsetVal,
5463 WidthVal,
5464 DL);
5465 }
5466
// Extract reaching the top bit reduces to a plain arithmetic/logical shift.
5467 if ((OffsetVal + WidthVal) >= 32 &&
5468 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5469 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5470 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5471 BitsFrom, ShiftVal);
5472 }
5473
5474 if (BitsFrom.hasOneUse()) {
5475 APInt Demanded = APInt::getBitsSet(32,
5476 OffsetVal,
5477 OffsetVal + WidthVal);
5478
5479 KnownBits Known;
// NOTE(review): the TargetLoweringOpt construction (line 5480) was lost in
// extraction.
5481 !DCI.isBeforeLegalizeOps());
5482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5483 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5484 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5485 DCI.CommitTargetLoweringOpt(TLO);
5486 }
5487 }
5488
5489 break;
5490 }
5491 case ISD::LOAD:
5492 return performLoadCombine(N, DCI);
5493 case ISD::STORE:
5494 return performStoreCombine(N, DCI);
5495 case AMDGPUISD::RCP:
// NOTE(review): an additional RCP-family case label (line 5496) was lost in
// extraction.
5497 return performRcpCombine(N, DCI);
5498 case ISD::AssertZext:
5499 case ISD::AssertSext:
5500 return performAssertSZExtCombine(N, DCI);
// NOTE(review): the intrinsic-without-chain case label (line 5501) was lost
// in extraction.
5502 return performIntrinsicWOChainCombine(N, DCI);
5503 case AMDGPUISD::FMAD_FTZ: {
5504 SDValue N0 = N->getOperand(0);
5505 SDValue N1 = N->getOperand(1);
5506 SDValue N2 = N->getOperand(2);
5507 EVT VT = N->getValueType(0);
5508
5509 // FMAD_FTZ is a FMAD + flush denormals to zero.
5510 // We flush the inputs, the intermediate step, and the output.
5511 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5512 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5513 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5514 if (N0CFP && N1CFP && N2CFP) {
// Denormals are replaced with a correctly-signed zero.
5515 const auto FTZ = [](const APFloat &V) {
5516 if (V.isDenormal()) {
5517 APFloat Zero(V.getSemantics(), 0);
5518 return V.isNegative() ? -Zero : Zero;
5519 }
5520 return V;
5521 };
5522
5523 APFloat V0 = FTZ(N0CFP->getValueAPF());
5524 APFloat V1 = FTZ(N1CFP->getValueAPF());
5525 APFloat V2 = FTZ(N2CFP->getValueAPF());
// NOTE(review): the multiply step (line 5526) was lost in extraction.
5527 V0 = FTZ(V0);
// NOTE(review): the add step (line 5528) was lost in extraction.
5529 return DAG.getConstantFP(FTZ(V0), DL, VT);
5530 }
5531 break;
5532 }
5533 }
5534 return SDValue();
5535}
5536
5537//===----------------------------------------------------------------------===//
5538// Helper functions
5539//===----------------------------------------------------------------------===//
5540
// Materialize a physical register argument as a virtual-register live-in,
// reusing an existing live-in mapping when one already exists. With RawReg
// set, return the bare register node instead of a copy from it.
// NOTE(review): the signature start (line 5541) and the MachineFunction/
// MachineRegisterInfo setup (lines 5546-5547) were lost in extraction.
5542 const TargetRegisterClass *RC,
5543 Register Reg, EVT VT,
5544 const SDLoc &SL,
5545 bool RawReg) const {
5548 Register VReg;
5549
5550 if (!MRI.isLiveIn(Reg)) {
5551 VReg = MRI.createVirtualRegister(RC);
5552 MRI.addLiveIn(Reg, VReg);
5553 } else {
5554 VReg = MRI.getLiveInVirtReg(Reg);
5555 }
5556
5557 if (RawReg)
5558 return DAG.getRegister(VReg, VT);
5559
5560 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5561}
5562
5563// This may be called multiple times, and nothing prevents creating multiple
5564// objects at the same offset. See if we already defined this object.
// NOTE(review): the first line of the signature (5565, static helper taking
// MachineFrameInfo and Size) was lost in extraction.
5566 int64_t Offset) {
// Scan existing fixed objects (negative frame indices) for a match.
5567 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5568 if (MFI.getObjectOffset(I) == Offset) {
5569 assert(MFI.getObjectSize(I) == Size);
5570 return I;
5571 }
5572 }
5573
5574 return MFI.CreateFixedObject(Size, Offset, true);
5575}
5576
// Load a stack-passed input value from a fixed stack object at \p Offset,
// creating (or reusing) the fixed object and loading through a frame index.
// NOTE(review): the signature start (line 5577), the MachineFunction line
// (5581), and the trailing MachineMemOperand flags of the getLoad call
// (lines 5589-5590) were lost in extraction.
5578 EVT VT,
5579 const SDLoc &SL,
5580 int64_t Offset) const {
5582 MachineFrameInfo &MFI = MF.getFrameInfo();
5583 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5584
5585 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5586 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5587
5588 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5591}
5592
// Store \p ArgVal to the outgoing-argument stack area at \p Offset, which is
// addressed relative to the stack pointer register.
// NOTE(review): the signature start (line 5593), the function-info and
// pointer-info setup (lines 5598-5600), and the trailing memory-operand
// flags of the getStore call (line 5608) were lost in extraction.
5594 const SDLoc &SL,
5595 SDValue Chain,
5596 SDValue ArgVal,
5597 int64_t Offset) const {
5601
5602 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5603 // Stores to the argument stack area are relative to the stack pointer.
5604 SDValue SP =
5605 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5606 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5607 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5609 return Store;
5610}
5611
// Load an ABI input described by \p Arg: from its register if it was passed
// in one, otherwise from its stack slot; then, for packed (masked) args,
// shift and mask out the field this descriptor covers.
// NOTE(review): the signature start (line 5612) was lost in extraction.
5613 const TargetRegisterClass *RC,
5614 EVT VT, const SDLoc &SL,
5615 const ArgDescriptor &Arg) const {
5616 assert(Arg && "Attempting to load missing argument");
5617
5618 SDValue V = Arg.isRegister() ?
5619 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5620 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5621
5622 if (!Arg.isMasked())
5623 return V;
5624
// Shift the field down by the mask's trailing zeros, then mask the width.
5625 unsigned Mask = Arg.getMask();
5626 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5627 V = DAG.getNode(ISD::SRL, SL, VT, V,
5628 DAG.getShiftAmountConstant(Shift, VT, SL));
5629 return DAG.getNode(ISD::AND, SL, VT, V,
5630 DAG.getConstant(Mask >> Shift, SL, VT));
5631}
5632
// Compute the byte offset of an implicit kernel parameter: the explicit
// kernarg segment (aligned up for the implicit-arg pointer) plus the
// parameter's slot offset within the implicit-arg block.
// NOTE(review): the signature start (line 5633) and the PRIVATE_BASE return
// expression (line 5643) were lost in extraction.
5634 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5635 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5636 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5637 uint64_t ArgOffset =
5638 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5639 switch (Param) {
5640 case FIRST_IMPLICIT:
5641 return ArgOffset;
5642 case PRIVATE_BASE:
5644 case SHARED_BASE:
5645 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5646 case QUEUE_PTR:
5647 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5648 }
5649 llvm_unreachable("unexpected implicit parameter type");
5650}
5651
// MachineFunction-based overload of getImplicitParameterOffset.
// NOTE(review): the signature start (line 5652) and the body (lines
// 5654-5655) were lost in extraction; presumably it forwarded to the
// size-based overload above -- confirm against upstream source.
5653 const MachineFunction &MF, const ImplicitParameter Param) const {
5656}
5657
5658#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5659
// Return the printable name for an AMDGPU target-specific SelectionDAG node
// opcode, or nullptr for opcodes with no dedicated name. Each entry is
// generated by the NODE_NAME_CASE macro defined above.
5660const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5661 switch ((AMDGPUISD::NodeType)Opcode) {
5662 case AMDGPUISD::FIRST_NUMBER: break;
5663 // AMDIL DAG nodes
5664 NODE_NAME_CASE(BRANCH_COND);
5665
5666 // AMDGPU DAG nodes
5667 NODE_NAME_CASE(IF)
5668 NODE_NAME_CASE(ELSE)
5669 NODE_NAME_CASE(LOOP)
5670 NODE_NAME_CASE(CALL)
5671 NODE_NAME_CASE(TC_RETURN)
5672 NODE_NAME_CASE(TC_RETURN_GFX)
5673 NODE_NAME_CASE(TC_RETURN_CHAIN)
5674 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
5675 NODE_NAME_CASE(TRAP)
5676 NODE_NAME_CASE(RET_GLUE)
5677 NODE_NAME_CASE(WAVE_ADDRESS)
5678 NODE_NAME_CASE(RETURN_TO_EPILOG)
5679 NODE_NAME_CASE(ENDPGM)
5680 NODE_NAME_CASE(ENDPGM_TRAP)
5681 NODE_NAME_CASE(SIMULATED_TRAP)
5682 NODE_NAME_CASE(DWORDADDR)
5683 NODE_NAME_CASE(FRACT)
5684 NODE_NAME_CASE(SETCC)
5685 NODE_NAME_CASE(DENORM_MODE)
5686 NODE_NAME_CASE(FMA_W_CHAIN)
5687 NODE_NAME_CASE(FMUL_W_CHAIN)
5688 NODE_NAME_CASE(CLAMP)
5689 NODE_NAME_CASE(COS_HW)
5690 NODE_NAME_CASE(SIN_HW)
5691 NODE_NAME_CASE(FMAX_LEGACY)
5692 NODE_NAME_CASE(FMIN_LEGACY)
5693 NODE_NAME_CASE(FMAX3)
5694 NODE_NAME_CASE(SMAX3)
5695 NODE_NAME_CASE(UMAX3)
5696 NODE_NAME_CASE(FMIN3)
5697 NODE_NAME_CASE(SMIN3)
5698 NODE_NAME_CASE(UMIN3)
5699 NODE_NAME_CASE(FMED3)
5700 NODE_NAME_CASE(SMED3)
5701 NODE_NAME_CASE(UMED3)
5702 NODE_NAME_CASE(FMAXIMUM3)
5703 NODE_NAME_CASE(FMINIMUM3)
5704 NODE_NAME_CASE(FDOT2)
5705 NODE_NAME_CASE(URECIP)
5706 NODE_NAME_CASE(DIV_SCALE)
5707 NODE_NAME_CASE(DIV_FMAS)
5708 NODE_NAME_CASE(DIV_FIXUP)
5709 NODE_NAME_CASE(FMAD_FTZ)
5710 NODE_NAME_CASE(RCP)
5711 NODE_NAME_CASE(RSQ)
5712 NODE_NAME_CASE(RCP_LEGACY)
5713 NODE_NAME_CASE(RCP_IFLAG)
5714 NODE_NAME_CASE(LOG)
5715 NODE_NAME_CASE(EXP)
5716 NODE_NAME_CASE(FMUL_LEGACY)
5717 NODE_NAME_CASE(RSQ_CLAMP)
5718 NODE_NAME_CASE(FP_CLASS)
5719 NODE_NAME_CASE(DOT4)
5720 NODE_NAME_CASE(CARRY)
5721 NODE_NAME_CASE(BORROW)
5722 NODE_NAME_CASE(BFE_U32)
5723 NODE_NAME_CASE(BFE_I32)
5724 NODE_NAME_CASE(BFI)
5725 NODE_NAME_CASE(BFM)
5726 NODE_NAME_CASE(FFBH_U32)
5727 NODE_NAME_CASE(FFBH_I32)
5728 NODE_NAME_CASE(FFBL_B32)
5729 NODE_NAME_CASE(MUL_U24)
5730 NODE_NAME_CASE(MUL_I24)
5731 NODE_NAME_CASE(MULHI_U24)
5732 NODE_NAME_CASE(MULHI_I24)
5733 NODE_NAME_CASE(MAD_U24)
5734 NODE_NAME_CASE(MAD_I24)
5735 NODE_NAME_CASE(MAD_I64_I32)
5736 NODE_NAME_CASE(MAD_U64_U32)
5737 NODE_NAME_CASE(PERM)
5738 NODE_NAME_CASE(TEXTURE_FETCH)
5739 NODE_NAME_CASE(R600_EXPORT)
5740 NODE_NAME_CASE(CONST_ADDRESS)
5741 NODE_NAME_CASE(REGISTER_LOAD)
5742 NODE_NAME_CASE(REGISTER_STORE)
5743 NODE_NAME_CASE(CVT_F32_UBYTE0)
5744 NODE_NAME_CASE(CVT_F32_UBYTE1)
5745 NODE_NAME_CASE(CVT_F32_UBYTE2)
5746 NODE_NAME_CASE(CVT_F32_UBYTE3)
5747 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5748 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5749 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5750 NODE_NAME_CASE(CVT_PK_I16_I32)
5751 NODE_NAME_CASE(CVT_PK_U16_U32)
5752 NODE_NAME_CASE(FP_TO_FP16)
5753 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5754 NODE_NAME_CASE(CONST_DATA_PTR)
5755 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5756 NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
// NOTE(review): one NODE_NAME_CASE entry (line 5757) was lost in extraction.
5758 NODE_NAME_CASE(DUMMY_CHAIN)
5759 NODE_NAME_CASE(LOAD_D16_HI)
5760 NODE_NAME_CASE(LOAD_D16_LO)
5761 NODE_NAME_CASE(LOAD_D16_HI_I8)
5762 NODE_NAME_CASE(LOAD_D16_HI_U8)
5763 NODE_NAME_CASE(LOAD_D16_LO_I8)
5764 NODE_NAME_CASE(LOAD_D16_LO_U8)
5765 NODE_NAME_CASE(STORE_MSKOR)
5766 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5767 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5768 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5769 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5770 NODE_NAME_CASE(DS_ORDERED_COUNT)
5771 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5772 NODE_NAME_CASE(BUFFER_LOAD)
5773 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5774 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5775 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5776 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5777 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5778 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5779 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5780 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5781 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5782 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5783 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5784 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5785 NODE_NAME_CASE(SBUFFER_LOAD)
5786 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5787 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5788 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5789 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5790 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5791 NODE_NAME_CASE(BUFFER_STORE)
5792 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5793 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5794 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5795 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5796 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5797 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5798 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5799 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5800 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5801 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5802 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5803 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5804 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5805 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5806 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5807 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5808 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5809 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5810 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5811 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5812 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5813 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5814 NODE_NAME_CASE(WHOLE_WAVE_SETUP)
5815 NODE_NAME_CASE(WHOLE_WAVE_RETURN)
5816 }
5817 return nullptr;
5818}
5819
5821 SelectionDAG &DAG, int Enabled,
5822 int &RefinementSteps,
5823 bool &UseOneConstNR,
5824 bool Reciprocal) const {
5825 EVT VT = Operand.getValueType();
5826
5827 if (VT == MVT::f32) {
5828 RefinementSteps = 0;
5829 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5830 }
5831
5832 // TODO: There is also f64 rsq instruction, but the documentation is less
5833 // clear on its precision.
5834
5835 return SDValue();
5836}
5837
5839 SelectionDAG &DAG, int Enabled,
5840 int &RefinementSteps) const {
5841 EVT VT = Operand.getValueType();
5842
5843 if (VT == MVT::f32) {
5844 // Reciprocal, < 1 ulp error.
5845 //
5846 // This reciprocal approximation converges to < 0.5 ulp error with one
5847 // newton rhapson performed with two fused multiple adds (FMAs).
5848
5849 RefinementSteps = 0;
5850 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5851 }
5852
5853 // TODO: There is also f64 rcp instruction, but the documentation is less
5854 // clear on its precision.
5855
5856 return SDValue();
5857}
5858
5859static unsigned workitemIntrinsicDim(unsigned ID) {
5860 switch (ID) {
5861 case Intrinsic::amdgcn_workitem_id_x:
5862 return 0;
5863 case Intrinsic::amdgcn_workitem_id_y:
5864 return 1;
5865 case Intrinsic::amdgcn_workitem_id_z:
5866 return 2;
5867 default:
5868 llvm_unreachable("not a workitem intrinsic");
5869 }
5870}
5871
5873 const SDValue Op, KnownBits &Known,
5874 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5875
5876 Known.resetAll(); // Don't know anything.
5877
5878 unsigned Opc = Op.getOpcode();
5879
5880 switch (Opc) {
5881 default:
5882 break;
5883 case AMDGPUISD::CARRY:
5884 case AMDGPUISD::BORROW: {
5885 Known.Zero = APInt::getHighBitsSet(32, 31);
5886 break;
5887 }
5888
5889 case AMDGPUISD::BFE_I32:
5890 case AMDGPUISD::BFE_U32: {
5891 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5892 if (!CWidth)
5893 return;
5894
5895 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5896
5897 if (Opc == AMDGPUISD::BFE_U32)
5898 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5899
5900 break;
5901 }
5902 case AMDGPUISD::FP_TO_FP16: {
5903 unsigned BitWidth = Known.getBitWidth();
5904
5905 // High bits are zero.
5907 break;
5908 }
5909 case AMDGPUISD::MUL_U24:
5910 case AMDGPUISD::MUL_I24: {
5911 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5912 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5913 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5914 RHSKnown.countMinTrailingZeros();
5915 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5916 // Skip extra check if all bits are known zeros.
5917 if (TrailZ >= 32)
5918 break;
5919
5920 // Truncate to 24 bits.
5921 LHSKnown = LHSKnown.trunc(24);
5922 RHSKnown = RHSKnown.trunc(24);
5923
5924 if (Opc == AMDGPUISD::MUL_I24) {
5925 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5926 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5927 unsigned MaxValBits = LHSValBits + RHSValBits;
5928 if (MaxValBits > 32)
5929 break;
5930 unsigned SignBits = 32 - MaxValBits + 1;
5931 bool LHSNegative = LHSKnown.isNegative();
5932 bool LHSNonNegative = LHSKnown.isNonNegative();
5933 bool LHSPositive = LHSKnown.isStrictlyPositive();
5934 bool RHSNegative = RHSKnown.isNegative();
5935 bool RHSNonNegative = RHSKnown.isNonNegative();
5936 bool RHSPositive = RHSKnown.isStrictlyPositive();
5937
5938 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5939 Known.Zero.setHighBits(SignBits);
5940 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5941 Known.One.setHighBits(SignBits);
5942 } else {
5943 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5944 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5945 unsigned MaxValBits = LHSValBits + RHSValBits;
5946 if (MaxValBits >= 32)
5947 break;
5948 Known.Zero.setBitsFrom(MaxValBits);
5949 }
5950 break;
5951 }
5952 case AMDGPUISD::PERM: {
5953 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5954 if (!CMask)
5955 return;
5956
5957 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5958 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5959 unsigned Sel = CMask->getZExtValue();
5960
5961 for (unsigned I = 0; I < 32; I += 8) {
5962 unsigned SelBits = Sel & 0xff;
5963 if (SelBits < 4) {
5964 SelBits *= 8;
5965 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5966 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5967 } else if (SelBits < 7) {
5968 SelBits = (SelBits & 3) * 8;
5969 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5970 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5971 } else if (SelBits == 0x0c) {
5972 Known.Zero |= 0xFFull << I;
5973 } else if (SelBits > 0x0c) {
5974 Known.One |= 0xFFull << I;
5975 }
5976 Sel >>= 8;
5977 }
5978 break;
5979 }
5981 Known.Zero.setHighBits(24);
5982 break;
5983 }
5985 Known.Zero.setHighBits(16);
5986 break;
5987 }
5988 case AMDGPUISD::LDS: {
5989 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5990 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5991
5992 Known.Zero.setHighBits(16);
5993 Known.Zero.setLowBits(Log2(Alignment));
5994 break;
5995 }
5996 case AMDGPUISD::SMIN3:
5997 case AMDGPUISD::SMAX3:
5998 case AMDGPUISD::SMED3:
5999 case AMDGPUISD::UMIN3:
6000 case AMDGPUISD::UMAX3:
6001 case AMDGPUISD::UMED3: {
6002 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6003 if (Known2.isUnknown())
6004 break;
6005
6006 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6007 if (Known1.isUnknown())
6008 break;
6009
6010 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6011 if (Known0.isUnknown())
6012 break;
6013
6014 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6015 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6016 Known.One = Known0.One & Known1.One & Known2.One;
6017 break;
6018 }
6020 unsigned IID = Op.getConstantOperandVal(0);
6021 switch (IID) {
6022 case Intrinsic::amdgcn_workitem_id_x:
6023 case Intrinsic::amdgcn_workitem_id_y:
6024 case Intrinsic::amdgcn_workitem_id_z: {
6025 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6027 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6028 break;
6029 }
6030 default:
6031 break;
6032 }
6033 }
6034 }
6035}
6036
6038 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6039 unsigned Depth) const {
6040 switch (Op.getOpcode()) {
6041 case AMDGPUISD::BFE_I32: {
6042 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6043 if (!Width)
6044 return 1;
6045
6046 unsigned SignBits = 32 - Width->getZExtValue() + 1;
6047 if (!isNullConstant(Op.getOperand(1)))
6048 return SignBits;
6049
6050 // TODO: Could probably figure something out with non-0 offsets.
6051 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6052 return std::max(SignBits, Op0SignBits);
6053 }
6054
6055 case AMDGPUISD::BFE_U32: {
6056 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6057 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6058 }
6059
6060 case AMDGPUISD::CARRY:
6061 case AMDGPUISD::BORROW:
6062 return 31;
6064 return 25;
6066 return 17;
6068 return 24;
6070 return 16;
6072 return 16;
6073 case AMDGPUISD::SMIN3:
6074 case AMDGPUISD::SMAX3:
6075 case AMDGPUISD::SMED3:
6076 case AMDGPUISD::UMIN3:
6077 case AMDGPUISD::UMAX3:
6078 case AMDGPUISD::UMED3: {
6079 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6080 if (Tmp2 == 1)
6081 return 1; // Early out.
6082
6083 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6084 if (Tmp1 == 1)
6085 return 1; // Early out.
6086
6087 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6088 if (Tmp0 == 1)
6089 return 1; // Early out.
6090
6091 return std::min({Tmp0, Tmp1, Tmp2});
6092 }
6093 default:
6094 return 1;
6095 }
6096}
6097
6099 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6100 const MachineRegisterInfo &MRI, unsigned Depth) const {
6101 const MachineInstr *MI = MRI.getVRegDef(R);
6102 if (!MI)
6103 return 1;
6104
6105 // TODO: Check range metadata on MMO.
6106 switch (MI->getOpcode()) {
6107 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6108 return 25;
6109 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6110 return 17;
6111 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6112 return 24;
6113 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6114 return 16;
6115 case AMDGPU::G_AMDGPU_SMED3:
6116 case AMDGPU::G_AMDGPU_UMED3: {
6117 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6118 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6119 if (Tmp2 == 1)
6120 return 1;
6121 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6122 if (Tmp1 == 1)
6123 return 1;
6124 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6125 if (Tmp0 == 1)
6126 return 1;
6127 return std::min({Tmp0, Tmp1, Tmp2});
6128 }
6129 default:
6130 return 1;
6131 }
6132}
6133
6135 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6136 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6137 unsigned Opcode = Op.getOpcode();
6138 switch (Opcode) {
6139 case AMDGPUISD::BFE_I32:
6140 case AMDGPUISD::BFE_U32:
6141 return false;
6142 }
6144 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6145}
6146
6148 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6149 unsigned Depth) const {
6150 unsigned Opcode = Op.getOpcode();
6151 switch (Opcode) {
6154 if (SNaN)
6155 return true;
6156
6157 // TODO: Can check no nans on one of the operands for each one, but which
6158 // one?
6159 return false;
6160 }
6163 if (SNaN)
6164 return true;
6165 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6166 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6167 }
6168 case AMDGPUISD::FMED3:
6169 case AMDGPUISD::FMIN3:
6170 case AMDGPUISD::FMAX3:
6173 case AMDGPUISD::FMAD_FTZ: {
6174 if (SNaN)
6175 return true;
6176 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6177 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6178 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6179 }
6184 return true;
6185
6186 case AMDGPUISD::RCP:
6187 case AMDGPUISD::RSQ:
6189 case AMDGPUISD::RSQ_CLAMP: {
6190 if (SNaN)
6191 return true;
6192
6193 // TODO: Need is known positive check.
6194 return false;
6195 }
6196 case ISD::FLDEXP:
6197 case AMDGPUISD::FRACT: {
6198 if (SNaN)
6199 return true;
6200 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6201 }
6205 // TODO: Refine on operands.
6206 return SNaN;
6207 case AMDGPUISD::SIN_HW:
6208 case AMDGPUISD::COS_HW: {
6209 // TODO: Need check for infinity
6210 return SNaN;
6211 }
6213 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6214 // TODO: Handle more intrinsics
6215 switch (IntrinsicID) {
6216 case Intrinsic::amdgcn_cubeid:
6217 case Intrinsic::amdgcn_cvt_off_f32_i4:
6218 return true;
6219
6220 case Intrinsic::amdgcn_frexp_mant: {
6221 if (SNaN)
6222 return true;
6223 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6224 }
6225 case Intrinsic::amdgcn_cvt_pkrtz: {
6226 if (SNaN)
6227 return true;
6228 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6229 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6230 }
6231 case Intrinsic::amdgcn_rcp:
6232 case Intrinsic::amdgcn_rsq:
6233 case Intrinsic::amdgcn_rcp_legacy:
6234 case Intrinsic::amdgcn_rsq_legacy:
6235 case Intrinsic::amdgcn_rsq_clamp:
6236 case Intrinsic::amdgcn_tanh: {
6237 if (SNaN)
6238 return true;
6239
6240 // TODO: Need is known positive check.
6241 return false;
6242 }
6243 case Intrinsic::amdgcn_trig_preop:
6244 case Intrinsic::amdgcn_fdot2:
6245 // TODO: Refine on operand
6246 return SNaN;
6247 case Intrinsic::amdgcn_fma_legacy:
6248 if (SNaN)
6249 return true;
6250 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6251 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6252 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6253 default:
6254 return false;
6255 }
6256 }
6257 default:
6258 return false;
6259 }
6260}
6261
6263 Register N0, Register N1) const {
6264 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6265}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:315
#define LLVM_READONLY
Definition: Compiler.h:322
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1414
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1181
const fltSemantics & getSemantics() const
Definition: APFloat.h:1457
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1199
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1158
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1098
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1388
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
iterator_range< arg_iterator > args()
Definition: Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:270
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
Type * getValueType() const
Definition: GlobalValue.h:298
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:72
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:285
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:578
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:587
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVM Value Representation.
Definition: Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1162
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:535
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1207
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1204
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ AssertZext
Definition: ISDOpcodes.h:63
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1685
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1665
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
MaybeAlign getAlign(const CallInst &I, unsigned Index)
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:203
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ BeforeLegalizeTypes
Definition: DAGCombine.h:16
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:119
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
@ DS_Warning
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
static cl::opt< int > CostThreshold("sbvec-cost-threshold", cl::init(0), cl::Hidden, cl::desc("Vectorization cost threshold."))
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1569
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition: Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:472
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:414
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:303
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:101
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:235
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:154
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:289
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:241
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:138
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition: KnownBits.h:122
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:107
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:98
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:262
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...