1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus split into multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-ccmp-bias", cl::init(6),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
94 "supports conditional compare instructions."),
96
97static cl::opt<bool>
98 WidenShift("x86-widen-shift", cl::init(true),
99 cl::desc("Replace narrow shifts with wider shifts."),
100 cl::Hidden);
101
103 "x86-br-merging-likely-bias", cl::init(0),
104 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
105 "that all conditionals will be executed. For example for merging "
106 "the conditionals (a == b && c > d), if it's known that a == b is "
107 "likely, then it is likely that if the conditionals are split "
108 "both sides will be executed, so it may be desirable to increase "
109 "the instruction cost threshold. Set to -1 to never merge likely "
110 "branches."),
111 cl::Hidden);
112
114 "x86-br-merging-unlikely-bias", cl::init(-1),
115 cl::desc(
116 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
117 "that all conditionals will be executed. For example for merging "
118 "the conditionals (a == b && c > d), if it's known that a == b is "
119 "unlikely, then it is unlikely that if the conditionals are split "
120 "both sides will be executed, so it may be desirable to decrease "
121 "the instruction cost threshold. Set to -1 to never merge unlikely "
122 "branches."),
123 cl::Hidden);
124
126 "mul-constant-optimization", cl::init(true),
127 cl::desc("Replace 'mul x, Const' with more effective instructions like "
128 "SHIFT, LEA, etc."),
129 cl::Hidden);
130
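// As a hedged illustration of how these flags are wired up (the variable name
// below is hypothetical, not necessarily the one this file uses), a cl::opt is
// declared once and then read wherever the heuristic needs it:
//
//   static cl::opt<int> ExampleBrMergingBaseCost(
//       "x86-br-merging-base-cost", cl::init(2),
//       cl::desc("Cost threshold for merging conditionals."), cl::Hidden);
//
//   int Threshold = ExampleBrMergingBaseCost; // implicit conversion to int
//
// Such a flag can then be overridden on the command line, e.g.:
//   llc -x86-br-merging-base-cost=4 foo.ll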
131 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
132                                      const X86Subtarget &STI)
133 : TargetLowering(TM), Subtarget(STI) {
134 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
135 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
136
137 // Set up the TargetLowering object.
138
139 // X86 is weird. It always uses i8 for shift amounts and setcc results.
141 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143
144 // X86 instruction cache is coherent with its data cache so we can use the
145 // default expansion to a no-op.
147
148 // For 64-bit, since we have so many registers, use the ILP scheduler.
149 // For 32-bit, use the register pressure specific scheduling.
150 // For Atom, always use ILP scheduling.
151 if (Subtarget.isAtom())
153 else if (Subtarget.is64Bit())
155 else
157 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
158 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
159
160 // Bypass expensive divides and use cheaper ones.
161 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
162 if (Subtarget.hasSlowDivide32())
163 addBypassSlowDiv(32, 8);
164 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
165 addBypassSlowDiv(64, 32);
166 }
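  // A hedged sketch of what addBypassSlowDiv(64, 32) asks CodeGenPrepare to
  // emit around a 64-bit divide: a runtime check that takes a cheap 32-bit
  // DIV when both operands fit in 32 bits (illustrative C++, not the exact
  // IR that is generated):
  //
  //   uint64_t div64(uint64_t a, uint64_t b) {
  //     if (((a | b) >> 32) == 0)           // both operands fit in 32 bits
  //       return uint32_t(a) / uint32_t(b); // cheap 32-bit DIV path
  //     return a / b;                       // full-width 64-bit DIV
  //   }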
167
168 if (Subtarget.canUseCMPXCHG16B())
170 else if (Subtarget.canUseCMPXCHG8B())
172 else
174
175 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
176
178
179 // Set up the register classes.
180 addRegisterClass(MVT::i8, &X86::GR8RegClass);
181 addRegisterClass(MVT::i16, &X86::GR16RegClass);
182 addRegisterClass(MVT::i32, &X86::GR32RegClass);
183 if (Subtarget.is64Bit())
184 addRegisterClass(MVT::i64, &X86::GR64RegClass);
185
186 for (MVT VT : MVT::integer_valuetypes())
188
189 // We don't accept any truncstore of integer registers.
190 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
191 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
193 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
194 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
195 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
196
197 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
198
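  // A truncating store marked Expand is rewritten by the legalizer (an
  // FP_ROUND or TRUNCATE feeding a plain store) rather than matched directly.
  // A hedged sketch of the query later code makes, where `TLI` stands for
  // this X86TargetLowering instance:
  //
  //   if (!TLI.isTruncStoreLegal(MVT::f64, MVT::f32))
  //     ; // combines must not form an f64 -> f32 truncating store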
199 // SETOEQ and SETUNE require checking two conditions.
200 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
203 }
204
205 // Integer absolute.
206 if (Subtarget.canUseCMOV()) {
207 setOperationAction(ISD::ABS , MVT::i16 , Custom);
208 setOperationAction(ISD::ABS , MVT::i32 , Custom);
209 if (Subtarget.is64Bit())
210 setOperationAction(ISD::ABS , MVT::i64 , Custom);
211 }
212
213 // Absolute difference.
214 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
215 setOperationAction(Op , MVT::i8 , Custom);
216 setOperationAction(Op , MVT::i16 , Custom);
217 setOperationAction(Op , MVT::i32 , Custom);
218 if (Subtarget.is64Bit())
219 setOperationAction(Op , MVT::i64 , Custom);
220 }
221
222 // Signed saturation subtraction.
226 if (Subtarget.is64Bit())
228
229 // Funnel shifts.
230 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
231 // For slow shld targets we only lower for code size.
232 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
233
234 setOperationAction(ShiftOp , MVT::i8 , Custom);
235 setOperationAction(ShiftOp , MVT::i16 , Custom);
236 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
237 if (Subtarget.is64Bit())
238 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
239 }
240
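  // For reference, ISD::FSHL concatenates its operands and shifts left, which
  // maps onto x86 SHLD/SHRD when those are fast. A scalar i32 sketch of the
  // semantics (the zero-shift guard avoids an undefined 32-bit shift):
  //
  //   uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  //     unsigned S = Amt & 31;
  //     return S ? (Hi << S) | (Lo >> (32 - S)) : Hi;
  //   }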
241 if (!Subtarget.useSoftFloat()) {
242 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
243 // operation.
248 // We have an algorithm for SSE2, and we turn this into a 64-bit
249 // FILD or VCVTUSI2SS/SD for other targets.
252 // We have an algorithm for SSE2->double, and we turn this into a
253 // 64-bit FILD followed by conditional FADD for other targets.
256
257 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
258 // this operation.
261 // SSE has no i16 to fp conversion, only i32. We promote in the handler
262 // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
265 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
268 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
269 // are Legal, f80 is custom lowered.
272
273 // Promote i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
274 // this operation.
276 // FIXME: This doesn't generate invalid exception when it should. PR44019.
282 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
283 // are Legal, f80 is custom lowered.
286
287 // Handle FP_TO_UINT by promoting the destination to a larger signed
288 // conversion.
290 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // FIXME: This doesn't generate invalid exception when it should. PR44019.
299
300 setOperationAction(ISD::LRINT, MVT::f32, Custom);
301 setOperationAction(ISD::LRINT, MVT::f64, Custom);
302 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
304
305 if (!Subtarget.is64Bit()) {
306 setOperationAction(ISD::LRINT, MVT::i64, Custom);
307 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
308 }
309 }
310
311 if (Subtarget.hasSSE2()) {
312 // Custom lowering for saturating float to int conversions.
313 // We handle promotion to larger result types manually.
314 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
317 }
320 if (Subtarget.is64Bit()) {
323 }
324 }
325 if (Subtarget.hasAVX10_2()) {
330 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
331 MVT::v4i64}) {
334 }
335 if (Subtarget.is64Bit()) {
338 }
339 }
340
341 // Handle address space casts between mixed sized pointers.
342 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
344
345 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
346 if (!Subtarget.hasSSE2()) {
347 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
348 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
351 if (Subtarget.is64Bit()) {
352 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
353 // Without SSE, i64->f64 goes through memory.
354 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
355 }
356 } else if (!Subtarget.is64Bit())
357 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
358
359 // Scalar integer divide and remainder are lowered to use operations that
360 // produce two results, to match the available instructions. This exposes
361 // the two-result form to trivial CSE, which is able to combine x/y and x%y
362 // into a single instruction.
363 //
364 // Scalar integer multiply-high is also lowered to use two-result
365 // operations, to match the available instructions. However, plain multiply
366 // (low) operations are left as Legal, as there are single-result
367 // instructions for this in x86. Using the two-result multiply instructions
368 // when both high and low results are needed must be arranged by dagcombine.
369 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377
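  // With the two-result nodes in place, DAG-level CSE turns
  //
  //   int q = a / b;
  //   int r = a % b;
  //
  // into a single x86 IDIV that leaves the quotient in EAX and the remainder
  // in EDX, instead of issuing two divide instructions.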
378 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
379 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
380 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
382 setOperationAction(ISD::BR_CC, VT, Expand);
384 }
385 if (Subtarget.is64Bit())
390
391 setOperationAction(ISD::FREM , MVT::f32 , Expand);
392 setOperationAction(ISD::FREM , MVT::f64 , Expand);
393 setOperationAction(ISD::FREM , MVT::f80 , Expand);
394 setOperationAction(ISD::FREM , MVT::f128 , Expand);
395
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
398 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
399 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
400 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
402 }
403
404 // Promote the i8 variants and force them up to i32, which has a shorter
405 // encoding.
406 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
408 // Promote i16 as well: tzcntw has a false dependency on Intel CPUs, and for
409 // BSF we emit a REP prefix to encode it as TZCNT on modern CPUs, so promoting
410 // i16 to i32 makes sense here too.
411 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
413
414 if (!Subtarget.hasBMI()) {
415 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
417 if (Subtarget.is64Bit()) {
418 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
420 }
421 }
422
423 if (Subtarget.hasLZCNT()) {
424 // When promoting the i8 variants, force them to i32 for a shorter
425 // encoding.
426 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
428 } else {
429 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
430 if (VT == MVT::i64 && !Subtarget.is64Bit())
431 continue;
434 }
435 }
436
437 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
438 ISD::STRICT_FP_TO_FP16}) {
439 // Special handling for half-precision floating point conversions.
440 // If we don't have F16C support, then lower half float conversions
441 // into library calls.
443 Op, MVT::f32,
444 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
445 // There's never any support for operations beyond MVT::f32.
446 setOperationAction(Op, MVT::f64, Expand);
447 setOperationAction(Op, MVT::f80, Expand);
448 setOperationAction(Op, MVT::f128, Expand);
449 }
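  // In practice f16 <-> f32 conversions select to VCVTPH2PS/VCVTPS2PH when
  // F16C is available and otherwise lower to the half-conversion helpers in
  // the compiler runtime. A hedged query sketch, with `TLI` standing for
  // this lowering object:
  //
  //   bool FastHalf = TLI.isOperationLegalOrCustom(ISD::FP16_TO_FP, MVT::f32);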
450
451 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
452 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
453 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
454 }
455
456 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
457 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
459 setTruncStoreAction(VT, MVT::f16, Expand);
460 setTruncStoreAction(VT, MVT::bf16, Expand);
461
462 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
463 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
464 }
465
469 if (Subtarget.is64Bit())
471 if (Subtarget.hasPOPCNT()) {
472 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
473 // popcntw is longer to encode than popcntl and also has a false dependency
474 // on the dest that popcntl hasn't had since Cannon Lake.
475 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
476 } else {
481 }
482
483 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
484
485 if (!Subtarget.hasMOVBE())
487
488 // X86 wants to expand cmov itself.
489 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
494 }
495 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
496 if (VT == MVT::i64 && !Subtarget.is64Bit())
497 continue;
500 }
501
502 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
505
507 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
508 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
512
513 // Darwin ABI issue.
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
523 }
524
525 // 64-bit shl, sra, srl (iff 32-bit x86)
526 for (auto VT : { MVT::i32, MVT::i64 }) {
527 if (VT == MVT::i64 && !Subtarget.is64Bit())
528 continue;
532 }
533
534 if (Subtarget.hasSSEPrefetch())
535 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
536
537 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
538
539 // Expand certain atomics
540 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
541 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
542 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
547 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
548 }
549
550 if (!Subtarget.is64Bit())
551 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
552
553 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
554 // All CPUs supporting AVX will atomically load/store aligned 128-bit
555 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
556 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
557 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
558 }
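  // Hedged query sketch: with AVX on a 64-bit target the actions set above
  // make
  //
  //   TLI.getOperationAction(ISD::ATOMIC_LOAD, MVT::i128)
  //
  // report Custom, and the custom lowering emits one aligned VMOVDQA/VMOVAPS
  // instead of a CMPXCHG16B loop.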
559
560 if (Subtarget.canUseCMPXCHG16B())
561 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
562
563 // FIXME - use subtarget debug flags
564 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
565 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
566 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
567 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
568 }
569
572
573 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
574 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
575
576 setOperationAction(ISD::TRAP, MVT::Other, Legal);
577 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
578 if (Subtarget.isTargetPS())
579 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
580 else
581 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
582
583 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
584 setOperationAction(ISD::VASTART , MVT::Other, Custom);
585 setOperationAction(ISD::VAEND , MVT::Other, Expand);
586 bool Is64Bit = Subtarget.is64Bit();
587 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
588 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
589
590 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
591 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
592
593 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
594
595 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
596 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
597 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
598
600
601 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
602 setOperationAction(ISD::FABS, VT, Action);
603 setOperationAction(ISD::FNEG, VT, Action);
605 setOperationAction(ISD::FREM, VT, Action);
606 setOperationAction(ISD::FMA, VT, Action);
607 setOperationAction(ISD::FMINNUM, VT, Action);
608 setOperationAction(ISD::FMAXNUM, VT, Action);
609 setOperationAction(ISD::FMINIMUM, VT, Action);
610 setOperationAction(ISD::FMAXIMUM, VT, Action);
611 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
612 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
613 setOperationAction(ISD::FSIN, VT, Action);
614 setOperationAction(ISD::FCOS, VT, Action);
615 setOperationAction(ISD::FSINCOS, VT, Action);
616 setOperationAction(ISD::FTAN, VT, Action);
617 setOperationAction(ISD::FSQRT, VT, Action);
618 setOperationAction(ISD::FPOW, VT, Action);
619 setOperationAction(ISD::FPOWI, VT, Action);
620 setOperationAction(ISD::FLOG, VT, Action);
621 setOperationAction(ISD::FLOG2, VT, Action);
622 setOperationAction(ISD::FLOG10, VT, Action);
623 setOperationAction(ISD::FEXP, VT, Action);
624 setOperationAction(ISD::FEXP2, VT, Action);
625 setOperationAction(ISD::FEXP10, VT, Action);
626 setOperationAction(ISD::FCEIL, VT, Action);
627 setOperationAction(ISD::FFLOOR, VT, Action);
628 setOperationAction(ISD::FNEARBYINT, VT, Action);
629 setOperationAction(ISD::FRINT, VT, Action);
630 setOperationAction(ISD::BR_CC, VT, Action);
631 setOperationAction(ISD::SETCC, VT, Action);
634 setOperationAction(ISD::FROUND, VT, Action);
635 setOperationAction(ISD::FROUNDEVEN, VT, Action);
636 setOperationAction(ISD::FTRUNC, VT, Action);
637 setOperationAction(ISD::FLDEXP, VT, Action);
638 };
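  // setF16Action(MVT::f16, Promote) below makes every one of these math nodes
  // on f16 legalize by promotion, i.e. roughly:
  //
  //   // fadd half %a, %b  ==>  extend both to f32, fadd in f32, round to f16
  //
  // while passing Expand (as done later for the f16 vector types) expands
  // them instead.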
639
640 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
641 // f16, f32 and f64 use SSE.
642 // Set up the FP register classes.
643 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
644 : &X86::FR16RegClass);
645 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
646 : &X86::FR32RegClass);
647 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
648 : &X86::FR64RegClass);
649
650 // Disable f32->f64 extload as we can only generate this in one instruction
651 // under optsize. So it's easier to pattern match (fpext (load)) for that
652 // case instead of needing to emit 2 instructions for extload in the
653 // non-optsize case.
654 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
655
656 for (auto VT : { MVT::f32, MVT::f64 }) {
657 // Use ANDPD to simulate FABS.
658 setOperationAction(ISD::FABS, VT, Custom);
659
660 // Use XORP to simulate FNEG.
661 setOperationAction(ISD::FNEG, VT, Custom);
662
663 // Use ANDPD and ORPD to simulate FCOPYSIGN.
665
666 // These might be better off as horizontal vector ops.
669
670 // We don't support sin/cos/fmod
671 setOperationAction(ISD::FSIN , VT, Expand);
672 setOperationAction(ISD::FCOS , VT, Expand);
673 setOperationAction(ISD::FSINCOS, VT, Expand);
674 }
675
676 // Half type will be promoted by default.
677 setF16Action(MVT::f16, Promote);
682 setOperationAction(ISD::FABS, MVT::f16, Custom);
683 setOperationAction(ISD::FNEG, MVT::f16, Custom);
686 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
688
719 setOperationAction(ISD::LRINT, MVT::f16, Expand);
720 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
721
722 // Lower this to MOVMSK plus an AND.
725
726 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
727 (UseX87 || Is64Bit)) {
728 // Use SSE for f32, x87 for f64.
729 // Set up the FP register classes.
730 addRegisterClass(MVT::f32, &X86::FR32RegClass);
731 if (UseX87)
732 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
733
734 // Use ANDPS to simulate FABS.
735 setOperationAction(ISD::FABS , MVT::f32, Custom);
736
737 // Use XORP to simulate FNEG.
738 setOperationAction(ISD::FNEG , MVT::f32, Custom);
739
740 if (UseX87)
742
743 // Use ANDPS and ORPS to simulate FCOPYSIGN.
744 if (UseX87)
747
748 // We don't support sin/cos/fmod
749 setOperationAction(ISD::FSIN , MVT::f32, Expand);
750 setOperationAction(ISD::FCOS , MVT::f32, Expand);
751 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
752
753 if (UseX87) {
754 // Always expand sin/cos functions even though x87 has an instruction.
755 setOperationAction(ISD::FSIN, MVT::f64, Expand);
756 setOperationAction(ISD::FCOS, MVT::f64, Expand);
757 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
758 }
759 } else if (UseX87) {
760 // f32 and f64 in x87.
761 // Set up the FP register classes.
762 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
763 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
764
765 for (auto VT : { MVT::f32, MVT::f64 }) {
768
769 // Always expand sin/cos functions even though x87 has an instruction.
770 setOperationAction(ISD::FSIN , VT, Expand);
771 setOperationAction(ISD::FCOS , VT, Expand);
772 setOperationAction(ISD::FSINCOS, VT, Expand);
773 }
774 }
775
776 // Expand FP32 immediates into loads from the stack, save special cases.
777 if (isTypeLegal(MVT::f32)) {
778 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
779 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
780 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
781 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
782 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
783 } else // SSE immediates.
784 addLegalFPImmediate(APFloat(+0.0f)); // xorps
785 }
786 // Expand FP64 immediates into loads from the stack, save special cases.
787 if (isTypeLegal(MVT::f64)) {
788 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
789 addLegalFPImmediate(APFloat(+0.0)); // FLD0
790 addLegalFPImmediate(APFloat(+1.0)); // FLD1
791 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
792 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
793 } else // SSE immediates.
794 addLegalFPImmediate(APFloat(+0.0)); // xorpd
795 }
796 // Support fp16 0 immediate.
797 if (isTypeLegal(MVT::f16))
798 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
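  // Constants registered via addLegalFPImmediate are materialized inline
  // (FLD0/FLD1/FCHS or an XORPS/XORPD zero) instead of being loaded from the
  // constant pool; this list is what X86's isFPImmLegal consults. A hedged
  // sketch of that query:
  //
  //   bool Cheap = TLI.isFPImmLegal(APFloat(1.0f), MVT::f32,
  //                                 /*ForCodeSize=*/false);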
799
800 // Handle constrained floating-point operations of scalar.
813
814 // We don't support FMA.
817
818 // f80 always uses X87.
819 if (UseX87) {
820 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
823 {
825 addLegalFPImmediate(TmpFlt); // FLD0
826 TmpFlt.changeSign();
827 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
828
829 bool ignored;
830 APFloat TmpFlt2(+1.0);
832 &ignored);
833 addLegalFPImmediate(TmpFlt2); // FLD1
834 TmpFlt2.changeSign();
835 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
836 }
837
838 // Always expand sin/cos functions even though x87 has an instruction.
839 // clang-format off
840 setOperationAction(ISD::FSIN , MVT::f80, Expand);
841 setOperationAction(ISD::FCOS , MVT::f80, Expand);
842 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
843 setOperationAction(ISD::FTAN , MVT::f80, Expand);
844 setOperationAction(ISD::FASIN , MVT::f80, Expand);
845 setOperationAction(ISD::FACOS , MVT::f80, Expand);
846 setOperationAction(ISD::FATAN , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
848 setOperationAction(ISD::FSINH , MVT::f80, Expand);
849 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
850 setOperationAction(ISD::FTANH , MVT::f80, Expand);
851 // clang-format on
852
853 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
854 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
855 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
856 setOperationAction(ISD::FRINT, MVT::f80, Expand);
857 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
858 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
860 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
861 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LRINT, MVT::f80, Custom);
863 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
864
865 // Handle constrained floating-point operations of scalar.
872 if (isTypeLegal(MVT::f16)) {
873 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
875 } else {
877 }
878 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
879 // as Custom.
881 }
882
883 // f128 uses xmm registers, but most operations require libcalls.
884 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
885 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
886 : &X86::VR128RegClass);
887
888 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
889
900
901 setOperationAction(ISD::FABS, MVT::f128, Custom);
902 setOperationAction(ISD::FNEG, MVT::f128, Custom);
904
905 // clang-format off
906 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
908 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
910 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
913 // clang-format on
914 // No STRICT_FSINCOS
915 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
917
918 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
920 // We need to custom handle any FP_ROUND with an f128 input, but
921 // LegalizeDAG uses the result type to know when to run a custom handler.
922 // So we have to list all legal floating point result types here.
923 if (isTypeLegal(MVT::f32)) {
926 }
927 if (isTypeLegal(MVT::f64)) {
930 }
931 if (isTypeLegal(MVT::f80)) {
935 }
936
938
939 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
942 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
945 }
946
947 // Always use a library call for pow.
948 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
949 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
952
953 setOperationAction(ISD::FLOG, MVT::f80, Expand);
954 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
956 setOperationAction(ISD::FEXP, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
959 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
960 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
961
962 // Some FP actions are always expanded for vector types.
963 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
964 MVT::v4f32, MVT::v8f32, MVT::v16f32,
965 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
966 // clang-format off
967 setOperationAction(ISD::FSIN, VT, Expand);
968 setOperationAction(ISD::FSINCOS, VT, Expand);
969 setOperationAction(ISD::FCOS, VT, Expand);
970 setOperationAction(ISD::FTAN, VT, Expand);
973 setOperationAction(ISD::FPOW, VT, Expand);
974 setOperationAction(ISD::FLOG, VT, Expand);
975 setOperationAction(ISD::FLOG2, VT, Expand);
976 setOperationAction(ISD::FLOG10, VT, Expand);
977 setOperationAction(ISD::FEXP, VT, Expand);
978 setOperationAction(ISD::FEXP2, VT, Expand);
979 setOperationAction(ISD::FEXP10, VT, Expand);
980 // clang-format on
981 }
982
983 // First set operation action for all vector types to either promote
984 // (for widening) or expand (for scalarization). Then we will selectively
985 // turn on ones that can be effectively codegen'd.
996 setOperationAction(ISD::FFLOOR, VT, Expand);
997 setOperationAction(ISD::FCEIL, VT, Expand);
998 setOperationAction(ISD::FTRUNC, VT, Expand);
999 setOperationAction(ISD::FRINT, VT, Expand);
1000 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1001 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1025 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1026 setTruncStoreAction(InnerVT, VT, Expand);
1027
1028 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1029 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1030
1031 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1032 // types, we have to deal with them whether we ask for Expansion or not.
1033 // Setting Expand causes its own optimisation problems though, so leave
1034 // them legal.
1035 if (VT.getVectorElementType() == MVT::i1)
1036 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1037
1038 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1039 // split/scalarized right now.
1040 if (VT.getVectorElementType() == MVT::f16 ||
1041 VT.getVectorElementType() == MVT::bf16)
1042 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1043 }
1044 }
1045
1046 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1047 // with -msoft-float, disable use of MMX as well.
1048 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1049 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1050 // No operations on x86mmx are supported; everything uses intrinsics.
1051 }
1052
1053 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1054 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1055 : &X86::VR128RegClass);
1056
1057 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1058 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1061
1062 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1063 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1071
1072 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1073 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1075
1081 }
1082
1083 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1084 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1085 : &X86::VR128RegClass);
1086
1087 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1088 // registers cannot be used even for integer operations.
1089 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1090 : &X86::VR128RegClass);
1091 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1092 : &X86::VR128RegClass);
1093 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1094 : &X86::VR128RegClass);
1095 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1096 : &X86::VR128RegClass);
1097 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1098 : &X86::VR128RegClass);
1099
1100 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1101 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1102 setOperationAction(ISD::FMINIMUM, VT, Custom);
1103 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1104 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1105 }
1106
1107 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1108 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1113 }
1114
1115 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1116 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1118
1119 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1120 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1121 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1122 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1123 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1125 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1127 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1131
1132 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1133 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1135
1136 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1138 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1140
1141 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1142 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1143
1144 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1145 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1146 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1148 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 }
1150
1161
1166
1167 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1173
1174 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1175 // setcc all the way to isel and prefer SETGT in some isel patterns.
1178 }
1179
1180 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1181 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1186
1187 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1193 }
1194
1195 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1199
1200 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1201 continue;
1202
1205 }
1206 setF16Action(MVT::v8f16, Expand);
1207 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1208 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1212 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1214
1215 // Custom lower v2i64 and v2f64 selects.
1222
1229
1230 // Custom legalize these to avoid over promotion or custom promotion.
1231 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1236 }
1237
1242
1245
1248
1249 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1254
1255 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1259
1260 // We want to legalize this to an f64 load rather than an i64 load on
1261 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1262 // store.
1263 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1264 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1266 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1267 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1268 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1269
1270 // Add 32-bit vector stores to help vectorization opportunities.
1271 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1272 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1273
1274 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1275 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1277 if (!Subtarget.hasAVX512())
1278 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1279
1283
1285
1302
1303 // In the customized shift lowering, the legal v4i32/v2i64 cases
1304 // in AVX2 will be recognized.
1305 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1309 if (VT == MVT::v2i64) continue;
1314 }
1315
1321 }
1322
1323 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1328
1329 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1331 }
1332 }
1333
1334 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1335 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1336 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1337 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1338
1339 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1342 }
1343
1344 // These might be better off as horizontal vector ops.
1349 }
1350
1351 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1352 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1353 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1355 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1357 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1359 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1361 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1363 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1365
1366 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1367 }
1368
1369 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1370 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1371 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1373 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1375 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1377
1381
1382 // FIXME: Do we need to handle scalar-to-vector here?
1383 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1384 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1385
1386 // We directly match byte blends in the backend as they match the VSELECT
1387 // condition form.
1389
1390 // SSE41 brings specific instructions for doing vector sign extend even in
1391 // cases where we don't have SRA.
1392 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1395 }
1396
1397 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1398 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1399 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1405 }
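    // Making these extending loads Legal lets a single PMOVSX/PMOVZX both
    // load and widen (e.g. pmovzxbd pulls four i8s straight into a v4i32).
    // A hedged query sketch, with `TLI` again standing for this object:
    //
    //   bool OneInstr =
    //       TLI.isLoadExtLegal(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8);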
1406
1407 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1408 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1409 // do the pre and post work in the vector domain.
1412 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1413 // so that DAG combine doesn't try to turn it into uint_to_fp.
1416 }
1417 }
1418
1419 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1421 }
1422
1423 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1424 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1425 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1428 }
1429
1430 // XOP can efficiently perform BITREVERSE with VPPERM.
1431 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1433 }
1434
1435 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1436 bool HasInt256 = Subtarget.hasInt256();
1437
1438 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1439 : &X86::VR256RegClass);
1440 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1441 : &X86::VR256RegClass);
1442 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1443 : &X86::VR256RegClass);
1444 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1445 : &X86::VR256RegClass);
1446 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1447 : &X86::VR256RegClass);
1448 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1449 : &X86::VR256RegClass);
1450 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1451 : &X86::VR256RegClass);
1452
1453 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1454 setOperationAction(ISD::FFLOOR, VT, Legal);
1456 setOperationAction(ISD::FCEIL, VT, Legal);
1458 setOperationAction(ISD::FTRUNC, VT, Legal);
1460 setOperationAction(ISD::FRINT, VT, Legal);
1462 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1464 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1466
1467 setOperationAction(ISD::FROUND, VT, Custom);
1468
1469 setOperationAction(ISD::FNEG, VT, Custom);
1470 setOperationAction(ISD::FABS, VT, Custom);
1472
1473 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1474 setOperationAction(ISD::FMINIMUM, VT, Custom);
1475 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1476 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1478 }
1479
1480 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1481 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1482
1483 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1484 // even though v8i16 is a legal type.
1485 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1486 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
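    // i.e. an fp_to_sint from v8f32 to v8i16 is first performed as v8f32 ->
    // v8i32 (one CVTTPS2DQ over the whole YMM register) and the result is
    // then truncated, since there is no direct f32 -> i16 vector conversion
    // here. Conceptually:
    //
    //   v8i16 (truncate (fp_to_sint v8f32 -> v8i32))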
1492
1495 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1497 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1499
1511
1512 if (!Subtarget.hasAVX512())
1513 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1514
1515 // In the customized shift lowering, the legal v8i32/v4i64 cases
1516 // in AVX2 will be recognized.
1517 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1523 if (VT == MVT::v4i64) continue;
1528 }
1529
1530 // These types need custom splitting if their input is a 128-bit vector.
1535
1539 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1540 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1543
1544 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1548 }
1549
1554
1555 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1560
1561 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1562 // setcc all the way to isel and prefer SETGT in some isel patterns.
1565 }
1566
1567 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1568 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1573
1574 if (Subtarget.hasAnyFMA()) {
1575 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1576 MVT::v2f64, MVT::v4f64 }) {
1579 }
1580 }
1581
1582 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1583 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1584 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1585 }
1586
1587 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1588 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1591
1592 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1593 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1595 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1597 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1598 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1600
1601 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1602 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1603
1604 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1605 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1606 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1608 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1609
1610 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1622
1623 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1624 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1625 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1629 }
1630
1631 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1634 }
1635
1636 if (HasInt256) {
1637 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1638 // when we have a 256-bit-wide blend with immediate.
1641
1642 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1643 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1644 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1645 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1650 }
1651 }
1652
1653 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1654 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1655 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1656 setOperationAction(ISD::MSTORE, VT, Legal);
1657 }
1658
1659 // Extract subvector is special because the value type
1660 // (result) is 128-bit but the source is 256-bit wide.
1661 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1662 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1664 }
1665
1666 // Custom lower several nodes for 256-bit types.
1667 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1668 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1677 setOperationAction(ISD::STORE, VT, Custom);
1678 }
1679 setF16Action(MVT::v16f16, Expand);
1680 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1681 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1683 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1684 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1687
1688 if (HasInt256) {
1690
1691 // Custom legalize 2x32 to get a little better code.
1692 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1693 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1694
1695 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1696 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1697 setOperationAction(ISD::MGATHER, VT, Custom);
1698 }
1699 }
1700
1701 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1702 Subtarget.hasF16C()) {
1703 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1706 }
1707 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1708 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1710 }
1711 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1712 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1713 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1714 }
1715 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1716 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1717 }
1718
1719 // This block controls legalization of the mask vector sizes that are
1720 // available with AVX512. 512-bit vectors are in a separate block controlled
1721 // by useAVX512Regs.
1722 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1723 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1724 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1725 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1726 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1727 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1728
1732
1733 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1734 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1736 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1748
1749 // There is no byte-sized k-register load or store without AVX512DQ.
1750 if (!Subtarget.hasDQI()) {
1751 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1752 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1755
1756 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1757 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1760 }
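    // Without AVX512DQ there is no KMOVB, so these byte-or-smaller mask
    // loads/stores are custom lowered (bounced through a wider k-register
    // move or a GPR) rather than selected directly. Hedged query sketch:
    //
    //   // Reports Custom here; stays at the default (Legal) with DQI.
    //   TLI.getOperationAction(ISD::STORE, MVT::v8i1);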
1761
1762 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1763 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1767 }
1768
1769 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1771
1772 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1776
1783 }
1784
1785 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1787 }
1788 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1789 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1790 setOperationAction(ISD::LRINT, VT, Legal);
1791 setOperationAction(ISD::LLRINT, VT, Legal);
1792 }
1793 }
1794
1795 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1796 // elements. 512-bit operations can be disabled based on prefer-vector-width and
1797 // required-vector-width function attributes.
1798 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1799 bool HasBWI = Subtarget.hasBWI();
1800
1801 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1802 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1808
1809 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1810 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1812 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1815 if (HasBWI)
1816 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1817 }
1818
1819 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1820 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1821 setOperationAction(ISD::FMINIMUM, VT, Custom);
1822 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1823 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FNEG, VT, Custom);
1825 setOperationAction(ISD::FABS, VT, Custom);
1830 }
1831 setOperationAction(ISD::LRINT, MVT::v16f32,
1832 Subtarget.hasDQI() ? Legal : Custom);
1833 setOperationAction(ISD::LRINT, MVT::v8f64,
1834 Subtarget.hasDQI() ? Legal : Custom);
1835 if (Subtarget.hasDQI())
1836 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1837
1838 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1843 }
1844
1845 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1850 }
1851
1856 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1858
1870
1871 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1874 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1876 if (HasBWI)
1877 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1878
1879 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1880 // to 512-bit rather than use the AVX2 instructions so that we can use
1881 // k-masks.
1882 if (!Subtarget.hasVLX()) {
1883 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1884 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1885 setOperationAction(ISD::MLOAD, VT, Custom);
1886 setOperationAction(ISD::MSTORE, VT, Custom);
1887 }
1888 }
1889
1891 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1892 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1902
1903 if (HasBWI) {
1904 // Extends from v64i1 masks to 512-bit vectors.
1908 }
1909
1910 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1911 setOperationAction(ISD::FFLOOR, VT, Legal);
1913 setOperationAction(ISD::FCEIL, VT, Legal);
1915 setOperationAction(ISD::FTRUNC, VT, Legal);
1917 setOperationAction(ISD::FRINT, VT, Legal);
1919 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1921 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1923
1924 setOperationAction(ISD::FROUND, VT, Custom);
1925 }
1926
1927 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1930 }
1931
1932 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1933 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1936
1937 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1938 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1939 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1940 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1941
1942 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1943 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1945 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1947 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1948 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1950
1951 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1952 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1953
1954 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1964
1965 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1966 // setcc all the way to isel and prefer SETGT in some isel patterns.
1969 }
1970
1971 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1972 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1977
1978 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1985 }
1986
1987 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1988 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1989 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1991 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1992 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1999 }
2000
2001 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2002 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2004 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2006 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2007
2008 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2012 setOperationAction(Opc, MVT::v8i64, Custom);
2013
2014 if (Subtarget.hasDQI())
2015 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2016
2017 if (Subtarget.hasCDI()) {
2018 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2019 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2021 }
2022 } // Subtarget.hasCDI()
2023
2024 if (Subtarget.hasVPOPCNTDQ()) {
2025 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2027 }
2028
2029 // Extract subvector is special because the value type
2030 // (result) is 256-bit but the source is 512-bit wide.
2031 // 128-bit was made Legal under AVX1.
2032 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2033 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2035
2036 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2037 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2047 }
2048 setF16Action(MVT::v32f16, Expand);
2051 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2053 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2054 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2055 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2056
2057 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2058 setOperationAction(ISD::MLOAD, VT, Legal);
2059 setOperationAction(ISD::MSTORE, VT, Legal);
2060 setOperationAction(ISD::MGATHER, VT, Custom);
2061 setOperationAction(ISD::MSCATTER, VT, Custom);
2062 }
2063 if (HasBWI) {
2064 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2065 setOperationAction(ISD::MLOAD, VT, Legal);
2066 setOperationAction(ISD::MSTORE, VT, Legal);
2067 }
2068 } else {
2069 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2070 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2071 }
2072
2073 if (Subtarget.hasVBMI2()) {
2074 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2077 }
2078
2079 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2080 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2081 }
2082
2083 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2084 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2086 }// useAVX512Regs
2087
2088 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2089 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2090 MVT::v4i64}) {
2093 }
2094 }
2095
2096 // This block controls legalization for operations that don't have
2097 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2098 // narrower widths.
2099 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2100 // These operations are handled on non-VLX by artificially widening in
2101 // isel patterns.
2102
2106
2107 if (Subtarget.hasDQI()) {
2108 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2109 // v2f32 UINT_TO_FP is already custom under SSE2.
2112 "Unexpected operation action!");
2113 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2118 }
2119
2120 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2126 }
2127
2128 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2131 }
2132
2133 // Custom legalize 2x32 to get a little better code.
2134 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2135 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2136
2137 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2138 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2139 setOperationAction(ISD::MSCATTER, VT, Custom);
2140
2141 if (Subtarget.hasDQI()) {
2145 setOperationAction(Opc, MVT::v2i64, Custom);
2146 setOperationAction(Opc, MVT::v4i64, Custom);
2147 }
2148 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2149 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2150 }
2151
2152 if (Subtarget.hasCDI()) {
2153 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2155 }
2156 } // Subtarget.hasCDI()
2157
2158 if (Subtarget.hasVPOPCNTDQ()) {
2159 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2161 }
2162
2163 // We can try to convert vectors to different sizes to leverage legal
2164 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2165 // then specialize to Legal below.
2166 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2167 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2168 MVT::v16i16, MVT::v8i8})
2170
2171 // Legal vpcompress depends on various AVX512 extensions.
2172 // Legal in AVX512F
2173 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2175
2176 // Legal in AVX512F + AVX512VL
2177 if (Subtarget.hasVLX())
2178 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2179 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2181
2182 // Legal in AVX512F + AVX512VBMI2
2183 if (Subtarget.hasVBMI2())
2184 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2186
2187 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2188 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2189 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2191 }
2192
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2195 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2196 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2197 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2198
2199 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2210 }
2211
2212 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2214
2215 // Extends from v32i1 masks to 256-bit vectors.
2219
2220 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2221 MVT::v16f16, MVT::v8f16}) {
2222 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2223 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 }
2225
2226 // These operations are handled on non-VLX by artificially widening in
2227 // isel patterns.
2228 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2229
2230 if (Subtarget.hasBITALG()) {
2231 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2233 }
2234 }
2235
2236 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2237 auto setGroup = [&] (MVT VT) {
2246 setOperationAction(ISD::FSQRT, VT, Legal);
2248
2249 setOperationAction(ISD::FFLOOR, VT, Legal);
2251 setOperationAction(ISD::FCEIL, VT, Legal);
2253 setOperationAction(ISD::FTRUNC, VT, Legal);
2255 setOperationAction(ISD::FRINT, VT, Legal);
2257 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2259 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2261
2262 setOperationAction(ISD::FROUND, VT, Custom);
2263
2264 setOperationAction(ISD::LOAD, VT, Legal);
2265 setOperationAction(ISD::STORE, VT, Legal);
2266
2272
2273 setOperationAction(ISD::FNEG, VT, Custom);
2274 setOperationAction(ISD::FABS, VT, Custom);
2278
2282 };
2283
2284 // AVX512_FP16 scalar operations
2285 setGroup(MVT::f16);
2289 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2291 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2295 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2296 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2301 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2302 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2303
2306
2307 if (Subtarget.useAVX512Regs()) {
2308 setGroup(MVT::v32f16);
2314 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2316 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2318 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2321
2326 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2328 MVT::v32i16);
2329 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2331 MVT::v32i16);
2332 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2334 MVT::v32i16);
2335 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2337 MVT::v32i16);
2338
2342
2343 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2345
2346 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2347 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2351 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2352 }
2353
2358
2359 if (Subtarget.hasVLX()) {
2360 setGroup(MVT::v8f16);
2361 setGroup(MVT::v16f16);
2362
2373
2376 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2378 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2380
2381 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2384
2388
2389 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2393
2394 // Need to custom widen these to prevent scalarization.
2395 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2396 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2397
2398 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2399 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2402
2403 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2404 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2408 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2409 }
2410 }
2411
2412 if (!Subtarget.useSoftFloat() &&
2413 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2414 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2415 : &X86::VR128RegClass);
2416 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2417 : &X86::VR256RegClass);
2418 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2419 // provide a way to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2420 // the operation action to Custom and do the customization later.
2423 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2424 setF16Action(VT, Expand);
2425 if (!Subtarget.hasBF16())
2431 }
2432 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2433 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2434 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2435 }
2436 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2437 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2439 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2440 }
2441
2442 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2443 Subtarget.useAVX512Regs()) {
2444 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2445 setF16Action(MVT::v32bf16, Expand);
2446 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2447 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2448 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2450 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2454 }
2455
2456 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2457 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2458 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2464 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2468 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2473 setOperationAction(ISD::FSQRT, VT, Legal);
2476 setOperationAction(ISD::FMINIMUM, VT, Custom);
2477 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2478 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2479 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2480 }
2481 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2484 }
2485 }
2486
2487 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2488 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2491 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2493
2494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2497 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2499
2500 if (Subtarget.hasBWI()) {
2501 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2502 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2503 }
2504
2505 if (Subtarget.hasFP16()) {
2506 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2515 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2524 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2529 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2530 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2532 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2534 }
2535 }
2536
2537 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2538 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2539 }
2540
2541 // We want to custom lower some of our intrinsics.
2545 if (!Subtarget.is64Bit()) {
2547 }
2548
2549 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2550 // handle type legalization for these operations here.
2551 //
2552 // FIXME: We really should do custom legalization for addition and
2553 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2554 // than generic legalization for 64-bit multiplication-with-overflow, though.
2555 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2556 if (VT == MVT::i64 && !Subtarget.is64Bit())
2557 continue;
2558 // Add/Sub/Mul with overflow operations are custom lowered.
2565
2566 // Support carry in as value rather than glue.
2572 }
2573
2574 // Combine sin / cos into _sincos_stret if it is available.
2575 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2576 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2577 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2578 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2579 }
2580
2581 if (Subtarget.isTargetWin64()) {
2582 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2583 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::SREM, MVT::i128, Custom);
2585 setOperationAction(ISD::UREM, MVT::i128, Custom);
2594 }
2595
2596 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is.
2597 // We should promote the value to 64 bits to solve this.
2598 // This is what the CRT headers do - `fmodf` is an inline header
2599 // function casting to f64 and calling `fmod`.
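 // A rough sketch (paraphrased, not the verbatim CRT header) of that wrapper:
 //   static __inline float fmodf(float x, float y) { return (float)fmod(x, y); }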
2600 if (Subtarget.is32Bit() &&
2601 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2602 // clang-format off
2603 for (ISD::NodeType Op :
2604 {ISD::FACOS, ISD::STRICT_FACOS,
2605 ISD::FASIN, ISD::STRICT_FASIN,
2606 ISD::FATAN, ISD::STRICT_FATAN,
2607 ISD::FATAN2, ISD::STRICT_FATAN2,
2608 ISD::FCEIL, ISD::STRICT_FCEIL,
2609 ISD::FCOS, ISD::STRICT_FCOS,
2610 ISD::FCOSH, ISD::STRICT_FCOSH,
2611 ISD::FEXP, ISD::STRICT_FEXP,
2612 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2614 ISD::FLOG, ISD::STRICT_FLOG,
2615 ISD::FLOG10, ISD::STRICT_FLOG10,
2616 ISD::FPOW, ISD::STRICT_FPOW,
2617 ISD::FSIN, ISD::STRICT_FSIN,
2618 ISD::FSINH, ISD::STRICT_FSINH,
2619 ISD::FTAN, ISD::STRICT_FTAN,
2620 ISD::FTANH, ISD::STRICT_FTANH,
2621 // TODO: Add ISD::STRICT_FMODF too once implemented.
2622 ISD::FMODF})
2623 if (isOperationExpand(Op, MVT::f32))
2624 setOperationAction(Op, MVT::f32, Promote);
2625 // clang-format on
2626
2627 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2628 // it, but it's just a wrapper around ldexp.
2629 if (Subtarget.isOSWindows()) {
2630 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2631 if (isOperationExpand(Op, MVT::f32))
2632 setOperationAction(Op, MVT::f32, Promote);
2633 }
2634
2635 // We have target-specific dag combine patterns for the following nodes:
2643 ISD::BITCAST,
2646 ISD::SHL,
2647 ISD::SRA,
2648 ISD::SRL,
2649 ISD::OR,
2650 ISD::AND,
2656 ISD::ADD,
2657 ISD::FADD,
2658 ISD::FSUB,
2659 ISD::FNEG,
2660 ISD::FMA,
2662 ISD::FMINNUM,
2663 ISD::FMAXNUM,
2664 ISD::SUB,
2665 ISD::LOAD,
2666 ISD::LRINT,
2667 ISD::LLRINT,
2668 ISD::MLOAD,
2669 ISD::STORE,
2670 ISD::MSTORE,
2686 ISD::SETCC,
2687 ISD::MUL,
2688 ISD::XOR,
2689 ISD::MSCATTER,
2690 ISD::MGATHER,
2691 ISD::FP16_TO_FP,
2692 ISD::FP_EXTEND,
2699
2700 computeRegisterProperties(Subtarget.getRegisterInfo());
2701
2702 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2704 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2706 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2708
2709 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2710 // that needs to be benchmarked and balanced with the potential use of vector
2711 // load/store types (PR33329, PR33914).
2714
2715 // Default loop alignment, which can be overridden by -align-loops.
2717
2718 // An out-of-order CPU can speculatively execute past a predictable branch,
2719 // but a conditional move could be stalled by an expensive earlier operation.
2720 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2721 EnableExtLdPromotion = true;
2723
2725
2726 // Default to having -disable-strictnode-mutation on
2727 IsStrictFPEnabled = true;
2728}
2729
2730// This has so far only been implemented for 64-bit MachO.
2732 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2733}
2734
2736 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2737 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2738}
2739
2741 const SDLoc &DL) const {
2742 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2743 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2744 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2745 return SDValue(Node, 0);
2746}
2747
2750 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2751 !Subtarget.hasBWI())
2752 return TypeSplitVector;
2753
2754 // Since v8f16 is legal, widen anything over v4f16.
2755 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2756 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2757 VT.getVectorElementType() == MVT::f16)
2758 return TypeSplitVector;
2759
2760 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2761 VT.getVectorElementType() != MVT::i1)
2762 return TypeWidenVector;
2763
2765}
2766
2767FastISel *
2769 const TargetLibraryInfo *libInfo) const {
2770 return X86::createFastISel(funcInfo, libInfo);
2771}
2772
2773//===----------------------------------------------------------------------===//
2774// Other Lowering Hooks
2775//===----------------------------------------------------------------------===//
2776
2778 bool AssumeSingleUse) {
2779 if (!AssumeSingleUse && !Op.hasOneUse())
2780 return false;
2781 if (!ISD::isNormalLoad(Op.getNode()))
2782 return false;
2783
2784 // If this is an unaligned vector, make sure the target supports folding it.
2785 auto *Ld = cast<LoadSDNode>(Op.getNode());
2786 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2787 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2788 return false;
2789
2790 // TODO: If this is a non-temporal load and the target has an instruction
2791 // for it, it should not be folded. See "useNonTemporalLoad()".
2792
2793 return true;
2794}
2795
2797 const X86Subtarget &Subtarget,
2798 bool AssumeSingleUse) {
2799 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2800 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2801 return false;
2802
2803 // We cannot replace a wide volatile load with a broadcast-from-memory,
2804 // because that would narrow the load, which isn't legal for volatiles.
2805 auto *Ld = cast<LoadSDNode>(Op.getNode());
2806 return !Ld->isVolatile() ||
2807 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2808}
2809
2811 if (!Op.hasOneUse())
2812 return false;
2813 // Peek through (oneuse) bitcast users
2814 SDNode *User = *Op->user_begin();
2815 while (User->getOpcode() == ISD::BITCAST) {
2816 if (!User->hasOneUse())
2817 return false;
2818 User = *User->user_begin();
2819 }
2820 return ISD::isNormalStore(User);
2821}
2822
2824 if (Op.hasOneUse()) {
2825 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2826 return (ISD::ZERO_EXTEND == Opcode);
2827 }
2828 return false;
2829}
2830
2831static bool isLogicOp(unsigned Opcode) {
2832 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2833 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2834}
2835
2836static bool isTargetShuffle(unsigned Opcode) {
2837 switch(Opcode) {
2838 default: return false;
2839 case X86ISD::BLENDI:
2840 case X86ISD::PSHUFB:
2841 case X86ISD::PSHUFD:
2842 case X86ISD::PSHUFHW:
2843 case X86ISD::PSHUFLW:
2844 case X86ISD::SHUFP:
2845 case X86ISD::INSERTPS:
2846 case X86ISD::EXTRQI:
2847 case X86ISD::INSERTQI:
2848 case X86ISD::VALIGN:
2849 case X86ISD::PALIGNR:
2850 case X86ISD::VSHLDQ:
2851 case X86ISD::VSRLDQ:
2852 case X86ISD::MOVLHPS:
2853 case X86ISD::MOVHLPS:
2854 case X86ISD::MOVSHDUP:
2855 case X86ISD::MOVSLDUP:
2856 case X86ISD::MOVDDUP:
2857 case X86ISD::MOVSS:
2858 case X86ISD::MOVSD:
2859 case X86ISD::MOVSH:
2860 case X86ISD::UNPCKL:
2861 case X86ISD::UNPCKH:
2862 case X86ISD::VBROADCAST:
2863 case X86ISD::VPERMILPI:
2864 case X86ISD::VPERMILPV:
2865 case X86ISD::VPERM2X128:
2866 case X86ISD::SHUF128:
2867 case X86ISD::VPERMIL2:
2868 case X86ISD::VPERMI:
2869 case X86ISD::VPPERM:
2870 case X86ISD::VPERMV:
2871 case X86ISD::VPERMV3:
2872 case X86ISD::VZEXT_MOVL:
2873 return true;
2874 }
2875}
2876
2877static bool isTargetShuffleVariableMask(unsigned Opcode) {
2878 switch (Opcode) {
2879 default: return false;
2880 // Target Shuffles.
2881 case X86ISD::PSHUFB:
2882 case X86ISD::VPERMILPV:
2883 case X86ISD::VPERMIL2:
2884 case X86ISD::VPPERM:
2885 case X86ISD::VPERMV:
2886 case X86ISD::VPERMV3:
2887 return true;
2888 // 'Faux' Target Shuffles.
2889 case ISD::OR:
2890 case ISD::AND:
2891 case X86ISD::ANDNP:
2892 return true;
2893 }
2894}
2895
2898 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2900 int ReturnAddrIndex = FuncInfo->getRAIndex();
2901
2902 if (ReturnAddrIndex == 0) {
2903 // Set up a frame object for the return address.
2904 unsigned SlotSize = RegInfo->getSlotSize();
2905 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2906 -(int64_t)SlotSize,
2907 false);
2908 FuncInfo->setRAIndex(ReturnAddrIndex);
2909 }
2910
2911 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2912}
2913
2915 bool HasSymbolicDisplacement) {
2916 // The offset should fit into a 32-bit immediate field.
2917 if (!isInt<32>(Offset))
2918 return false;
2919
2920 // If we don't have a symbolic displacement - we don't have any extra
2921 // restrictions.
2922 if (!HasSymbolicDisplacement)
2923 return true;
2924
2925 // We can fold large offsets in the large code model because we always use
2926 // 64-bit offsets.
2927 if (CM == CodeModel::Large)
2928 return true;
2929
2930 // For the kernel code model we know that all objects reside in the negative
2931 // half of the 32-bit address space. We must not accept negative offsets, as
2932 // they may be just out of range, but we can accept pretty large positive ones.
2933 if (CM == CodeModel::Kernel)
2934 return Offset >= 0;
2935
2936 // For other non-large code models we assume that the last small object is
2937 // within 16MB of the end of the 31-bit address-space boundary. We can also
2938 // accept pretty large negative constants, knowing that all objects are in
2939 // the positive half of the address space.
2940 return Offset < 16 * 1024 * 1024;
2941}
2942
2943/// Return true if the condition is a signed comparison operation.
2944static bool isX86CCSigned(X86::CondCode X86CC) {
2945 switch (X86CC) {
2946 default:
2947 llvm_unreachable("Invalid integer condition!");
2948 case X86::COND_E:
2949 case X86::COND_NE:
2950 case X86::COND_B:
2951 case X86::COND_A:
2952 case X86::COND_BE:
2953 case X86::COND_AE:
2954 return false;
2955 case X86::COND_G:
2956 case X86::COND_GE:
2957 case X86::COND_L:
2958 case X86::COND_LE:
2959 return true;
2960 }
2961}
2962
2964 switch (SetCCOpcode) {
2965 // clang-format off
2966 default: llvm_unreachable("Invalid integer condition!");
2967 case ISD::SETEQ: return X86::COND_E;
2968 case ISD::SETGT: return X86::COND_G;
2969 case ISD::SETGE: return X86::COND_GE;
2970 case ISD::SETLT: return X86::COND_L;
2971 case ISD::SETLE: return X86::COND_LE;
2972 case ISD::SETNE: return X86::COND_NE;
2973 case ISD::SETULT: return X86::COND_B;
2974 case ISD::SETUGT: return X86::COND_A;
2975 case ISD::SETULE: return X86::COND_BE;
2976 case ISD::SETUGE: return X86::COND_AE;
2977 // clang-format on
2978 }
2979}
2980
2981/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2982/// condition code, returning the condition code and the LHS/RHS of the
2983/// comparison to make.
2985 bool isFP, SDValue &LHS, SDValue &RHS,
2986 SelectionDAG &DAG) {
2987 if (!isFP) {
2989 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2990 // X > -1 -> X == 0, jump !sign.
2991 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2992 return X86::COND_NS;
2993 }
2994 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2995 // X < 0 -> X == 0, jump on sign.
2996 return X86::COND_S;
2997 }
2998 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2999 // X >= 0 -> X == 0, jump on !sign.
3000 return X86::COND_NS;
3001 }
3002 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3003 // X < 1 -> X <= 0
3004 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3005 return X86::COND_LE;
3006 }
3007 }
3008
3009 return TranslateIntegerX86CC(SetCCOpcode);
3010 }
3011
3012 // First determine if it is required or profitable to flip the operands.
3013
3014 // If LHS is a foldable load, but RHS is not, flip the condition.
3015 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3016 !ISD::isNON_EXTLoad(RHS.getNode())) {
3017 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3018 std::swap(LHS, RHS);
3019 }
3020
3021 switch (SetCCOpcode) {
3022 default: break;
3023 case ISD::SETOLT:
3024 case ISD::SETOLE:
3025 case ISD::SETUGT:
3026 case ISD::SETUGE:
3027 std::swap(LHS, RHS);
3028 break;
3029 }
3030
3031 // On a floating point condition, the flags are set as follows:
3032 // ZF PF CF op
3033 // 0 | 0 | 0 | X > Y
3034 // 0 | 0 | 1 | X < Y
3035 // 1 | 0 | 0 | X == Y
3036 // 1 | 1 | 1 | unordered
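 // Illustrative example (not from the source): SETOLT has its operands
 // swapped above, so "X <o Y" is checked as "Y > X" and maps to COND_A
 // (CF==0 && ZF==0); on unordered inputs ZF=PF=CF=1, so COND_A is false,
 // as ordered semantics require.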
3037 switch (SetCCOpcode) {
3038 // clang-format off
3039 default: llvm_unreachable("Condcode should be pre-legalized away");
3040 case ISD::SETUEQ:
3041 case ISD::SETEQ: return X86::COND_E;
3042 case ISD::SETOLT: // flipped
3043 case ISD::SETOGT:
3044 case ISD::SETGT: return X86::COND_A;
3045 case ISD::SETOLE: // flipped
3046 case ISD::SETOGE:
3047 case ISD::SETGE: return X86::COND_AE;
3048 case ISD::SETUGT: // flipped
3049 case ISD::SETULT:
3050 case ISD::SETLT: return X86::COND_B;
3051 case ISD::SETUGE: // flipped
3052 case ISD::SETULE:
3053 case ISD::SETLE: return X86::COND_BE;
3054 case ISD::SETONE:
3055 case ISD::SETNE: return X86::COND_NE;
3056 case ISD::SETUO: return X86::COND_P;
3057 case ISD::SETO: return X86::COND_NP;
3058 case ISD::SETOEQ:
3059 case ISD::SETUNE: return X86::COND_INVALID;
3060 // clang-format on
3061 }
3062}
3063
3064/// Is there a floating point cmov for the specific X86 condition code?
3065/// Current x86 isa includes the following FP cmov instructions:
3066/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3067static bool hasFPCMov(unsigned X86CC) {
3068 switch (X86CC) {
3069 default:
3070 return false;
3071 case X86::COND_B:
3072 case X86::COND_BE:
3073 case X86::COND_E:
3074 case X86::COND_P:
3075 case X86::COND_A:
3076 case X86::COND_AE:
3077 case X86::COND_NE:
3078 case X86::COND_NP:
3079 return true;
3080 }
3081}
3082
3083static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3084 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3085 VT.is512BitVector();
3086}
3087
3089 const CallInst &I,
3090 MachineFunction &MF,
3091 unsigned Intrinsic) const {
3092 Info.flags = MachineMemOperand::MONone;
3093 Info.offset = 0;
3094
3096 if (!IntrData) {
3097 switch (Intrinsic) {
3098 case Intrinsic::x86_aesenc128kl:
3099 case Intrinsic::x86_aesdec128kl:
3100 Info.opc = ISD::INTRINSIC_W_CHAIN;
3101 Info.ptrVal = I.getArgOperand(1);
3102 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3103 Info.align = Align(1);
3104 Info.flags |= MachineMemOperand::MOLoad;
3105 return true;
3106 case Intrinsic::x86_aesenc256kl:
3107 case Intrinsic::x86_aesdec256kl:
3108 Info.opc = ISD::INTRINSIC_W_CHAIN;
3109 Info.ptrVal = I.getArgOperand(1);
3110 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3111 Info.align = Align(1);
3112 Info.flags |= MachineMemOperand::MOLoad;
3113 return true;
3114 case Intrinsic::x86_aesencwide128kl:
3115 case Intrinsic::x86_aesdecwide128kl:
3116 Info.opc = ISD::INTRINSIC_W_CHAIN;
3117 Info.ptrVal = I.getArgOperand(0);
3118 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3119 Info.align = Align(1);
3120 Info.flags |= MachineMemOperand::MOLoad;
3121 return true;
3122 case Intrinsic::x86_aesencwide256kl:
3123 case Intrinsic::x86_aesdecwide256kl:
3124 Info.opc = ISD::INTRINSIC_W_CHAIN;
3125 Info.ptrVal = I.getArgOperand(0);
3126 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3127 Info.align = Align(1);
3128 Info.flags |= MachineMemOperand::MOLoad;
3129 return true;
3130 case Intrinsic::x86_cmpccxadd32:
3131 case Intrinsic::x86_cmpccxadd64:
3132 case Intrinsic::x86_atomic_bts:
3133 case Intrinsic::x86_atomic_btc:
3134 case Intrinsic::x86_atomic_btr: {
3135 Info.opc = ISD::INTRINSIC_W_CHAIN;
3136 Info.ptrVal = I.getArgOperand(0);
3137 unsigned Size = I.getType()->getScalarSizeInBits();
3138 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3139 Info.align = Align(Size);
3142 return true;
3143 }
3144 case Intrinsic::x86_atomic_bts_rm:
3145 case Intrinsic::x86_atomic_btc_rm:
3146 case Intrinsic::x86_atomic_btr_rm: {
3147 Info.opc = ISD::INTRINSIC_W_CHAIN;
3148 Info.ptrVal = I.getArgOperand(0);
3149 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3150 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3151 Info.align = Align(Size);
3154 return true;
3155 }
3156 case Intrinsic::x86_aadd32:
3157 case Intrinsic::x86_aadd64:
3158 case Intrinsic::x86_aand32:
3159 case Intrinsic::x86_aand64:
3160 case Intrinsic::x86_aor32:
3161 case Intrinsic::x86_aor64:
3162 case Intrinsic::x86_axor32:
3163 case Intrinsic::x86_axor64:
3164 case Intrinsic::x86_atomic_add_cc:
3165 case Intrinsic::x86_atomic_sub_cc:
3166 case Intrinsic::x86_atomic_or_cc:
3167 case Intrinsic::x86_atomic_and_cc:
3168 case Intrinsic::x86_atomic_xor_cc: {
3169 Info.opc = ISD::INTRINSIC_W_CHAIN;
3170 Info.ptrVal = I.getArgOperand(0);
3171 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3172 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3173 Info.align = Align(Size);
3176 return true;
3177 }
3178 }
3179 return false;
3180 }
3181
3182 switch (IntrData->Type) {
3185 case TRUNCATE_TO_MEM_VI32: {
3186 Info.opc = ISD::INTRINSIC_VOID;
3187 Info.ptrVal = I.getArgOperand(0);
3188 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3190 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3191 ScalarVT = MVT::i8;
3192 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3193 ScalarVT = MVT::i16;
3194 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3195 ScalarVT = MVT::i32;
3196
3197 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3198 Info.align = Align(1);
3199 Info.flags |= MachineMemOperand::MOStore;
3200 break;
3201 }
3202 case GATHER:
3203 case GATHER_AVX2: {
3204 Info.opc = ISD::INTRINSIC_W_CHAIN;
3205 Info.ptrVal = nullptr;
3206 MVT DataVT = MVT::getVT(I.getType());
3207 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3208 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3209 IndexVT.getVectorNumElements());
3210 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3211 Info.align = Align(1);
3212 Info.flags |= MachineMemOperand::MOLoad;
3213 break;
3214 }
3215 case SCATTER: {
3216 Info.opc = ISD::INTRINSIC_VOID;
3217 Info.ptrVal = nullptr;
3218 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3219 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3220 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3221 IndexVT.getVectorNumElements());
3222 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3223 Info.align = Align(1);
3224 Info.flags |= MachineMemOperand::MOStore;
3225 break;
3226 }
3227 default:
3228 return false;
3229 }
3230
3231 return true;
3232}
3233
3234/// Returns true if the target can instruction select the
3235/// specified FP immediate natively. If false, the legalizer will
3236/// materialize the FP immediate as a load from a constant pool.
3238 bool ForCodeSize) const {
3239 for (const APFloat &FPImm : LegalFPImmediates)
3240 if (Imm.bitwiseIsEqual(FPImm))
3241 return true;
3242 return false;
3243}
3244
3246 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3247 std::optional<unsigned> ByteOffset) const {
3248 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3249
3250 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3251 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3252 N = *N->user_begin();
3253 return N;
3254 };
3255
3256 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3257 // relocations target a movq or addq instruction: don't let the load shrink.
3258 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3259 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3260 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3261 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3262
3263 // If this is (1) an AVX vector load with (2) multiple uses and either (3)
3264 // all of those uses extract directly into a store, so the extract + store
3265 // can be store-folded, or (4) some use is a legal full-width instruction,
3266 // then it's probably not worth splitting the load.
3267 EVT VT = Load->getValueType(0);
3268 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3269 !SDValue(Load, 0).hasOneUse()) {
3270 bool FullWidthUse = false;
3271 bool AllExtractStores = true;
3272 for (SDUse &Use : Load->uses()) {
3273 // Skip uses of the chain value. Result 0 of the node is the load value.
3274 if (Use.getResNo() != 0)
3275 continue;
3276
3277 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3278
3279 // If this use is an extract + store, it's probably not worth splitting.
3280 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3281 all_of(User->uses(), [&](const SDUse &U) {
3282 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3283 return Inner->getOpcode() == ISD::STORE;
3284 }))
3285 continue;
3286
3287 AllExtractStores = false;
3288
3289 // If any use is a full-width legal/target bin op, then assume it's legal
3290 // and won't be split.
3291 if (isBinOp(User->getOpcode()) &&
3292 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3293 User->getOpcode() > ISD::BUILTIN_OP_END))
3294 FullWidthUse = true;
3295 }
3296
3297 if (AllExtractStores)
3298 return false;
3299
3300 // If we have a user that uses the full vector width, then the load is
3301 // only worth splitting if the offset isn't 0 (to avoid an
3302 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3303 if (FullWidthUse)
3304 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3305 }
3306
3307 return true;
3308}
3309
3310/// Returns true if it is beneficial to convert a load of a constant
3311/// to just the constant itself.
3313 Type *Ty) const {
3314 assert(Ty->isIntegerTy());
3315
3316 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3317 if (BitSize == 0 || BitSize > 64)
3318 return false;
3319 return true;
3320}
3321
3323 // If we are using XMM registers in the ABI and the condition of the select is
3324 // a floating-point compare and we have blendv or conditional move, then it is
3325 // cheaper to select instead of doing a cross-register move and creating a
3326 // load that depends on the compare result.
3327 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3328 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3329}
3330
3332 // TODO: It might be a win to ease or lift this restriction, but the generic
3333 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3334 if (VT.isVector() && Subtarget.hasAVX512())
3335 return false;
3336
3337 return true;
3338}
3339
3341 SDValue C) const {
3342 // TODO: We handle scalars using custom code, but generic combining could make
3343 // that unnecessary.
3344 APInt MulC;
3345 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3346 return false;
3347
3348 // Find the type this will be legalized to. Otherwise we might prematurely
3349 // convert this to shl+add/sub and then still have to type legalize those ops.
3350 // Another choice would be to defer the decision for illegal types until
3351 // after type legalization. But constant splat vectors of i64 can't make it
3352 // through type legalization on 32-bit targets so we would need to special
3353 // case vXi64.
3354 while (getTypeAction(Context, VT) != TypeLegal)
3355 VT = getTypeToTransformTo(Context, VT);
3356
3357 // If vector multiply is legal, assume that's faster than shl + add/sub.
3358 // Multiply is a complex op with higher latency and lower throughput in
3359 // most implementations; sub-vXi32 vector multiplies are always fast,
3360 // vXi32 mustn't have a slow PMULLD implementation, and anything larger
3361 // (vXi64) is always going to be slow.
3362 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3363 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3364 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3365 return false;
3366
3367 // shl+add, shl+sub, shl+add+neg
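 // Illustrative decompositions (not from the source), for a scalar x:
 //   mul x, 5  -> (x << 2) + x          (MulC - 1 is a power of 2)
 //   mul x, 7  -> (x << 3) - x          (MulC + 1 is a power of 2)
 //   mul x, -3 -> x - (x << 2)          (1 - MulC is a power of 2)
 //   mul x, -9 -> -((x << 3) + x)       (-(MulC + 1) is a power of 2)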
3368 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3369 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3370}
3371
3373 unsigned Index) const {
3375 return false;
3376
3377 // Mask vectors support all subregister combinations and operations that
3378 // extract half of a vector.
3379 if (ResVT.getVectorElementType() == MVT::i1)
3380 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3381 (Index == ResVT.getVectorNumElements()));
3382
3383 return (Index % ResVT.getVectorNumElements()) == 0;
3384}
3385
3387 unsigned Opc = VecOp.getOpcode();
3388
3389 // Assume target opcodes can't be scalarized.
3390 // TODO - do we have any exceptions?
3391 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3392 return false;
3393
3394 // If the vector op is not supported, try to convert to scalar.
3395 EVT VecVT = VecOp.getValueType();
3397 return true;
3398
3399 // If the vector op is supported, but the scalar op is not, the transform may
3400 // not be worthwhile.
3401 EVT ScalarVT = VecVT.getScalarType();
3402 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3403}
3404
3406 bool) const {
3407 // TODO: Allow vectors?
3408 if (VT.isVector())
3409 return false;
3410 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3411}
3412
3414 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3415 // i32/i64 or can rely on BSF passthrough value.
3416 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3417 Subtarget.hasBitScanPassThrough() ||
3418 (!Ty->isVectorTy() &&
3419 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3420}
3421
3423 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3424 // passthrough value.
3425 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3426 Subtarget.hasBitScanPassThrough();
3427}
3428
3430 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3431 // expensive than a straight movsd. On the other hand, it's important to
3432 // shrink long double fp constants since fldt is very slow.
3433 return !Subtarget.hasSSE2() || VT == MVT::f80;
3434}
3435
3437 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3438 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3439}
3440
3442 const SelectionDAG &DAG,
3443 const MachineMemOperand &MMO) const {
3444 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3445 BitcastVT.getVectorElementType() == MVT::i1)
3446 return false;
3447
3448 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3449 return false;
3450
3451 // If both types are legal vectors, it's always ok to convert them.
3452 if (LoadVT.isVector() && BitcastVT.isVector() &&
3453 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3454 return true;
3455
3456 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3457}
3458
3460 const MachineFunction &MF) const {
3461 // Do not merge to float value size (128 bits) if the NoImplicitFloat
3462 // attribute is set.
3463 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3464
3465 if (NoFloat) {
3466 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3467 return (MemVT.getSizeInBits() <= MaxIntSize);
3468 }
3469 // Make sure we don't merge greater than our preferred vector
3470 // width.
3471 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3472 return false;
3473
3474 return true;
3475}
3476
3478 return Subtarget.hasFastLZCNT();
3479}
3480
3482 const Instruction &AndI) const {
3483 return true;
3484}
3485
3487 EVT VT = Y.getValueType();
3488
3489 if (VT.isVector())
3490 return false;
3491
3492 if (!Subtarget.hasBMI())
3493 return false;
3494
3495 // There are only 32-bit and 64-bit forms for 'andn'.
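 // Illustrative pattern (not from the source): with BMI, a compare such as
 //   if ((~x & y) != 0) ...
 // can use ANDN, which computes ~x & y and sets ZF, avoiding a separate NOT.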
3496 if (VT != MVT::i32 && VT != MVT::i64)
3497 return false;
3498
3499 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3500}
3501
3503 EVT VT = Y.getValueType();
3504
3505 if (!VT.isVector())
3506 return hasAndNotCompare(Y);
3507
3508 // Vector.
3509
3510 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3511 return false;
3512
3513 if (VT == MVT::v4i32)
3514 return true;
3515
3516 return Subtarget.hasSSE2();
3517}
3518
3520 return X.getValueType().isScalarInteger(); // 'bt'
3521}
3522
3526 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3527 SelectionDAG &DAG) const {
3528 // Does baseline recommend not to perform the fold by default?
3530 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3531 return false;
3532 // For scalars this transform is always beneficial.
3533 if (X.getValueType().isScalarInteger())
3534 return true;
3535 // If all the shift amounts are identical, then the transform is beneficial even
3536 // with rudimentary SSE2 shifts.
3537 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3538 return true;
3539 // If we have AVX2 with its powerful shift operations, then it's also good.
3540 if (Subtarget.hasAVX2())
3541 return true;
3542 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3543 return NewShiftOpcode == ISD::SHL;
3544}
3545
3547 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3548 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3549 if (!VT.isInteger())
3550 return ShiftOpc;
3551
3552 bool PreferRotate = false;
3553 if (VT.isVector()) {
3554 // For vectors, if we have rotate instruction support, then it's definitely
3555 // best. Otherwise it's not clear what's best, so just don't make changes.
3556 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3557 VT.getScalarType() == MVT::i64);
3558 } else {
3559 // For scalars, if we have BMI2, prefer rotate (for rorx). Otherwise prefer
3560 // rotate unless we have a zext mask+shr.
3561 PreferRotate = Subtarget.hasBMI2();
3562 if (!PreferRotate) {
3563 unsigned MaskBits =
3564 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3565 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3566 }
3567 }
3568
3569 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3570 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3571
3572 if (PreferRotate && MayTransformRotate)
3573 return ISD::ROTL;
3574
3575 // If this is a vector we don't really get much benefit from swapping around
3576 // constants. Maybe we could check in the future whether the DAG already has
3577 // the flipped node.
3578 if (VT.isVector())
3579 return ShiftOpc;
3580
3581 // See if it's beneficial to swap the shift type.
3582 if (ShiftOpc == ISD::SHL) {
3583 // If the current setup has an imm64 mask, then the inverse will have
3584 // at least an imm32 mask (or be a zext i32 -> i64).
3585 if (VT == MVT::i64)
3586 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3587 : ShiftOpc;
3588
3589 // We can only benefit if the mask requires at least 7 bits. We
3590 // don't want to replace shl of 1, 2 or 3, as they can be implemented
3591 // with lea/add.
3592 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3593 }
3594
3595 if (VT == MVT::i64)
3596 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3597 // extremely efficient.
3598 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3599
3600 // Keep small shifts as shl so we can generate add/lea.
3601 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3602 }
3603
3604 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3605 // (PreferRotate will be set in the latter case).
3606 if (PreferRotate || !MayTransformRotate || VT.isVector())
3607 return ShiftOpc;
3608
3609 // Non-vector type and we have a zext mask with SRL.
3610 return ISD::SRL;
3611}
3612
3615 const Value *Lhs,
3616 const Value *Rhs) const {
3617 using namespace llvm::PatternMatch;
3618 int BaseCost = BrMergingBaseCostThresh.getValue();
3619 // With CCMP, branches can be merged in a more efficient way.
3620 if (BaseCost >= 0 && Subtarget.hasCCMP())
3621 BaseCost += BrMergingCcmpBias;
3622 // a == b && a == c is a fast pattern on x86.
3623 if (BaseCost >= 0 && Opc == Instruction::And &&
3626 BaseCost += 1;
3627 return {BaseCost, BrMergingLikelyBias.getValue(),
3628 BrMergingUnlikelyBias.getValue()};
3629}
3630
3632 return N->getOpcode() != ISD::FP_EXTEND;
3633}
3634
3636 const SDNode *N, CombineLevel Level) const {
3637 assert(((N->getOpcode() == ISD::SHL &&
3638 N->getOperand(0).getOpcode() == ISD::SRL) ||
3639 (N->getOpcode() == ISD::SRL &&
3640 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3641 "Expected shift-shift mask");
3642 // TODO: Should we always create i64 masks? Or only folded immediates?
3643 EVT VT = N->getValueType(0);
3644 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3645 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3646 // Only fold if the shift values are equal - so it folds to AND.
3647 // TODO - we should fold if either is a non-uniform vector but we don't do
3648 // the fold for non-splats yet.
3649 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3650 }
3652}
3653
3655 EVT VT = Y.getValueType();
3656
3657 // For vectors, we don't have a preference, but we probably want a mask.
3658 if (VT.isVector())
3659 return false;
3660
3661 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3662 return VT.getScalarSizeInBits() <= MaxWidth;
3663}
3664
3667 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3669 !Subtarget.isOSWindows())
3672 ExpansionFactor);
3673}
3674
3676 // Any legal vector type can be splatted more efficiently than
3677 // loading/spilling from memory.
3678 return isTypeLegal(VT);
3679}
3680
3682 MVT VT = MVT::getIntegerVT(NumBits);
3683 if (isTypeLegal(VT))
3684 return VT;
3685
3686 // PMOVMSKB can handle this.
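 // Illustrative lowering sketch (not from the source): a 128-bit equality
 // compare of two XMM values is roughly PCMPEQB (byte-wise compare), then
 // PMOVMSKB (move the per-byte mask into a GPR), then a compare of that
 // mask against 0xFFFF.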
3687 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3688 return MVT::v16i8;
3689
3690 // VPMOVMSKB can handle this.
3691 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3692 return MVT::v32i8;
3693
3694 // TODO: Allow 64-bit type for 32-bit target.
3695 // TODO: 512-bit types should be allowed, but make sure that those
3696 // cases are handled in combineVectorSizedSetCCEquality().
3697
3699}
3700
3701/// Val is the undef sentinel value or equal to the specified value.
3702static bool isUndefOrEqual(int Val, int CmpVal) {
3703 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3704}
3705
3706/// Return true if every element in Mask is the undef sentinel value or equal to
3707/// the specified value.
3708static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3709 return llvm::all_of(Mask, [CmpVal](int M) {
3710 return (M == SM_SentinelUndef) || (M == CmpVal);
3711 });
3712}
3713
3714/// Return true if every element in Mask, beginning from position Pos and ending
3715/// in Pos+Size is the undef sentinel value or equal to the specified value.
3716static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3717 unsigned Size) {
3718 return llvm::all_of(Mask.slice(Pos, Size),
3719 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3720}
3721
3722/// Val is either the undef or zero sentinel value.
3723static bool isUndefOrZero(int Val) {
3724 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3725}
3726
3727/// Return true if every element in Mask, beginning from position Pos and ending
3728/// in Pos+Size is the undef sentinel value.
3729static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3730 return llvm::all_of(Mask.slice(Pos, Size),
3731 [](int M) { return M == SM_SentinelUndef; });
3732}
3733
3734/// Return true if the mask creates a vector whose lower half is undefined.
3736 unsigned NumElts = Mask.size();
3737 return isUndefInRange(Mask, 0, NumElts / 2);
3738}
3739
3740/// Return true if the mask creates a vector whose upper half is undefined.
3742 unsigned NumElts = Mask.size();
3743 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3744}
3745
3746/// Return true if Val falls within the specified range [Low, Hi).
3747static bool isInRange(int Val, int Low, int Hi) {
3748 return (Val >= Low && Val < Hi);
3749}
3750
3751/// Return true if the value of any element in Mask falls within the specified
3752/// range [Low, Hi).
3753static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3754 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3755}
3756
3757/// Return true if the value of any element in Mask is the zero sentinel value.
3758static bool isAnyZero(ArrayRef<int> Mask) {
3759 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3760}
3761
3762/// Return true if Val is undef or if its value falls within the
3763/// specified range [Low, Hi).
3764static bool isUndefOrInRange(int Val, int Low, int Hi) {
3765 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3766}
3767
3768/// Return true if every element in Mask is undef or if its value
3769/// falls within the specified range [Low, Hi).
3770static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3771 return llvm::all_of(
3772 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3773}
3774
3775/// Return true if Val is undef, zero or if its value falls within the
3776/// specified range [Low, Hi).
3777static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3778 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3779}
3780
3781/// Return true if every element in Mask is undef, zero or if its value
3782/// falls within the specified range [Low, Hi).
3783static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3784 return llvm::all_of(
3785 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3786}
3787
3788/// Return true if every element in Mask is an in-place blend/select mask or is
3789/// undef.
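/// Illustrative example (not from the source): for a 4-element mask,
/// {0, 5, 2, 7} is an in-place blend (lane I selects lane I of either operand),
/// while {1, 5, 2, 7} is not, because lane 0 would move.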
3791 unsigned NumElts = Mask.size();
3792 for (auto [I, M] : enumerate(Mask))
3793 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3794 return false;
3795 return true;
3796}
3797
3798/// Return true if every element in Mask, beginning
3799/// from position Pos and ending in Pos + Size, falls within the specified
3800/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
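/// Illustrative example (not from the source): with Pos = 2, Size = 4, Low = 4
/// and the default Step = 1, the mask {9, 9, 4, -1, 6, 7, 0, 1} matches, since
/// elements 2..5 are {4, undef, 6, 7}.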
3801static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3802 unsigned Size, int Low, int Step = 1) {
3803 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3804 if (!isUndefOrEqual(Mask[i], Low))
3805 return false;
3806 return true;
3807}
3808
3809/// Return true if every element in Mask, beginning
3810/// from position Pos and ending in Pos+Size, falls within the specified
3811/// sequential range [Low, Low+Size), or is undef or is zero.
3813 unsigned Size, int Low,
3814 int Step = 1) {
3815 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3816 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3817 return false;
3818 return true;
3819}
3820
3821/// Return true if every element in Mask, beginning
3822/// from position Pos and ending in Pos+Size is undef or is zero.
3823static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3824 unsigned Size) {
3825 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3826}
3827
3828/// Return true if every element of a single input is referenced by the shuffle
3829/// mask. i.e. it just permutes them all.
3831 unsigned NumElts = Mask.size();
3832 APInt DemandedElts = APInt::getZero(NumElts);
3833 for (int M : Mask)
3834 if (isInRange(M, 0, NumElts))
3835 DemandedElts.setBit(M);
3836 return DemandedElts.isAllOnes();
3837}
3838
3839/// Helper function to test whether a shuffle mask could be
3840/// simplified by widening the elements being shuffled.
3841///
3842/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3843/// leaves it in an unspecified state.
3844///
3845/// NOTE: This must handle normal vector shuffle masks and *target* vector
3846/// shuffle masks. The latter have the special property of a '-2' representing
3847/// a zero-ed lane of a vector.
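/// Illustrative examples (not from the source), widening 4 mask elements to 2
/// (-1 is undef, -2 is zero):
///   {0, 1, 6, 7}   -> {0, 3}    adjacent, aligned pairs
///   {-1, 3, 4, -1} -> {1, 2}    undef lanes adopt their partner's pair
///   {-2, -1, 2, 3} -> {-2, 1}   zero spreads across the widened lane
///   {0, 2, 4, 6}   -> fails     pairs are not adjacent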
3849 SmallVectorImpl<int> &WidenedMask) {
3850 WidenedMask.assign(Mask.size() / 2, 0);
3851 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3852 int M0 = Mask[i];
3853 int M1 = Mask[i + 1];
3854
3855 // If both elements are undef, its trivial.
3856 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3857 WidenedMask[i / 2] = SM_SentinelUndef;
3858 continue;
3859 }
3860
3861 // Check for an undef mask and a mask value properly aligned to fit with
3862 // a pair of values. If we find such a case, use the non-undef mask's value.
3863 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3864 WidenedMask[i / 2] = M1 / 2;
3865 continue;
3866 }
3867 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3868 WidenedMask[i / 2] = M0 / 2;
3869 continue;
3870 }
3871
3872 // When zeroing, we need to spread the zeroing across both lanes to widen.
3873 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3874 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3876 WidenedMask[i / 2] = SM_SentinelZero;
3877 continue;
3878 }
3879 return false;
3880 }
3881
3882 // Finally check if the two mask values are adjacent and aligned with
3883 // a pair.
3884 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3885 WidenedMask[i / 2] = M0 / 2;
3886 continue;
3887 }
3888
3889 // Otherwise we can't safely widen the elements used in this shuffle.
3890 return false;
3891 }
3892 assert(WidenedMask.size() == Mask.size() / 2 &&
3893 "Incorrect size of mask after widening the elements!");
3894
3895 return true;
3896}
3897
3899 const APInt &Zeroable,
3900 bool V2IsZero,
3901 SmallVectorImpl<int> &WidenedMask) {
3902 // Create an alternative mask with info about zeroable elements.
3903 // Here we do not set undef elements as zeroable.
3904 SmallVector<int, 64> ZeroableMask(Mask);
3905 if (V2IsZero) {
3906 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3907 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3908 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3909 ZeroableMask[i] = SM_SentinelZero;
3910 }
3911 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3912}
3913
3914 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3915  SmallVector<int, 32> WidenedMask;
3916 return canWidenShuffleElements(Mask, WidenedMask);
3917}
3918
3919// Attempt to narrow/widen shuffle mask until it matches the target number of
3920// elements.
3921static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3922 SmallVectorImpl<int> &ScaledMask) {
3923 unsigned NumSrcElts = Mask.size();
3924 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3925 "Illegal shuffle scale factor");
3926
3927 // Narrowing is guaranteed to work.
3928 if (NumDstElts >= NumSrcElts) {
3929 int Scale = NumDstElts / NumSrcElts;
3930 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3931 return true;
3932 }
3933
3934 // We have to repeat the widening until we reach the target size, but we can
3935 // split out the first widening as it sets up ScaledMask for us.
3936 if (canWidenShuffleElements(Mask, ScaledMask)) {
3937 while (ScaledMask.size() > NumDstElts) {
3938 SmallVector<int, 16> WidenedMask;
3939 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3940 return false;
3941 ScaledMask = std::move(WidenedMask);
3942 }
3943 return true;
3944 }
3945
3946 return false;
3947}
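// Worked example (illustrative): narrowing the v2 mask <1, 0> to 4 elements
// gives <2, 3, 0, 1>; widening the v8 mask <0, 1, 2, 3, 8, 9, 10, 11> to 2
// elements goes through <0, 1, 4, 5> and ends at <0, 2>. The v4 mask
// <0, 2, 1, 3> has no pairwise widening, so scaling it down fails.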
3948
3949static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3950 SmallVector<int, 32> ScaledMask;
3951 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3952}
3953
3954// Helper to grow the shuffle mask for a larger value type.
3955// NOTE: This is different to scaleShuffleElements which is a same size type.
3956static void growShuffleMask(ArrayRef<int> SrcMask,
3957 SmallVectorImpl<int> &DstMask,
3958 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3959  assert(DstMask.empty() && "Expected an empty shuffle mask");
3960 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3961 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3962 unsigned NumSrcElts = SrcMask.size();
3963 DstMask.assign(SrcMask.begin(), SrcMask.end());
3964 for (int &M : DstMask) {
3965 if (M < 0)
3966 continue;
3967 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3968 }
3969 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3970}
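// Worked example (illustrative): growing the 128-bit v4i32 mask <0, 4, 1, 5>
// to a 256-bit destination (Scale = 2) remaps the second-input indices and
// appends undefs, producing <0, 8, 1, 9, -1, -1, -1, -1>.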
3971
3972/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3973 bool X86::isZeroNode(SDValue Elt) {
3974  return isNullConstant(Elt) || isNullFPConstant(Elt);
3975}
3976
3977// Build a vector of constants.
3978// Use an UNDEF node if MaskElt == -1.
3979// Split 64-bit constants in the 32-bit mode.
3980 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3981                               const SDLoc &dl, bool IsMask = false) {
3982
3983  SmallVector<SDValue, 32> Ops;
3984  bool Split = false;
3985
3986 MVT ConstVecVT = VT;
3987 unsigned NumElts = VT.getVectorNumElements();
3988 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3989 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3990 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3991 Split = true;
3992 }
3993
3994 MVT EltVT = ConstVecVT.getVectorElementType();
3995 for (unsigned i = 0; i < NumElts; ++i) {
3996 bool IsUndef = Values[i] < 0 && IsMask;
3997 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3998 DAG.getConstant(Values[i], dl, EltVT);
3999 Ops.push_back(OpNode);
4000 if (Split)
4001 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(0, dl, EltVT));
4003 }
4004 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4005 if (Split)
4006 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4007 return ConstsNode;
4008}
4009
4010static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4011 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4012 assert(Bits.size() == Undefs.getBitWidth() &&
4013 "Unequal constant and undef arrays");
4014  SmallVector<SDValue, 32> Ops;
4015  bool Split = false;
4016
4017 MVT ConstVecVT = VT;
4018 unsigned NumElts = VT.getVectorNumElements();
4019 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4020 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4021 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4022 Split = true;
4023 }
4024
4025 MVT EltVT = ConstVecVT.getVectorElementType();
4026 MVT EltIntVT = EltVT.changeTypeToInteger();
4027 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4028 if (Undefs[i]) {
4029 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4030 continue;
4031 }
4032 const APInt &V = Bits[i];
4033 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4034 if (Split) {
4035 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4036 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4037 } else {
4038 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4039 }
4040 }
4041
4042 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4043 return DAG.getBitcast(VT, ConstsNode);
4044}
4045
4046 static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4047                               SelectionDAG &DAG, const SDLoc &dl) {
4048 APInt Undefs = APInt::getZero(Bits.size());
4049 return getConstVector(Bits, Undefs, VT, DAG, dl);
4050}
4051
4052/// Returns a vector of specified type with all zero elements.
4053static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4054 SelectionDAG &DAG, const SDLoc &dl) {
4055 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4056 VT.getVectorElementType() == MVT::i1) &&
4057 "Unexpected vector type");
4058
4059 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4060 // type. This ensures they get CSE'd. But if the integer type is not
4061 // available, use a floating-point +0.0 instead.
4062 SDValue Vec;
4063 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4064 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4065 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4066 } else if (VT.isFloatingPoint() &&
4067              TLI.isTypeLegal(VT.getVectorElementType())) {
4068    Vec = DAG.getConstantFP(+0.0, dl, VT);
4069 } else if (VT.getVectorElementType() == MVT::i1) {
4070 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4071 "Unexpected vector type");
4072 Vec = DAG.getConstant(0, dl, VT);
4073 } else {
4074 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4075 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4076 }
4077 return DAG.getBitcast(VT, Vec);
4078}
4079
4080 // Helper to determine if the ops are all extracted subvectors that come from a
4081 // single source. If we allow commute they don't have to be in order (Lo/Hi).
4082static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4083 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4084 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4085 LHS.getValueType() != RHS.getValueType() ||
4086 LHS.getOperand(0) != RHS.getOperand(0))
4087 return SDValue();
4088
4089 SDValue Src = LHS.getOperand(0);
4090 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4091 return SDValue();
4092
4093 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4094 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4095 RHS.getConstantOperandAPInt(1) == NumElts) ||
4096 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4097 LHS.getConstantOperandAPInt(1) == NumElts))
4098 return Src;
4099
4100 return SDValue();
4101}
4102
4103static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4104 const SDLoc &dl, unsigned vectorWidth) {
4105 EVT VT = Vec.getValueType();
4106 EVT ElVT = VT.getVectorElementType();
4107 unsigned ResultNumElts =
4108 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4109 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4110
4111 assert(ResultVT.getSizeInBits() == vectorWidth &&
4112 "Illegal subvector extraction");
4113
4114 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4115 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4116 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4117
4118 // This is the index of the first element of the vectorWidth-bit chunk
4119 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4120 IdxVal &= ~(ElemsPerChunk - 1);
4121
4122 // If the input is a buildvector just emit a smaller one.
4123 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4124 return DAG.getBuildVector(ResultVT, dl,
4125 Vec->ops().slice(IdxVal, ElemsPerChunk));
4126
4127 // Check if we're extracting the upper undef of a widening pattern.
4128 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4129 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4130 isNullConstant(Vec.getOperand(2)))
4131 return DAG.getUNDEF(ResultVT);
4132
4133 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4134}
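// Worked example (illustrative): extracting 128 bits from a v8i32 source with
// IdxVal = 5 clears the low index bits (ElemsPerChunk = 4, so IdxVal becomes
// 4) and returns elements 4..7 as a v4i32.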
4135
4136/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4137/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4138/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4139/// instructions or a simple subregister reference. Idx is an index in the
4140/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4141/// lowering EXTRACT_VECTOR_ELT operations easier.
4142static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4143 SelectionDAG &DAG, const SDLoc &dl) {
4144  assert((Vec.getValueType().is256BitVector() ||
4145          Vec.getValueType().is512BitVector()) &&
4146 "Unexpected vector size!");
4147 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4148}
4149
4150/// Generate a DAG to grab 256-bits from a 512-bit vector.
4151static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4152 SelectionDAG &DAG, const SDLoc &dl) {
4153 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4154 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4155}
4156
4157static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4158 SelectionDAG &DAG, const SDLoc &dl,
4159 unsigned vectorWidth) {
4160 assert((vectorWidth == 128 || vectorWidth == 256) &&
4161 "Unsupported vector width");
4162  // Inserting UNDEF is a no-op, just return Result.
4163 if (Vec.isUndef())
4164 return Result;
4165
4166 // Insert the relevant vectorWidth bits.
4167 EVT VT = Vec.getValueType();
4168 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4169 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4170
4171 // This is the index of the first element of the vectorWidth-bit chunk
4172 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4173 IdxVal &= ~(ElemsPerChunk - 1);
4174 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4175}
4176
4177/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4178/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4179/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4180/// simple superregister reference. Idx is an index in the 128 bits
4181/// we want. It need not be aligned to a 128-bit boundary. That makes
4182/// lowering INSERT_VECTOR_ELT operations easier.
4183static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4184 SelectionDAG &DAG, const SDLoc &dl) {
4185 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4186 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4187}
4188
4189/// Widen a vector to a larger size with the same scalar type, with the new
4190/// elements either zero or undef.
4191static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4192 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4193 const SDLoc &dl) {
4194 EVT VecVT = Vec.getValueType();
4195  assert(VecVT.getSizeInBits() <= VT.getSizeInBits() &&
4196         VecVT.getScalarType() == VT.getScalarType() &&
4197 "Unsupported vector widening type");
4198 // If the upper 128-bits of a build vector are already undef/zero, then try to
4199 // widen from the lower 128-bits.
4200 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4201 unsigned NumSrcElts = VecVT.getVectorNumElements();
4202 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4203 if (all_of(Hi, [&](SDValue V) {
4204 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4205 }))
4206 Vec = extract128BitVector(Vec, 0, DAG, dl);
4207 }
4208 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4209 : DAG.getUNDEF(VT);
4210 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4211}
4212
4213/// Widen a vector to a larger size with the same scalar type, with the new
4214/// elements either zero or undef.
4215static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4216 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4217 const SDLoc &dl, unsigned WideSizeInBits) {
4218 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4219 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4220 "Unsupported vector widening type");
4221 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4222 MVT SVT = Vec.getSimpleValueType().getScalarType();
4223 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4224 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4225}
4226
4227/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4228/// and bitcast with integer types.
4229static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4230 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4231 unsigned NumElts = VT.getVectorNumElements();
4232 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4233 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4234 return VT;
4235}
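// Worked example (illustrative): v4i1 widens to v8i1 when the subtarget has
// DQI (byte-granular k-register instructions) and to v16i1 otherwise, while
// v16i1 and wider mask types are returned unchanged.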
4236
4237/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4238/// bitcast with integer types.
4239static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4240 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4241 const SDLoc &dl) {
4242 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4243 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4244}
4245
4246// Helper function to collect subvector ops that are concatenated together,
4247// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4248// The subvectors in Ops are guaranteed to be the same type.
4249 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4250                              SelectionDAG &DAG) {
4251 assert(Ops.empty() && "Expected an empty ops vector");
4252
4253 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4254 Ops.append(N->op_begin(), N->op_end());
4255 return true;
4256 }
4257
4258 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4259 SDValue Src = N->getOperand(0);
4260 SDValue Sub = N->getOperand(1);
4261 const APInt &Idx = N->getConstantOperandAPInt(2);
4262 EVT VT = Src.getValueType();
4263 EVT SubVT = Sub.getValueType();
4264
4265 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4266 // insert_subvector(undef, x, lo)
4267 if (Idx == 0 && Src.isUndef()) {
4268 Ops.push_back(Sub);
4269 Ops.push_back(DAG.getUNDEF(SubVT));
4270 return true;
4271 }
4272 if (Idx == (VT.getVectorNumElements() / 2)) {
4273 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4274 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4275 Src.getOperand(1).getValueType() == SubVT &&
4276 isNullConstant(Src.getOperand(2))) {
4277 // Attempt to recurse into inner (matching) concats.
4278 SDValue Lo = Src.getOperand(1);
4279 SDValue Hi = Sub;
4280 SmallVector<SDValue, 2> LoOps, HiOps;
4281 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4282 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4283 LoOps.size() == HiOps.size()) {
4284 Ops.append(LoOps);
4285 Ops.append(HiOps);
4286 return true;
4287 }
4288 Ops.push_back(Lo);
4289 Ops.push_back(Hi);
4290 return true;
4291 }
4292 // insert_subvector(x, extract_subvector(x, lo), hi)
4293 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4294 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4295 Ops.append(2, Sub);
4296 return true;
4297 }
4298 // insert_subvector(undef, x, hi)
4299 if (Src.isUndef()) {
4300 Ops.push_back(DAG.getUNDEF(SubVT));
4301 Ops.push_back(Sub);
4302 return true;
4303 }
4304 }
4305 }
4306 }
4307
4308 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4309 EVT VT = N->getValueType(0);
4310 SDValue Src = N->getOperand(0);
4311 uint64_t Idx = N->getConstantOperandVal(1);
4312
4313 // Collect all the subvectors from the source vector and slice off the
4314 // extraction.
4315    SmallVector<SDValue, 4> SrcOps;
4316    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4317 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4318 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4319 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4320 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4321 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4322 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4323 return true;
4324 }
4325 }
4326
4327 assert(Ops.empty() && "Expected an empty ops vector");
4328 return false;
4329}
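// Worked example (illustrative): for
//   N = insert_subvector(insert_subvector(undef, X, 0), Y, NumElts/2)
// with X and Y of the same subvector type, Ops becomes {X, Y}; for
//   N = insert_subvector(undef, X, 0)
// it becomes {X, undef}.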
4330
4331// Helper to check if \p V can be split into subvectors and the upper subvectors
4332 // are all undef, in which case return the lower subvector.
4333 static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4334                                      SelectionDAG &DAG) {
4335 SmallVector<SDValue> SubOps;
4336 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4337 return SDValue();
4338
4339 unsigned NumSubOps = SubOps.size();
4340 unsigned HalfNumSubOps = NumSubOps / 2;
4341 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4342
4343 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4344 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4345 return SDValue();
4346
4347 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4348 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4349 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4350}
4351
4352// Helper to check if we can access all the constituent subvectors without any
4353// extract ops.
4354 static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4355  SmallVector<SDValue> Ops;
4356  return collectConcatOps(V.getNode(), Ops, DAG);
4357}
4358
4359static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4360 const SDLoc &dl) {
4361 EVT VT = Op.getValueType();
4362 unsigned NumElems = VT.getVectorNumElements();
4363 unsigned SizeInBits = VT.getSizeInBits();
4364 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4365 "Can't split odd sized vector");
4366
4367  SmallVector<SDValue, 4> SubOps;
4368  if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4369 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4370 unsigned HalfOps = SubOps.size() / 2;
4371 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4372 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4373 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4374 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4375 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4376 return std::make_pair(Lo, Hi);
4377 }
4378
4379 // If this is a splat value (with no-undefs) then use the lower subvector,
4380 // which should be a free extraction.
4381 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4382 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4383 return std::make_pair(Lo, Lo);
4384
4385 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4386 return std::make_pair(Lo, Hi);
4387}
4388
4389/// Break an operation into 2 half sized ops and then concatenate the results.
4390 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4391  unsigned NumOps = Op.getNumOperands();
4392 EVT VT = Op.getValueType();
4393
4394 // Extract the LHS Lo/Hi vectors
4395  SmallVector<SDValue, 4> LoOps(NumOps, SDValue());
4396  SmallVector<SDValue, 4> HiOps(NumOps, SDValue());
4397  for (unsigned I = 0; I != NumOps; ++I) {
4398 SDValue SrcOp = Op.getOperand(I);
4399 if (!SrcOp.getValueType().isVector()) {
4400 LoOps[I] = HiOps[I] = SrcOp;
4401 continue;
4402 }
4403 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4404 }
4405
4406 EVT LoVT, HiVT;
4407 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4408 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4409 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4410 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4411}
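// Worked example (illustrative): splitting a v8i32 ADD this way yields two
// v4i32 ADDs on the lo/hi halves of each operand joined by CONCAT_VECTORS;
// non-vector operands (e.g. a scalar shift amount) are passed unchanged to
// both halves.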
4412
4413/// Break an unary integer operation into 2 half sized ops and then
4414/// concatenate the result back.
4415 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4416                                    const SDLoc &dl) {
4417 // Make sure we only try to split 256/512-bit types to avoid creating
4418 // narrow vectors.
4419 [[maybe_unused]] EVT VT = Op.getValueType();
4420 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4421 Op.getOperand(0).getValueType().is512BitVector()) &&
4422 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4423 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4424 VT.getVectorNumElements() &&
4425 "Unexpected VTs!");
4426 return splitVectorOp(Op, DAG, dl);
4427}
4428
4429/// Break a binary integer operation into 2 half sized ops and then
4430/// concatenate the result back.
4431 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4432                                     const SDLoc &dl) {
4433 // Assert that all the types match.
4434 [[maybe_unused]] EVT VT = Op.getValueType();
4435 assert(Op.getOperand(0).getValueType() == VT &&
4436 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4437 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4438 return splitVectorOp(Op, DAG, dl);
4439}
4440
4441// Helper for splitting operands of an operation to legal target size and
4442// apply a function on each part.
4443// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4444// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4445// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4446// The argument Builder is a function that will be applied on each split part:
4447// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4448template <typename F>
4449 static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4450                                 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4451 F Builder, bool CheckBWI = true,
4452 bool AllowAVX512 = true) {
4453 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4454 unsigned NumSubs = 1;
4455 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4456 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4457 if (VT.getSizeInBits() > 512) {
4458 NumSubs = VT.getSizeInBits() / 512;
4459 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4460 }
4461 } else if (Subtarget.hasAVX2()) {
4462 if (VT.getSizeInBits() > 256) {
4463 NumSubs = VT.getSizeInBits() / 256;
4464 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4465 }
4466 } else {
4467 if (VT.getSizeInBits() > 128) {
4468 NumSubs = VT.getSizeInBits() / 128;
4469 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4470 }
4471 }
4472
4473 if (NumSubs == 1)
4474 return Builder(DAG, DL, Ops);
4475
4476  SmallVector<SDValue, 4> Subs;
4477  for (unsigned i = 0; i != NumSubs; ++i) {
4478    SmallVector<SDValue, 2> SubOps;
4479    for (SDValue Op : Ops) {
4480 EVT OpVT = Op.getValueType();
4481 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4482 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4483 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4484 }
4485 Subs.push_back(Builder(DAG, DL, SubOps));
4486 }
4487 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4488}
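// Usage sketch (hypothetical caller, assuming a same-typed result and
// operands; not taken from the original source):
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R =
//       SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS}, AddBuilder);
// Each call of the builder receives the matching subvector slice of every op.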
4489
4490// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4491// targets.
4492static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4493                              ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4494                              const X86Subtarget &Subtarget) {
4495 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4496 MVT SVT = VT.getScalarType();
4497
4498 // If we have a 32/64 splatted constant, splat it to DstTy to
4499 // encourage a foldable broadcast'd operand.
4500 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4501 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4502 // AVX512 broadcasts 32/64-bit operands.
4503 // TODO: Support float once getAVX512Node is used by fp-ops.
4504 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4506 return SDValue();
4507 // If we're not widening, don't bother if we're not bitcasting.
4508 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4509 return SDValue();
4510    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4511      APInt SplatValue, SplatUndef;
4512 unsigned SplatBitSize;
4513 bool HasAnyUndefs;
4514 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4515 HasAnyUndefs, OpEltSizeInBits) &&
4516 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4517 return DAG.getConstant(SplatValue, DL, DstVT);
4518 }
4519 return SDValue();
4520 };
4521
4522 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4523
4524 MVT DstVT = VT;
4525 if (Widen)
4526 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4527
4528 // Canonicalize src operands.
4529 SmallVector<SDValue> SrcOps(Ops);
4530 for (SDValue &Op : SrcOps) {
4531 MVT OpVT = Op.getSimpleValueType();
4532 // Just pass through scalar operands.
4533 if (!OpVT.isVector())
4534 continue;
4535 assert(OpVT == VT && "Vector type mismatch");
4536
4537 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4538 Op = BroadcastOp;
4539 continue;
4540 }
4541
4542 // Just widen the subvector by inserting into an undef wide vector.
4543 if (Widen)
4544 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4545 }
4546
4547 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4548
4549 // Perform the 512-bit op then extract the bottom subvector.
4550 if (Widen)
4551 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4552 return Res;
4553}
4554
4555/// Insert i1-subvector to i1-vector.
4556 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4557                                 const X86Subtarget &Subtarget) {
4558
4559 SDLoc dl(Op);
4560 SDValue Vec = Op.getOperand(0);
4561 SDValue SubVec = Op.getOperand(1);
4562 SDValue Idx = Op.getOperand(2);
4563 unsigned IdxVal = Op.getConstantOperandVal(2);
4564
4565 // Inserting undef is a nop. We can just return the original vector.
4566 if (SubVec.isUndef())
4567 return Vec;
4568
4569 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4570 return Op;
4571
4572 MVT OpVT = Op.getSimpleValueType();
4573 unsigned NumElems = OpVT.getVectorNumElements();
4574 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4575
4576 // Extend to natively supported kshift.
4577 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4578
4579 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4580 // if necessary.
4581 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4582 // May need to promote to a legal type.
4583 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4584 DAG.getConstant(0, dl, WideOpVT),
4585 SubVec, Idx);
4586 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4587 }
4588
4589 MVT SubVecVT = SubVec.getSimpleValueType();
4590 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4591 assert(IdxVal + SubVecNumElems <= NumElems &&
4592 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4593 "Unexpected index value in INSERT_SUBVECTOR");
4594
4595 SDValue Undef = DAG.getUNDEF(WideOpVT);
4596
4597 if (IdxVal == 0) {
4598 // Zero lower bits of the Vec
4599 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4600 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4601 ZeroIdx);
4602 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4603 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4604 // Merge them together, SubVec should be zero extended.
4605 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4606 DAG.getConstant(0, dl, WideOpVT),
4607 SubVec, ZeroIdx);
4608 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4609 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4610 }
4611
4612 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4613 Undef, SubVec, ZeroIdx);
4614
4615 if (Vec.isUndef()) {
4616 assert(IdxVal != 0 && "Unexpected index");
4617 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4618 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4619 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4620 }
4621
4622  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4623    assert(IdxVal != 0 && "Unexpected index");
4624 // If upper elements of Vec are known undef, then just shift into place.
4625 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4626 [](SDValue V) { return V.isUndef(); })) {
4627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4628 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4629 } else {
4630 NumElems = WideOpVT.getVectorNumElements();
4631 unsigned ShiftLeft = NumElems - SubVecNumElems;
4632 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4633 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4634 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4635 if (ShiftRight != 0)
4636 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4637 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4638 }
4639 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4640 }
4641
4642 // Simple case when we put subvector in the upper part
4643 if (IdxVal + SubVecNumElems == NumElems) {
4644 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4645 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4646 if (SubVecNumElems * 2 == NumElems) {
4647 // Special case, use legal zero extending insert_subvector. This allows
4648 // isel to optimize when bits are known zero.
4649 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4650 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4651 DAG.getConstant(0, dl, WideOpVT),
4652 Vec, ZeroIdx);
4653 } else {
4654 // Otherwise use explicit shifts to zero the bits.
4655 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4656 Undef, Vec, ZeroIdx);
4657 NumElems = WideOpVT.getVectorNumElements();
4658 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4659 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4660 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4661 }
4662 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4663 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4664 }
4665
4666 // Inserting into the middle is more complicated.
4667
4668 NumElems = WideOpVT.getVectorNumElements();
4669
4670 // Widen the vector if needed.
4671 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4672
4673 unsigned ShiftLeft = NumElems - SubVecNumElems;
4674 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4675
4676 // Do an optimization for the most frequently used types.
4677 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4678 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4679 Mask0.flipAllBits();
4680 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4681 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4682 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4683 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4684 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4685 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4686 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4687 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4688
4689 // Reduce to original width if needed.
4690 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4691 }
4692
4693 // Clear the upper bits of the subvector and move it to its insert position.
4694 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4695 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4696 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4697 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4698
4699 // Isolate the bits below the insertion point.
4700 unsigned LowShift = NumElems - IdxVal;
4701 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4702 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4703 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4704 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4705
4706 // Isolate the bits after the last inserted bit.
4707 unsigned HighShift = IdxVal + SubVecNumElems;
4708 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4709 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4710 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4711 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4712
4713 // Now OR all 3 pieces together.
4714 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4715 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4716
4717 // Reduce to original width if needed.
4718 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4719}
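// Worked example (illustrative): when the mask-constant fast path above is not
// used (v64i1 on a 32-bit target), a middle insertion ORs three pieces: the
// bits of Vec below the insertion point, the bits of Vec above the inserted
// range, and SubVec shifted into position.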
4720
4721 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4722                                 const SDLoc &dl) {
4723 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4724 EVT SubVT = V1.getValueType();
4725 EVT SubSVT = SubVT.getScalarType();
4726 unsigned SubNumElts = SubVT.getVectorNumElements();
4727 unsigned SubVectorWidth = SubVT.getSizeInBits();
4728 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4729 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4730 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4731}
4732
4733/// Returns a vector of specified type with all bits set.
4734/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4735/// Then bitcast to their original type, ensuring they get CSE'd.
4736static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4737 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4738 "Expected a 128/256/512-bit vector type");
4739 unsigned NumElts = VT.getSizeInBits() / 32;
4740 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4741 return DAG.getBitcast(VT, Vec);
4742}
4743
4744static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4745 SDValue In, SelectionDAG &DAG) {
4746 EVT InVT = In.getValueType();
4747 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4748
4749 // Canonicalize Opcode to general extension version.
4750 switch (Opcode) {
4751 case ISD::ANY_EXTEND:
4752  case ISD::ANY_EXTEND_VECTOR_INREG:
4753    Opcode = ISD::ANY_EXTEND;
4754 break;
4755 case ISD::SIGN_EXTEND:
4756  case ISD::SIGN_EXTEND_VECTOR_INREG:
4757    Opcode = ISD::SIGN_EXTEND;
4758 break;
4759 case ISD::ZERO_EXTEND:
4760  case ISD::ZERO_EXTEND_VECTOR_INREG:
4761    Opcode = ISD::ZERO_EXTEND;
4762 break;
4763 default:
4764 llvm_unreachable("Unknown extension opcode");
4765 }
4766
4767 // For 256-bit vectors, we only need the lower (128-bit) input half.
4768 // For 512-bit vectors, we only need the lower input half or quarter.
4769 if (InVT.getSizeInBits() > 128) {
4770 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4771 "Expected VTs to be the same size!");
4772 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4773 In = extractSubVector(In, 0, DAG, DL,
4774 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4775 InVT = In.getValueType();
4776 }
4777
4778 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4779 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4780
4781 return DAG.getNode(Opcode, DL, VT, In);
4782}
4783
4784// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4785 static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4786                             SDValue Mask, SelectionDAG &DAG) {
4787 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4788 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4789 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4790}
4791
4792 static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4793                                     bool Lo, bool Unary) {
4794 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4795 "Illegal vector type to unpack");
4796 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4797 int NumElts = VT.getVectorNumElements();
4798 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4799 for (int i = 0; i < NumElts; ++i) {
4800 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4801 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4802 Pos += (Unary ? 0 : NumElts * (i % 2));
4803 Pos += (Lo ? 0 : NumEltsInLane / 2);
4804 Mask.push_back(Pos);
4805 }
4806}
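// Worked example (illustrative): for v8i16 with Lo=true and Unary=false this
// builds <0, 8, 1, 9, 2, 10, 3, 11> (PUNPCKLWD); for v8i32 under AVX it
// builds the per-128-bit-lane mask <0, 8, 1, 9, 4, 12, 5, 13>.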
4807
4808/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4809/// imposed by AVX and specific to the unary pattern. Example:
4810/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4811/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4812 static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4813                                     bool Lo) {
4814 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4815 int NumElts = VT.getVectorNumElements();
4816 for (int i = 0; i < NumElts; ++i) {
4817 int Pos = i / 2;
4818 Pos += (Lo ? 0 : NumElts / 2);
4819 Mask.push_back(Pos);
4820 }
4821}
4822
4823// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4824static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4825 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4828 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4829 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4830 int M = Mask[I];
4831 if (M < 0)
4832 continue;
4833 SDValue V = (M < NumElts) ? V1 : V2;
4834 if (V.isUndef())
4835 continue;
4836 Ops[I] = V.getOperand(M % NumElts);
4837 }
4838 return DAG.getBuildVector(VT, dl, Ops);
4839 }
4840
4841 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4842}
4843
4844/// Returns a vector_shuffle node for an unpackl operation.
4845static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4846 SDValue V1, SDValue V2) {
4847  SmallVector<int, 8> Mask;
4848  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4849 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4850}
4851
4852/// Returns a vector_shuffle node for an unpackh operation.
4853static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4854 SDValue V1, SDValue V2) {
4855  SmallVector<int, 8> Mask;
4856  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4857 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4858}
4859
4860/// Returns a node that packs the LHS + RHS nodes together at half width.
4861/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4862/// TODO: Add subvector splitting if/when we have a need for it.
4863static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4864 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4865 bool PackHiHalf = false) {
4866 MVT OpVT = LHS.getSimpleValueType();
4867 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4868 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4869 assert(OpVT == RHS.getSimpleValueType() &&
4870 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4871 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4872 "Unexpected PACK operand types");
4873 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4874 "Unexpected PACK result type");
4875
4876 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4877 if (EltSizeInBits == 32) {
4878 SmallVector<int> PackMask;
4879 int Offset = PackHiHalf ? 1 : 0;
4880 int NumElts = VT.getVectorNumElements();
4881 for (int I = 0; I != NumElts; I += 4) {
4882 PackMask.push_back(I + Offset);
4883 PackMask.push_back(I + Offset + 2);
4884 PackMask.push_back(I + Offset + NumElts);
4885 PackMask.push_back(I + Offset + NumElts + 2);
4886 }
4887 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4888 DAG.getBitcast(VT, RHS), PackMask);
4889 }
4890
4891 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4892 if (!PackHiHalf) {
4893 if (UsePackUS &&
4894 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4895 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4896 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4897
4898 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4899 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4900 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4901 }
4902
4903 // Fallback to sign/zero extending the requested half and pack.
4904 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4905 if (UsePackUS) {
4906 if (PackHiHalf) {
4907 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4908 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4909 } else {
4910 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4911 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4912 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4913 };
4914 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4915 };
4916
4917 if (!PackHiHalf) {
4918 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4919 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4920 }
4921 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4922 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4923 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4924}
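// Worked example (illustrative): packing two v8i32 inputs into v16i16 uses
// PACKUSDW (with SSE4.1) when the known bits of both inputs already fit in 16
// bits, PACKSSDW when the significant bits fit, and otherwise shifts or masks
// each input to the requested half before packing.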
4925
4926/// Return a vector_shuffle of the specified vector of zero or undef vector.
4927/// This produces a shuffle where the low element of V2 is swizzled into the
4928/// zero/undef vector, landing at element Idx.
4929/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4930 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4931                                            bool IsZero,
4932 const X86Subtarget &Subtarget,
4933 SelectionDAG &DAG) {
4934 MVT VT = V2.getSimpleValueType();
4935 SDValue V1 = IsZero
4936 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4937 int NumElems = VT.getVectorNumElements();
4938 SmallVector<int, 16> MaskVec(NumElems);
4939 for (int i = 0; i != NumElems; ++i)
4940 // If this is the insertion idx, put the low elt of V2 here.
4941 MaskVec[i] = (i == Idx) ? NumElems : i;
4942 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4943}
4944
4945 static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4946  if (Ptr.getOpcode() == X86ISD::Wrapper ||
4947 Ptr.getOpcode() == X86ISD::WrapperRIP)
4948 Ptr = Ptr.getOperand(0);
4949  return dyn_cast<ConstantPoolSDNode>(Ptr);
4950}
4951
4952// TODO: Add support for non-zero offsets.
4953 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4954  auto *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4955  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4956 return nullptr;
4957 return CNode->getConstVal();
4958}
4959
4960 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4961  if (!Load || !ISD::isNormalLoad(Load))
4962 return nullptr;
4963 return getTargetConstantFromBasePtr(Load->getBasePtr());
4964}
4965
4966 static const Constant *getTargetConstantFromNode(SDValue Op) {
4967  Op = peekThroughBitcasts(Op);
4968  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4969 }
4970 
4971const Constant *
4972 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4973  assert(LD && "Unexpected null LoadSDNode");
4974 return getTargetConstantFromNode(LD);
4975}
4976
4978  // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X)
4979 SDValue Cond = N->getOperand(0);
4980 SDValue RHS = N->getOperand(2);
4981 EVT CondVT = Cond.getValueType();
4982 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4983 CondVT.getVectorElementType() == MVT::i1 &&
4984 ISD::isBuildVectorAllZeros(RHS.getNode());
4985}
4986
4987// Extract raw constant bits from constant pools.
4988static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4989 APInt &UndefElts,
4990 SmallVectorImpl<APInt> &EltBits,
4991 bool AllowWholeUndefs = true,
4992 bool AllowPartialUndefs = false) {
4993 assert(EltBits.empty() && "Expected an empty EltBits vector");
4994
4995  Op = peekThroughBitcasts(Op);
4996 
4997 EVT VT = Op.getValueType();
4998 unsigned SizeInBits = VT.getSizeInBits();
4999 unsigned NumElts = SizeInBits / EltSizeInBits;
5000
5001 // Can't split constant.
5002 if ((SizeInBits % EltSizeInBits) != 0)
5003 return false;
5004
5005 // Bitcast a source array of element bits to the target size.
5006 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5007 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5008 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5009 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5010 "Constant bit sizes don't match");
5011
5012 // Don't split if we don't allow undef bits.
5013 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5014 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5015 return false;
5016
5017 // If we're already the right size, don't bother bitcasting.
5018 if (NumSrcElts == NumElts) {
5019 UndefElts = UndefSrcElts;
5020 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5021 return true;
5022 }
5023
5024 // Extract all the undef/constant element data and pack into single bitsets.
5025 APInt UndefBits(SizeInBits, 0);
5026 APInt MaskBits(SizeInBits, 0);
5027
5028 for (unsigned i = 0; i != NumSrcElts; ++i) {
5029 unsigned BitOffset = i * SrcEltSizeInBits;
5030 if (UndefSrcElts[i])
5031 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5032 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5033 }
5034
5035 // Split the undef/constant single bitset data into the target elements.
5036 UndefElts = APInt(NumElts, 0);
5037 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5038
5039 for (unsigned i = 0; i != NumElts; ++i) {
5040 unsigned BitOffset = i * EltSizeInBits;
5041 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5042
5043 // Only treat an element as UNDEF if all bits are UNDEF.
5044 if (UndefEltBits.isAllOnes()) {
5045 if (!AllowWholeUndefs)
5046 return false;
5047 UndefElts.setBit(i);
5048 continue;
5049 }
5050
5051 // If only some bits are UNDEF then treat them as zero (or bail if not
5052 // supported).
5053 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5054 return false;
5055
5056 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5057 }
5058 return true;
5059 };
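// Worked example (illustrative): requesting EltSizeInBits = 32 from a v16i8
// constant packs each group of four source bytes into one 32-bit element; an
// element is reported undef only if every covered source bit is undef, and
// partial undefs are either treated as zero or rejected per the flags above.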
5060
5061 // Collect constant bits and insert into mask/undef bit masks.
5062 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5063 unsigned UndefBitIndex) {
5064 if (!Cst)
5065 return false;
5066 if (isa<UndefValue>(Cst)) {
5067 Undefs.setBit(UndefBitIndex);
5068 return true;
5069 }
5070 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5071 Mask = CInt->getValue();
5072 return true;
5073 }
5074 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5075 Mask = CFP->getValueAPF().bitcastToAPInt();
5076 return true;
5077 }
5078 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5079 Type *Ty = CDS->getType();
5080 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5081 Type *EltTy = CDS->getElementType();
5082 bool IsInteger = EltTy->isIntegerTy();
5083 bool IsFP =
5084 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5085 if (!IsInteger && !IsFP)
5086 return false;
5087 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5088 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5089 if (IsInteger)
5090 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5091 else
5092 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5093 I * EltBits);
5094 return true;
5095 }
5096 return false;
5097 };
5098
5099 // Handle UNDEFs.
5100 if (Op.isUndef()) {
5101 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5102 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5103 return CastBitData(UndefSrcElts, SrcEltBits);
5104 }
5105
5106 // Extract scalar constant bits.
5107 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5108 APInt UndefSrcElts = APInt::getZero(1);
5109 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5110 return CastBitData(UndefSrcElts, SrcEltBits);
5111 }
5112 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5113 APInt UndefSrcElts = APInt::getZero(1);
5114 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5115 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5116 return CastBitData(UndefSrcElts, SrcEltBits);
5117 }
5118
5119 // Extract constant bits from build vector.
5120 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5121 BitVector Undefs;
5122 SmallVector<APInt> SrcEltBits;
5123 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5124 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5125 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5126 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5127 if (Undefs[I])
5128 UndefSrcElts.setBit(I);
5129 return CastBitData(UndefSrcElts, SrcEltBits);
5130 }
5131 }
5132
5133 // Extract constant bits from constant pool vector.
5134 if (auto *Cst = getTargetConstantFromNode(Op)) {
5135 Type *CstTy = Cst->getType();
5136 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5137 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5138 return false;
5139
5140 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5141 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5142 if ((SizeInBits % SrcEltSizeInBits) != 0)
5143 return false;
5144
5145 APInt UndefSrcElts(NumSrcElts, 0);
5146 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5147 for (unsigned i = 0; i != NumSrcElts; ++i)
5148 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5149 UndefSrcElts, i))
5150 return false;
5151
5152 return CastBitData(UndefSrcElts, SrcEltBits);
5153 }
5154
5155 // Extract constant bits from a broadcasted constant pool scalar.
5156 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5157 EltSizeInBits <= VT.getScalarSizeInBits()) {
5158 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5159 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5160 return false;
5161
5162 SDValue Ptr = MemIntr->getBasePtr();
5163    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5164      unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5165 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5166
5167 APInt UndefSrcElts(NumSrcElts, 0);
5168 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5169 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5170 if (UndefSrcElts[0])
5171 UndefSrcElts.setBits(0, NumSrcElts);
5172 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5173 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5174 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5175 return CastBitData(UndefSrcElts, SrcEltBits);
5176 }
5177 }
5178 }
5179
5180 // Extract constant bits from a subvector broadcast.
5181 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5182 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5183 SDValue Ptr = MemIntr->getBasePtr();
5184 // The source constant may be larger than the subvector broadcast,
5185 // ensure we extract the correct subvector constants.
5186 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5187 Type *CstTy = Cst->getType();
5188 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5189 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5190 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5191 (SizeInBits % SubVecSizeInBits) != 0)
5192 return false;
5193 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5194 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5195 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5196 APInt UndefSubElts(NumSubElts, 0);
5197 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5198 APInt(CstEltSizeInBits, 0));
5199 for (unsigned i = 0; i != NumSubElts; ++i) {
5200 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5201 UndefSubElts, i))
5202 return false;
5203 for (unsigned j = 1; j != NumSubVecs; ++j)
5204 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5205 }
5206 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5207 UndefSubElts);
5208 return CastBitData(UndefSubElts, SubEltBits);
5209 }
5210 }
5211
5212 // Extract a rematerialized scalar constant insertion.
5213 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5214 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5215 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5216 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5217 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5218
5219 APInt UndefSrcElts(NumSrcElts, 0);
5220 SmallVector<APInt, 64> SrcEltBits;
5221 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5222 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5223 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5224 return CastBitData(UndefSrcElts, SrcEltBits);
5225 }
5226
5227 // Insert constant bits from a base and sub vector sources.
5228 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5229    // If we're bitcasting to larger elements we might lose track of undefs, so
5230    // to be safe don't allow any undefs at all.
5231 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5232 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5233
5234 APInt UndefSrcElts, UndefSubElts;
5235 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5236 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5237 UndefSubElts, EltSubBits,
5238 AllowWholeUndefs && AllowUndefs,
5239 AllowPartialUndefs && AllowUndefs) &&
5240 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5241 UndefSrcElts, EltSrcBits,
5242 AllowWholeUndefs && AllowUndefs,
5243 AllowPartialUndefs && AllowUndefs)) {
5244 unsigned BaseIdx = Op.getConstantOperandVal(2);
5245 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5246 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5247 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5248 return CastBitData(UndefSrcElts, EltSrcBits);
5249 }
5250 }
5251
5252 // Extract constant bits from a subvector's source.
5253 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5254 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5255 EltBits, AllowWholeUndefs,
5256 AllowPartialUndefs)) {
5257 EVT SrcVT = Op.getOperand(0).getValueType();
5258 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5259 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5260 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5261 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5262 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5263 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5264 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5265
5266 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5267 if ((BaseIdx + NumSubElts) != NumSrcElts)
5268 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5269 if (BaseIdx != 0)
5270 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5271 return true;
5272 }
5273
5274 // Extract constant bits from shuffle node sources.
5275 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5276 // TODO - support shuffle through bitcasts.
5277 if (EltSizeInBits != VT.getScalarSizeInBits())
5278 return false;
5279
5280 ArrayRef<int> Mask = SVN->getMask();
5281 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5282 llvm::any_of(Mask, [](int M) { return M < 0; }))
5283 return false;
5284
5285 APInt UndefElts0, UndefElts1;
5286 SmallVector<APInt, 32> EltBits0, EltBits1;
5287 if (isAnyInRange(Mask, 0, NumElts) &&
5288 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5289 UndefElts0, EltBits0, AllowWholeUndefs,
5290 AllowPartialUndefs))
5291 return false;
5292 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5293 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5294 UndefElts1, EltBits1, AllowWholeUndefs,
5295 AllowPartialUndefs))
5296 return false;
5297
5298 UndefElts = APInt::getZero(NumElts);
5299 for (int i = 0; i != (int)NumElts; ++i) {
5300 int M = Mask[i];
5301 if (M < 0) {
5302 UndefElts.setBit(i);
5303 EltBits.push_back(APInt::getZero(EltSizeInBits));
5304 } else if (M < (int)NumElts) {
5305 if (UndefElts0[M])
5306 UndefElts.setBit(i);
5307 EltBits.push_back(EltBits0[M]);
5308 } else {
5309 if (UndefElts1[M - NumElts])
5310 UndefElts.setBit(i);
5311 EltBits.push_back(EltBits1[M - NumElts]);
5312 }
5313 }
5314 return true;
5315 }
5316
5317 return false;
5318}
5319
5320namespace llvm {
5321namespace X86 {
5322bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5323 APInt UndefElts;
5324 SmallVector<APInt, 16> EltBits;
5325  if (getTargetConstantBitsFromNode(
5326          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5327 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5328 int SplatIndex = -1;
5329 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5330 if (UndefElts[i])
5331 continue;
5332 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5333 SplatIndex = -1;
5334 break;
5335 }
5336 SplatIndex = i;
5337 }
5338 if (0 <= SplatIndex) {
5339 SplatVal = EltBits[SplatIndex];
5340 return true;
5341 }
5342 }
5343
5344 return false;
5345}
5346
5347int getRoundingModeX86(unsigned RM) {
5348 switch (static_cast<::llvm::RoundingMode>(RM)) {
5349 // clang-format off
5350 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
5351 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
5352 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
5353 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
5354 default:
5355 return X86::rmInvalid; // Invalid rounding mode
5356 }
5357}
5358
5359} // namespace X86
5360} // namespace llvm
5361
5362 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5363                                         unsigned MaskEltSizeInBits,
5364                                         SmallVectorImpl<uint64_t> &RawMask,
5365                                         APInt &UndefElts) {
5366 // Extract the raw target constant bits.
5367 SmallVector<APInt, 64> EltBits;
5368 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5369 EltBits, /* AllowWholeUndefs */ true,
5370 /* AllowPartialUndefs */ false))
5371 return false;
5372
5373 // Insert the extracted elements into the mask.
5374 for (const APInt &Elt : EltBits)
5375 RawMask.push_back(Elt.getZExtValue());
5376
5377 return true;
5378}
5379
5380 static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5381 bool AllowUndefs) {
5382 APInt UndefElts;
5383 SmallVector<APInt, 64> EltBits;
5384 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5385 /*AllowWholeUndefs*/ AllowUndefs,
5386 /*AllowPartialUndefs*/ false))
5387 return false;
5388
5389 bool IsPow2OrUndef = true;
5390 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5391 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5392 return IsPow2OrUndef;
5393}
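// For example, with EltSizeInBits = 32, a v4i32 constant <1, 2, 4, 8> is
// accepted (every element is a power of two), <1, 2, 3, 8> is rejected, and
// <1, undef, 4, 8> is accepted only when AllowUndefs is true.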
5394
5395// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5396 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5397 // TODO: don't always ignore oneuse constraints.
5398 V = peekThroughBitcasts(V);
5399 EVT VT = V.getValueType();
5400
5401 // Match not(xor X, -1) -> X.
5402 if (V.getOpcode() == ISD::XOR &&
5403 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5404 isAllOnesConstant(V.getOperand(1))))
5405 return V.getOperand(0);
5406
5407 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5408 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5409 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5410 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5411 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5412 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5413 V.getOperand(1));
5414 }
5415 }
5416
5417 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5418 if (V.getOpcode() == X86ISD::PCMPGT &&
5419 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5420 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5421 V.getOperand(0).hasOneUse()) {
5422 APInt UndefElts;
5423 SmallVector<APInt> EltBits;
5424 if (getTargetConstantBitsFromNode(V.getOperand(0),
5425 V.getScalarValueSizeInBits(), UndefElts,
5426 EltBits) &&
5427 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5428 // Don't fold min_signed_value -> (min_signed_value - 1)
5429 bool MinSigned = false;
5430 for (APInt &Elt : EltBits) {
5431 MinSigned |= Elt.isMinSignedValue();
5432 Elt -= 1;
5433 }
5434 if (!MinSigned) {
5435 SDLoc DL(V);
5436 MVT VT = V.getSimpleValueType();
5437 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5438 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5439 }
5440 }
5441 }
5442
5443 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5444 SmallVector<SDValue, 4> CatOps;
5445 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5446 for (SDValue &CatOp : CatOps) {
5447 SDValue NotCat = IsNOT(CatOp, DAG);
5448 if (!NotCat)
5449 return SDValue();
5450 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5451 }
5452 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5453 }
5454
5455 // Match not(or(not(X),not(Y))) -> and(X, Y).
5456 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5457 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5458 // TODO: Handle cases with single NOT operand -> ANDNP
5459 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5460 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5461 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5462 DAG.getBitcast(VT, Op1));
5463 }
5464
5465 return SDValue();
5466}
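// For example, IsNOT(xor X, <-1,-1,-1,-1>) returns X directly, and
// IsNOT(pcmpgt(<5,5,5,5>, X)) returns pcmpgt(X, <4,4,4,4>), since
// not(C > X) == (X >= C) == (X > C - 1) for any constant C != INT_MIN.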
5467
5468/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5469/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5470/// Note: This ignores saturation, so inputs must be checked first.
5471 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5472 bool Unary, unsigned NumStages = 1) {
5473 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5474 unsigned NumElts = VT.getVectorNumElements();
5475 unsigned NumLanes = VT.getSizeInBits() / 128;
5476 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5477 unsigned Offset = Unary ? 0 : NumElts;
5478 unsigned Repetitions = 1u << (NumStages - 1);
5479 unsigned Increment = 1u << NumStages;
5480 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5481
5482 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5483 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5484 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5485 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5486 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5487 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5488 }
5489 }
5490}
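// For example, for VT = v16i8 with Unary = false and NumStages = 1 this
// produces <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>: the even bytes of
// the first operand followed by the even bytes of the second, matching a
// single 128-bit PACKSSWB/PACKUSWB truncation step.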
5491
5492// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5493static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5494 APInt &DemandedLHS, APInt &DemandedRHS) {
5495 int NumLanes = VT.getSizeInBits() / 128;
5496 int NumElts = DemandedElts.getBitWidth();
5497 int NumInnerElts = NumElts / 2;
5498 int NumEltsPerLane = NumElts / NumLanes;
5499 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5500
5501 DemandedLHS = APInt::getZero(NumInnerElts);
5502 DemandedRHS = APInt::getZero(NumInnerElts);
5503
5504 // Map DemandedElts to the packed operands.
5505 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5506 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5507 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5508 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5509 if (DemandedElts[OuterIdx])
5510 DemandedLHS.setBit(InnerIdx);
5511 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5512 DemandedRHS.setBit(InnerIdx);
5513 }
5514 }
5515}
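// For example, for a v16i8 pack result (one 128-bit lane), demanding result
// elements {0, 9} demands element 0 of the LHS operand and element 1 of the
// RHS operand, since the low half of each lane comes from the LHS and the
// high half from the RHS.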
5516
5517// Split the demanded elts of a HADD/HSUB node between its operands.
5518static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5519 APInt &DemandedLHS, APInt &DemandedRHS) {
5521 DemandedLHS, DemandedRHS);
5522 DemandedLHS |= DemandedLHS << 1;
5523 DemandedRHS |= DemandedRHS << 1;
5524}
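// For example, each result element of a v8i16 HADD sums an adjacent pair of
// source elements, so demanding result element 2 ends up demanding source
// elements 4 and 5 of the first operand.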
5525
5526/// Calculates the shuffle mask corresponding to the target-specific opcode.
5527/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5528/// operands in \p Ops, and returns true.
5529/// Sets \p IsUnary to true if only one source is used. Note that this will set
5530/// IsUnary for shuffles which use a single input multiple times, and in those
5531/// cases it will adjust the mask to only have indices within that single input.
5532/// It is an error to call this with non-empty Mask/Ops vectors.
5533static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5534 SmallVectorImpl<SDValue> &Ops,
5535 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5536 if (!isTargetShuffle(N.getOpcode()))
5537 return false;
5538
5539 MVT VT = N.getSimpleValueType();
5540 unsigned NumElems = VT.getVectorNumElements();
5541 unsigned MaskEltSize = VT.getScalarSizeInBits();
5542 SmallVector<uint64_t, 32> RawMask;
5543 APInt RawUndefs;
5544 uint64_t ImmN;
5545
5546 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5547 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5548
5549 IsUnary = false;
5550 bool IsFakeUnary = false;
5551 switch (N.getOpcode()) {
5552 case X86ISD::BLENDI:
5553 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5554 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5555 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5556 DecodeBLENDMask(NumElems, ImmN, Mask);
5557 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5558 break;
5559 case X86ISD::SHUFP:
5560 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5561 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5562 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5563 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5564 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5565 break;
5566 case X86ISD::INSERTPS:
5567 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5568 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5569 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5570 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5571 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5572 break;
5573 case X86ISD::EXTRQI:
5574 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5575 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5576 isa<ConstantSDNode>(N.getOperand(2))) {
5577 int BitLen = N.getConstantOperandVal(1);
5578 int BitIdx = N.getConstantOperandVal(2);
5579 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5580 IsUnary = true;
5581 }
5582 break;
5583 case X86ISD::INSERTQI:
5584 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5585 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5586 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5587 isa<ConstantSDNode>(N.getOperand(3))) {
5588 int BitLen = N.getConstantOperandVal(2);
5589 int BitIdx = N.getConstantOperandVal(3);
5590 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5591 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5592 }
5593 break;
5594 case X86ISD::UNPCKH:
5595 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5596 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5597 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5598 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5599 break;
5600 case X86ISD::UNPCKL:
5601 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5602 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5603 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5604 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5605 break;
5606 case X86ISD::MOVHLPS:
5607 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5608 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5609 DecodeMOVHLPSMask(NumElems, Mask);
5610 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5611 break;
5612 case X86ISD::MOVLHPS:
5613 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5614 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5615 DecodeMOVLHPSMask(NumElems, Mask);
5616 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5617 break;
5618 case X86ISD::VALIGN:
5619 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5620 "Only 32-bit and 64-bit elements are supported!");
5621 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5622 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5623 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5624 DecodeVALIGNMask(NumElems, ImmN, Mask);
5625 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5626 Ops.push_back(N.getOperand(1));
5627 Ops.push_back(N.getOperand(0));
5628 break;
5629 case X86ISD::PALIGNR:
5630 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5631 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5632 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5633 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5634 DecodePALIGNRMask(NumElems, ImmN, Mask);
5635 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5636 Ops.push_back(N.getOperand(1));
5637 Ops.push_back(N.getOperand(0));
5638 break;
5639 case X86ISD::VSHLDQ:
5640 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5641 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5642 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5643 DecodePSLLDQMask(NumElems, ImmN, Mask);
5644 IsUnary = true;
5645 break;
5646 case X86ISD::VSRLDQ:
5647 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5648 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5649 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5650 DecodePSRLDQMask(NumElems, ImmN, Mask);
5651 IsUnary = true;
5652 break;
5653 case X86ISD::PSHUFD:
5654 case X86ISD::VPERMILPI:
5655 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5656 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5657 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5658 IsUnary = true;
5659 break;
5660 case X86ISD::PSHUFHW:
5661 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5662 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5663 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5664 IsUnary = true;
5665 break;
5666 case X86ISD::PSHUFLW:
5667 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5668 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5669 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5670 IsUnary = true;
5671 break;
5672 case X86ISD::VZEXT_MOVL:
5673 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5674 DecodeZeroMoveLowMask(NumElems, Mask);
5675 IsUnary = true;
5676 break;
5677 case X86ISD::VBROADCAST:
5678 // We only decode broadcasts of same-sized vectors; peeking through to
5679 // extracted subvectors is likely to cause hasOneUse issues with
5680 // SimplifyDemandedBits etc.
5681 if (N.getOperand(0).getValueType() == VT) {
5682 DecodeVectorBroadcast(NumElems, Mask);
5683 IsUnary = true;
5684 break;
5685 }
5686 return false;
5687 case X86ISD::VPERMILPV: {
5688 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5689 IsUnary = true;
5690 SDValue MaskNode = N.getOperand(1);
5691 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5692 RawUndefs)) {
5693 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5694 break;
5695 }
5696 return false;
5697 }
5698 case X86ISD::PSHUFB: {
5699 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5700 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5701 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5702 IsUnary = true;
5703 SDValue MaskNode = N.getOperand(1);
5704 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5705 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5706 break;
5707 }
5708 return false;
5709 }
5710 case X86ISD::VPERMI:
5711 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5712 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5713 DecodeVPERMMask(NumElems, ImmN, Mask);
5714 IsUnary = true;
5715 break;
5716 case X86ISD::MOVSS:
5717 case X86ISD::MOVSD:
5718 case X86ISD::MOVSH:
5719 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5720 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5721 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5722 break;
5723 case X86ISD::VPERM2X128:
5724 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5725 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5726 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5727 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5728 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5729 break;
5730 case X86ISD::SHUF128:
5731 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5732 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5733 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5734 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5735 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5736 break;
5737 case X86ISD::MOVSLDUP:
5738 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5739 DecodeMOVSLDUPMask(NumElems, Mask);
5740 IsUnary = true;
5741 break;
5742 case X86ISD::MOVSHDUP:
5743 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5744 DecodeMOVSHDUPMask(NumElems, Mask);
5745 IsUnary = true;
5746 break;
5747 case X86ISD::MOVDDUP:
5748 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5749 DecodeMOVDDUPMask(NumElems, Mask);
5750 IsUnary = true;
5751 break;
5752 case X86ISD::VPERMIL2: {
5753 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5754 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5755 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5756 SDValue MaskNode = N.getOperand(2);
5757 SDValue CtrlNode = N.getOperand(3);
5758 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5759 unsigned CtrlImm = CtrlOp->getZExtValue();
5760 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5761 RawUndefs)) {
5762 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5763 Mask);
5764 break;
5765 }
5766 }
5767 return false;
5768 }
5769 case X86ISD::VPPERM: {
5770 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5771 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5772 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5773 SDValue MaskNode = N.getOperand(2);
5774 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5775 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5776 break;
5777 }
5778 return false;
5779 }
5780 case X86ISD::VPERMV: {
5781 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5782 IsUnary = true;
5783 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5784 Ops.push_back(N.getOperand(1));
5785 SDValue MaskNode = N.getOperand(0);
5786 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5787 RawUndefs)) {
5788 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5789 break;
5790 }
5791 return false;
5792 }
5793 case X86ISD::VPERMV3: {
5794 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5795 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5796 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5797 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5798 Ops.push_back(N.getOperand(0));
5799 Ops.push_back(N.getOperand(2));
5800 SDValue MaskNode = N.getOperand(1);
5801 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5802 RawUndefs)) {
5803 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5804 break;
5805 }
5806 return false;
5807 }
5808 default:
5809 llvm_unreachable("unknown target shuffle node");
5810 }
5811
5812 // Empty mask indicates the decode failed.
5813 if (Mask.empty())
5814 return false;
5815
5816 // Check if we're getting a shuffle mask with zero'd elements.
5817 if (!AllowSentinelZero && isAnyZero(Mask))
5818 return false;
5819
5820 // If we have a fake unary shuffle, the shuffle mask is spread across two
5821 // inputs that are actually the same node. Re-map the mask to always point
5822 // into the first input.
5823 if (IsFakeUnary)
5824 for (int &M : Mask)
5825 if (M >= (int)Mask.size())
5826 M -= Mask.size();
5827
5828 // If we didn't already add operands in the opcode-specific code, default to
5829 // adding 1 or 2 operands starting at 0.
5830 if (Ops.empty()) {
5831 Ops.push_back(N.getOperand(0));
5832 if (!IsUnary || IsFakeUnary)
5833 Ops.push_back(N.getOperand(1));
5834 }
5835
5836 return true;
5837}
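// For example, a v4i32 X86ISD::UNPCKL node decodes to the mask <0,4,1,5>
// with Ops = {Op0, Op1}, while a v4f32 X86ISD::MOVSLDUP decodes to the unary
// mask <0,0,2,2> with IsUnary = true.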
5838
5839 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5840static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5841 SmallVectorImpl<SDValue> &Ops,
5842 SmallVectorImpl<int> &Mask) {
5843 bool IsUnary;
5844 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5845}
5846
5847/// Compute whether each element of a shuffle is zeroable.
5848///
5849/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5850/// Either it is an undef element in the shuffle mask, the element of the input
5851/// referenced is undef, or the element of the input referenced is known to be
5852/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5853/// as many lanes with this technique as possible to simplify the remaining
5854/// shuffle.
5855 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5856 SDValue V1, SDValue V2,
5857 APInt &KnownUndef, APInt &KnownZero) {
5858 int Size = Mask.size();
5859 KnownUndef = KnownZero = APInt::getZero(Size);
5860
5861 V1 = peekThroughBitcasts(V1);
5862 V2 = peekThroughBitcasts(V2);
5863
5864 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5865 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5866
5867 int VectorSizeInBits = V1.getValueSizeInBits();
5868 int ScalarSizeInBits = VectorSizeInBits / Size;
5869 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5870
5871 for (int i = 0; i < Size; ++i) {
5872 int M = Mask[i];
5873 // Handle the easy cases.
5874 if (M < 0) {
5875 KnownUndef.setBit(i);
5876 continue;
5877 }
5878 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5879 KnownZero.setBit(i);
5880 continue;
5881 }
5882
5883 // Determine shuffle input and normalize the mask.
5884 SDValue V = M < Size ? V1 : V2;
5885 M %= Size;
5886
5887 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5888 if (V.getOpcode() != ISD::BUILD_VECTOR)
5889 continue;
5890
5891 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5892 // the (larger) source element must be UNDEF/ZERO.
5893 if ((Size % V.getNumOperands()) == 0) {
5894 int Scale = Size / V->getNumOperands();
5895 SDValue Op = V.getOperand(M / Scale);
5896 if (Op.isUndef())
5897 KnownUndef.setBit(i);
5898 if (X86::isZeroNode(Op))
5899 KnownZero.setBit(i);
5900 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5901 APInt Val = Cst->getAPIntValue();
5902 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5903 if (Val == 0)
5904 KnownZero.setBit(i);
5905 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5906 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5907 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5908 if (Val == 0)
5909 KnownZero.setBit(i);
5910 }
5911 continue;
5912 }
5913
5914 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5915 // elements must be UNDEF or ZERO.
5916 if ((V.getNumOperands() % Size) == 0) {
5917 int Scale = V->getNumOperands() / Size;
5918 bool AllUndef = true;
5919 bool AllZero = true;
5920 for (int j = 0; j < Scale; ++j) {
5921 SDValue Op = V.getOperand((M * Scale) + j);
5922 AllUndef &= Op.isUndef();
5923 AllZero &= X86::isZeroNode(Op);
5924 }
5925 if (AllUndef)
5926 KnownUndef.setBit(i);
5927 if (AllZero)
5928 KnownZero.setBit(i);
5929 continue;
5930 }
5931 }
5932}
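// For example, shuffling V1 = build_vector <0, x, undef, y> against an
// all-zeros V2 with mask <0, 2, 5, 7> marks element 0 as known-zero (constant
// zero source), element 1 as known-undef (undef source), and elements 2 and 3
// as known-zero (they read from the zero vector V2).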
5933
5934/// Decode a target shuffle mask and inputs and see if any values are
5935/// known to be undef or zero from their inputs.
5936/// Returns true if the target shuffle mask was decoded.
5937/// FIXME: Merge this with computeZeroableShuffleElements?
5938 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5939 SmallVectorImpl<SDValue> &Ops,
5940 APInt &KnownUndef, APInt &KnownZero) {
5941 bool IsUnary;
5942 if (!isTargetShuffle(N.getOpcode()))
5943 return false;
5944
5945 MVT VT = N.getSimpleValueType();
5946 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5947 return false;
5948
5949 int Size = Mask.size();
5950 SDValue V1 = Ops[0];
5951 SDValue V2 = IsUnary ? V1 : Ops[1];
5952 KnownUndef = KnownZero = APInt::getZero(Size);
5953
5954 V1 = peekThroughBitcasts(V1);
5955 V2 = peekThroughBitcasts(V2);
5956
5957 assert((VT.getSizeInBits() % Size) == 0 &&
5958 "Illegal split of shuffle value type");
5959 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5960
5961 // Extract known constant input data.
5962 APInt UndefSrcElts[2];
5963 SmallVector<APInt, 32> SrcEltBits[2];
5964 bool IsSrcConstant[2] = {
5965 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5966 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5967 /*AllowPartialUndefs*/ false),
5968 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5969 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5970 /*AllowPartialUndefs*/ false)};
5971
5972 for (int i = 0; i < Size; ++i) {
5973 int M = Mask[i];
5974
5975 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5976 if (M < 0) {
5977 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5978 if (SM_SentinelUndef == M)
5979 KnownUndef.setBit(i);
5980 if (SM_SentinelZero == M)
5981 KnownZero.setBit(i);
5982 continue;
5983 }
5984
5985 // Determine shuffle input and normalize the mask.
5986 unsigned SrcIdx = M / Size;
5987 SDValue V = M < Size ? V1 : V2;
5988 M %= Size;
5989
5990 // We are referencing an UNDEF input.
5991 if (V.isUndef()) {
5992 KnownUndef.setBit(i);
5993 continue;
5994 }
5995
5996 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5997 // TODO: We currently only set UNDEF for integer types - floats use the same
5998 // registers as vectors and many of the scalar folded loads rely on the
5999 // SCALAR_TO_VECTOR pattern.
6000 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6001 (Size % V.getValueType().getVectorNumElements()) == 0) {
6002 int Scale = Size / V.getValueType().getVectorNumElements();
6003 int Idx = M / Scale;
6004 if (Idx != 0 && !VT.isFloatingPoint())
6005 KnownUndef.setBit(i);
6006 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6007 KnownZero.setBit(i);
6008 continue;
6009 }
6010
6011 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6012 // base vectors.
6013 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6014 SDValue Vec = V.getOperand(0);
6015 int NumVecElts = Vec.getValueType().getVectorNumElements();
6016 if (Vec.isUndef() && Size == NumVecElts) {
6017 int Idx = V.getConstantOperandVal(2);
6018 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6019 if (M < Idx || (Idx + NumSubElts) <= M)
6020 KnownUndef.setBit(i);
6021 }
6022 continue;
6023 }
6024
6025 // Attempt to extract from the source's constant bits.
6026 if (IsSrcConstant[SrcIdx]) {
6027 if (UndefSrcElts[SrcIdx][M])
6028 KnownUndef.setBit(i);
6029 else if (SrcEltBits[SrcIdx][M] == 0)
6030 KnownZero.setBit(i);
6031 }
6032 }
6033
6034 assert(VT.getVectorNumElements() == (unsigned)Size &&
6035 "Different mask size from vector size!");
6036 return true;
6037}
6038
6039// Replace target shuffle mask elements with known undef/zero sentinels.
6040 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6041 const APInt &KnownUndef,
6042 const APInt &KnownZero,
6043 bool ResolveKnownZeros = true) {
6044 unsigned NumElts = Mask.size();
6045 assert(KnownUndef.getBitWidth() == NumElts &&
6046 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6047
6048 for (unsigned i = 0; i != NumElts; ++i) {
6049 if (KnownUndef[i])
6050 Mask[i] = SM_SentinelUndef;
6051 else if (ResolveKnownZeros && KnownZero[i])
6052 Mask[i] = SM_SentinelZero;
6053 }
6054}
6055
6056// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6057 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6058 APInt &KnownUndef,
6059 APInt &KnownZero) {
6060 unsigned NumElts = Mask.size();
6061 KnownUndef = KnownZero = APInt::getZero(NumElts);
6062
6063 for (unsigned i = 0; i != NumElts; ++i) {
6064 int M = Mask[i];
6065 if (SM_SentinelUndef == M)
6066 KnownUndef.setBit(i);
6067 if (SM_SentinelZero == M)
6068 KnownZero.setBit(i);
6069 }
6070}
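// For example, resolveTargetShuffleFromZeroables turns the mask <0, 1, 2, 3>
// with KnownZero bit 2 set into <0, 1, SM_SentinelZero, 3>, and
// resolveZeroablesFromTargetShuffle performs the reverse mapping from
// sentinels back into KnownUndef/KnownZero bitmasks.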
6071
6072// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6073 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6074 SDValue Cond, bool IsBLENDV = false) {
6075 EVT CondVT = Cond.getValueType();
6076 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6077 unsigned NumElts = CondVT.getVectorNumElements();
6078
6079 APInt UndefElts;
6080 SmallVector<APInt, 32> EltBits;
6081 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6082 /*AllowWholeUndefs*/ true,
6083 /*AllowPartialUndefs*/ false))
6084 return false;
6085
6086 Mask.resize(NumElts, SM_SentinelUndef);
6087
6088 for (int i = 0; i != (int)NumElts; ++i) {
6089 Mask[i] = i;
6090 // Arbitrarily choose from the 2nd operand if the select condition element
6091 // is undef.
6092 // TODO: Can we do better by matching patterns such as even/odd?
6093 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6094 (IsBLENDV && EltBits[i].isNonNegative()))
6095 Mask[i] += NumElts;
6096 }
6097
6098 return true;
6099}
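// For example, a v4i32 VSELECT whose constant condition is <-1, 0, -1, 0>
// yields the blend mask <0, 5, 2, 7>: all-ones lanes select from the first
// value operand, zero lanes select from the second.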
6100
6101// Forward declaration (for getFauxShuffleMask recursive check).
6102static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6103 SmallVectorImpl<SDValue> &Inputs,
6104 SmallVectorImpl<int> &Mask,
6105 const SelectionDAG &DAG, unsigned Depth,
6106 bool ResolveKnownElts);
6107
6108// Attempt to decode ops that could be represented as a shuffle mask.
6109// The decoded shuffle mask may contain a different number of elements to the
6110// destination value type.
6111// TODO: Merge into getTargetShuffleInputs()
6112static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6113 SmallVectorImpl<int> &Mask,
6114 SmallVectorImpl<SDValue> &Ops,
6115 const SelectionDAG &DAG, unsigned Depth,
6116 bool ResolveKnownElts) {
6117 Mask.clear();
6118 Ops.clear();
6119
6120 MVT VT = N.getSimpleValueType();
6121 unsigned NumElts = VT.getVectorNumElements();
6122 unsigned NumSizeInBits = VT.getSizeInBits();
6123 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6124 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6125 return false;
6126 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6127 unsigned NumSizeInBytes = NumSizeInBits / 8;
6128 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6129
6130 unsigned Opcode = N.getOpcode();
6131 switch (Opcode) {
6132 case ISD::VECTOR_SHUFFLE: {
6133 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6134 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6135 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6136 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6137 Ops.push_back(N.getOperand(0));
6138 Ops.push_back(N.getOperand(1));
6139 return true;
6140 }
6141 return false;
6142 }
6143 case ISD::AND:
6144 case X86ISD::ANDNP: {
6145 // Attempt to decode as a per-byte mask.
6146 APInt UndefElts;
6147 SmallVector<APInt, 32> EltBits;
6148 SDValue N0 = N.getOperand(0);
6149 SDValue N1 = N.getOperand(1);
6150 bool IsAndN = (X86ISD::ANDNP == Opcode);
6151 uint64_t ZeroMask = IsAndN ? 255 : 0;
6152 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6153 /*AllowWholeUndefs*/ false,
6154 /*AllowPartialUndefs*/ false))
6155 return false;
6156 // We can't assume an undef src element gives an undef dst - the other src
6157 // might be zero.
6158 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6159 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6160 const APInt &ByteBits = EltBits[i];
6161 if (ByteBits != 0 && ByteBits != 255)
6162 return false;
6163 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6164 }
6165 Ops.push_back(IsAndN ? N1 : N0);
6166 return true;
6167 }
6168 case ISD::OR: {
6169 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6170 // is a valid shuffle index.
6171 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6172 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6173 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6174 return false;
6175
6176 SmallVector<int, 64> SrcMask0, SrcMask1;
6177 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6178 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6179 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6180 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6181 Depth + 1, true) ||
6182 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6183 Depth + 1, true))
6184 return false;
6185
6186 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6187 SmallVector<int, 64> Mask0, Mask1;
6188 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6189 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6190 for (int i = 0; i != (int)MaskSize; ++i) {
6191 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6192 // loops converting between OR and BLEND shuffles due to
6193 // canWidenShuffleElements merging away undef elements, meaning we
6194 // fail to recognise the OR as the undef element isn't known zero.
6195 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6196 Mask.push_back(SM_SentinelZero);
6197 else if (Mask1[i] == SM_SentinelZero)
6198 Mask.push_back(i);
6199 else if (Mask0[i] == SM_SentinelZero)
6200 Mask.push_back(i + MaskSize);
6201 else
6202 return false;
6203 }
6204 Ops.push_back(N.getOperand(0));
6205 Ops.push_back(N.getOperand(1));
6206 return true;
6207 }
6208 case ISD::CONCAT_VECTORS: {
6209 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6210 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6211 if (NumBitsPerElt == 64) {
6212 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6213 for (unsigned M = 0; M != NumSubElts; ++M)
6214 Mask.push_back((I * NumElts) + M);
6215 Ops.push_back(N.getOperand(I));
6216 }
6217 return true;
6218 }
6219 return false;
6220 }
6221 case ISD::INSERT_SUBVECTOR: {
6222 SDValue Src = N.getOperand(0);
6223 SDValue Sub = N.getOperand(1);
6224 EVT SubVT = Sub.getValueType();
6225 unsigned NumSubElts = SubVT.getVectorNumElements();
6226 uint64_t InsertIdx = N.getConstantOperandVal(2);
6227 // Subvector isn't demanded - just return the base vector.
6228 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6229 Mask.resize(NumElts);
6230 std::iota(Mask.begin(), Mask.end(), 0);
6231 Ops.push_back(Src);
6232 return true;
6233 }
6234 // Handle CONCAT(SUB0, SUB1).
6235 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6236 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6237 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6238 Src.getOperand(0).isUndef() &&
6239 Src.getOperand(1).getValueType() == SubVT &&
6240 Src.getConstantOperandVal(2) == 0 &&
6241 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6242 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6243 Mask.resize(NumElts);
6244 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6245 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6246 Ops.push_back(Src.getOperand(1));
6247 Ops.push_back(Sub);
6248 return true;
6249 }
6250 if (!N->isOnlyUserOf(Sub.getNode()))
6251 return false;
6252
6253 SmallVector<int, 64> SubMask;
6254 SmallVector<SDValue, 2> SubInputs;
6255 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6256 EVT SubSrcVT = SubSrc.getValueType();
6257 if (!SubSrcVT.isVector())
6258 return false;
6259
6260 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6261 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6262 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6263 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6264 SDValue SubSrcSrc = SubSrc.getOperand(0);
6265 unsigned NumSubSrcSrcElts =
6266 SubSrcSrc.getValueType().getVectorNumElements();
6267 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6268 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6269 "Subvector valuetype mismatch");
6270 InsertIdx *= (MaxElts / NumElts);
6271 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6272 NumSubElts *= (MaxElts / NumElts);
6273 bool SrcIsUndef = Src.isUndef();
6274 for (int i = 0; i != (int)MaxElts; ++i)
6275 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6276 for (int i = 0; i != (int)NumSubElts; ++i)
6277 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6278 if (!SrcIsUndef)
6279 Ops.push_back(Src);
6280 Ops.push_back(SubSrcSrc);
6281 return true;
6282 }
6283
6284 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6285 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6286 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6287 Depth + 1, ResolveKnownElts))
6288 return false;
6289
6290 // Subvector shuffle inputs must not be larger than the subvector.
6291 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6292 return SubVT.getFixedSizeInBits() <
6293 SubInput.getValueSizeInBits().getFixedValue();
6294 }))
6295 return false;
6296
6297 if (SubMask.size() != NumSubElts) {
6298 assert(((SubMask.size() % NumSubElts) == 0 ||
6299 (NumSubElts % SubMask.size()) == 0) &&
6300 "Illegal submask scale");
6301 if ((NumSubElts % SubMask.size()) == 0) {
6302 int Scale = NumSubElts / SubMask.size();
6303 SmallVector<int, 64> ScaledSubMask;
6304 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6305 SubMask = ScaledSubMask;
6306 } else {
6307 int Scale = SubMask.size() / NumSubElts;
6308 NumSubElts = SubMask.size();
6309 NumElts *= Scale;
6310 InsertIdx *= Scale;
6311 }
6312 }
6313 Ops.push_back(Src);
6314 Ops.append(SubInputs.begin(), SubInputs.end());
6315 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6316 Mask.append(NumElts, SM_SentinelZero);
6317 else
6318 for (int i = 0; i != (int)NumElts; ++i)
6319 Mask.push_back(i);
6320 for (int i = 0; i != (int)NumSubElts; ++i) {
6321 int M = SubMask[i];
6322 if (0 <= M) {
6323 int InputIdx = M / NumSubElts;
6324 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6325 }
6326 Mask[i + InsertIdx] = M;
6327 }
6328 return true;
6329 }
6330 case X86ISD::PINSRB:
6331 case X86ISD::PINSRW:
6332 case ISD::SCALAR_TO_VECTOR:
6333 case ISD::INSERT_VECTOR_ELT: {
6334 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6335 // vector, for matching src/dst vector types.
6336 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6337
6338 unsigned DstIdx = 0;
6339 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6340 // Check we have an in-range constant insertion index.
6341 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6342 N.getConstantOperandAPInt(2).uge(NumElts))
6343 return false;
6344 DstIdx = N.getConstantOperandVal(2);
6345
6346 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6347 if (X86::isZeroNode(Scl)) {
6348 Ops.push_back(N.getOperand(0));
6349 for (unsigned i = 0; i != NumElts; ++i)
6350 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6351 return true;
6352 }
6353 }
6354
6355 // Peek through trunc/aext/zext/bitcast.
6356 // TODO: aext shouldn't require SM_SentinelZero padding.
6357 // TODO: handle shift of scalars.
6358 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6359 while (Scl.getOpcode() == ISD::TRUNCATE ||
6360 Scl.getOpcode() == ISD::ANY_EXTEND ||
6361 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6362 (Scl.getOpcode() == ISD::BITCAST &&
6365 Scl = Scl.getOperand(0);
6366 MinBitsPerElt =
6367 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6368 }
6369 if ((MinBitsPerElt % 8) != 0)
6370 return false;
6371
6372 // Attempt to find the source vector the scalar was extracted from.
6373 SDValue SrcExtract;
6374 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6375 Scl.getOpcode() == X86ISD::PEXTRW ||
6376 Scl.getOpcode() == X86ISD::PEXTRB) &&
6377 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6378 SrcExtract = Scl;
6379 }
6380 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6381 return false;
6382
6383 SDValue SrcVec = SrcExtract.getOperand(0);
6384 EVT SrcVT = SrcVec.getValueType();
6385 if (!SrcVT.getScalarType().isByteSized())
6386 return false;
6387 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6388 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6389 unsigned DstByte = DstIdx * NumBytesPerElt;
6390 MinBitsPerElt =
6391 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6392
6393 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6394 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6395 Ops.push_back(SrcVec);
6396 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6397 } else {
6398 Ops.push_back(SrcVec);
6399 Ops.push_back(N.getOperand(0));
6400 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6401 Mask.push_back(NumSizeInBytes + i);
6402 }
6403
6404 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6405 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6406 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6407 Mask[DstByte + i] = SrcByte + i;
6408 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6409 Mask[DstByte + i] = SM_SentinelZero;
6410 return true;
6411 }
6412 case X86ISD::PACKSS:
6413 case X86ISD::PACKUS: {
6414 SDValue N0 = N.getOperand(0);
6415 SDValue N1 = N.getOperand(1);
6416 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6417 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6418 "Unexpected input value type");
6419
6420 APInt EltsLHS, EltsRHS;
6421 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6422
6423 // If we know input saturation won't happen (or we don't care for particular
6424 // lanes), we can treat this as a truncation shuffle.
6425 bool Offset0 = false, Offset1 = false;
6426 if (Opcode == X86ISD::PACKSS) {
6427 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6428 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6429 (!(N1.isUndef() || EltsRHS.isZero()) &&
6430 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6431 return false;
6432 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6433 // PACKSS then it was likely being used for sign-extension for a
6434 // truncation, so just peek through and adjust the mask accordingly.
6435 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6436 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6437 Offset0 = true;
6438 N0 = N0.getOperand(0);
6439 }
6440 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6441 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6442 Offset1 = true;
6443 N1 = N1.getOperand(0);
6444 }
6445 } else {
6446 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6447 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6448 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6449 (!(N1.isUndef() || EltsRHS.isZero()) &&
6450 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6451 return false;
6452 }
6453
6454 bool IsUnary = (N0 == N1);
6455
6456 Ops.push_back(N0);
6457 if (!IsUnary)
6458 Ops.push_back(N1);
6459
6460 createPackShuffleMask(VT, Mask, IsUnary);
6461
6462 if (Offset0 || Offset1) {
6463 for (int &M : Mask)
6464 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6465 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6466 ++M;
6467 }
6468 return true;
6469 }
6470 case ISD::VSELECT:
6471 case X86ISD::BLENDV: {
6472 SDValue Cond = N.getOperand(0);
6473 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6474 Ops.push_back(N.getOperand(1));
6475 Ops.push_back(N.getOperand(2));
6476 return true;
6477 }
6478 return false;
6479 }
6480 case X86ISD::VTRUNC: {
6481 SDValue Src = N.getOperand(0);
6482 EVT SrcVT = Src.getValueType();
6483 if (SrcVT.getSizeInBits() != NumSizeInBits)
6484 return false;
6485 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6486 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6487 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6488 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6489 for (unsigned i = 0; i != NumSrcElts; ++i)
6490 Mask.push_back(i * Scale);
6491 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6492 Ops.push_back(Src);
6493 return true;
6494 }
6495 case ISD::SHL:
6496 case ISD::SRL: {
6497 APInt UndefElts;
6498 SmallVector<APInt, 32> EltBits;
6499 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6500 UndefElts, EltBits,
6501 /*AllowWholeUndefs*/ true,
6502 /*AllowPartialUndefs*/ false))
6503 return false;
6504
6505 // We can only decode 'whole byte' bit shifts as shuffles.
6506 for (unsigned I = 0; I != NumElts; ++I)
6507 if (DemandedElts[I] && !UndefElts[I] &&
6508 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6509 return false;
6510
6511 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6512 Ops.push_back(N.getOperand(0));
6513
6514 for (unsigned I = 0; I != NumElts; ++I) {
6515 if (!DemandedElts[I] || UndefElts[I])
6516 continue;
6517 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6518 unsigned Lo = I * NumBytesPerElt;
6519 unsigned Hi = Lo + NumBytesPerElt;
6520 // Clear mask to all zeros and insert the shifted byte indices.
6521 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6522 if (ISD::SHL == Opcode)
6523 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6524 else
6525 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6526 Lo + ByteShift);
6527 }
6528 return true;
6529 }
6530 case X86ISD::VSHLI:
6531 case X86ISD::VSRLI: {
6532 uint64_t ShiftVal = N.getConstantOperandVal(1);
6533 // Out of range bit shifts are guaranteed to be zero.
6534 if (NumBitsPerElt <= ShiftVal) {
6535 Mask.append(NumElts, SM_SentinelZero);
6536 return true;
6537 }
6538
6539 // We can only decode 'whole byte' bit shifts as shuffles.
6540 if ((ShiftVal % 8) != 0)
6541 break;
6542
6543 uint64_t ByteShift = ShiftVal / 8;
6544 Ops.push_back(N.getOperand(0));
6545
6546 // Clear mask to all zeros and insert the shifted byte indices.
6547 Mask.append(NumSizeInBytes, SM_SentinelZero);
6548
6549 if (X86ISD::VSHLI == Opcode) {
6550 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6551 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6552 Mask[i + j] = i + j - ByteShift;
6553 } else {
6554 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6555 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6556 Mask[i + j - ByteShift] = i + j;
6557 }
6558 return true;
6559 }
6560 case X86ISD::VROTLI:
6561 case X86ISD::VROTRI: {
6562 // We can only decode 'whole byte' bit rotates as shuffles.
6563 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6564 if ((RotateVal % 8) != 0)
6565 return false;
6566 Ops.push_back(N.getOperand(0));
6567 int Offset = RotateVal / 8;
6568 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6569 for (int i = 0; i != (int)NumElts; ++i) {
6570 int BaseIdx = i * NumBytesPerElt;
6571 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6572 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6573 }
6574 }
6575 return true;
6576 }
6577 case X86ISD::VBROADCAST: {
6578 SDValue Src = N.getOperand(0);
6579 if (!Src.getSimpleValueType().isVector()) {
6580 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6581 !isNullConstant(Src.getOperand(1)) ||
6582 Src.getOperand(0).getValueType().getScalarType() !=
6583 VT.getScalarType())
6584 return false;
6585 Src = Src.getOperand(0);
6586 }
6587 Ops.push_back(Src);
6588 Mask.append(NumElts, 0);
6589 return true;
6590 }
6591 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6592 SDValue Src = N.getOperand(0);
6593 EVT SrcVT = Src.getValueType();
6594 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6595
6596 // Extended source must be a simple vector.
6597 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6598 (NumBitsPerSrcElt % 8) != 0)
6599 return false;
6600
6601 // We can only handle all-signbits extensions.
6602 APInt DemandedSrcElts =
6603 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6604 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6605 return false;
6606
6607 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6608 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6609 for (unsigned I = 0; I != NumElts; ++I)
6610 Mask.append(Scale, I);
6611 Ops.push_back(Src);
6612 return true;
6613 }
6614 case ISD::ZERO_EXTEND:
6615 case ISD::ANY_EXTEND:
6616 case ISD::ZERO_EXTEND_VECTOR_INREG:
6617 case ISD::ANY_EXTEND_VECTOR_INREG: {
6618 SDValue Src = N.getOperand(0);
6619 EVT SrcVT = Src.getValueType();
6620
6621 // Extended source must be a simple vector.
6622 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6623 (SrcVT.getScalarSizeInBits() % 8) != 0)
6624 return false;
6625
6626 bool IsAnyExtend =
6627 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6628 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6629 IsAnyExtend, Mask);
6630 Ops.push_back(Src);
6631 return true;
6632 }
6633 }
6634
6635 return false;
6636}
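// For example, a v16i8 'and X, <255,0,255,0,...>' is decoded here as the faux
// shuffle mask <0, SM_SentinelZero, 2, SM_SentinelZero, ...> with the single
// input X, letting later combines treat the byte masking as a blend with zero.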
6637
6638/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6639 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6640 SmallVectorImpl<int> &Mask) {
6641 int MaskWidth = Mask.size();
6642 SmallVector<SDValue, 16> UsedInputs;
6643 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6644 int lo = UsedInputs.size() * MaskWidth;
6645 int hi = lo + MaskWidth;
6646
6647 // Strip UNDEF input usage.
6648 if (Inputs[i].isUndef())
6649 for (int &M : Mask)
6650 if ((lo <= M) && (M < hi))
6651 M = SM_SentinelUndef;
6652
6653 // Check for unused inputs.
6654 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6655 for (int &M : Mask)
6656 if (lo <= M)
6657 M -= MaskWidth;
6658 continue;
6659 }
6660
6661 // Check for repeated inputs.
6662 bool IsRepeat = false;
6663 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6664 if (UsedInputs[j] != Inputs[i])
6665 continue;
6666 for (int &M : Mask)
6667 if (lo <= M)
6668 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6669 IsRepeat = true;
6670 break;
6671 }
6672 if (IsRepeat)
6673 continue;
6674
6675 UsedInputs.push_back(Inputs[i]);
6676 }
6677 Inputs = UsedInputs;
6678}
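// For example, if Inputs = {X, X} with mask <0, 4, 1, 5>, the repeated input
// is dropped and the mask is remapped to <0, 0, 1, 1>; inputs whose elements
// are never referenced by the mask are removed entirely.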
6679
6680/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6681/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6682/// Returns true if the target shuffle mask was decoded.
6683static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6684 SmallVectorImpl<SDValue> &Inputs,
6685 SmallVectorImpl<int> &Mask,
6686 APInt &KnownUndef, APInt &KnownZero,
6687 const SelectionDAG &DAG, unsigned Depth,
6688 bool ResolveKnownElts) {
6689 if (Depth >= SelectionDAG::MaxRecursionDepth)
6690 return false; // Limit search depth.
6691
6692 EVT VT = Op.getValueType();
6693 if (!VT.isSimple() || !VT.isVector())
6694 return false;
6695
6696 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6697 if (ResolveKnownElts)
6698 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6699 return true;
6700 }
6701 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6702 ResolveKnownElts)) {
6703 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6704 return true;
6705 }
6706 return false;
6707}
6708
6709static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6710 SmallVectorImpl<SDValue> &Inputs,
6711 SmallVectorImpl<int> &Mask,
6712 const SelectionDAG &DAG, unsigned Depth,
6713 bool ResolveKnownElts) {
6714 APInt KnownUndef, KnownZero;
6715 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6716 KnownZero, DAG, Depth, ResolveKnownElts);
6717}
6718
6719 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6720 SmallVectorImpl<int> &Mask,
6721 const SelectionDAG &DAG, unsigned Depth = 0,
6722 bool ResolveKnownElts = true) {
6723 EVT VT = Op.getValueType();
6724 if (!VT.isSimple() || !VT.isVector())
6725 return false;
6726
6727 unsigned NumElts = Op.getValueType().getVectorNumElements();
6728 APInt DemandedElts = APInt::getAllOnes(NumElts);
6729 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6730 ResolveKnownElts);
6731}
6732
6733// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6734static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6735 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6736 SelectionDAG &DAG) {
6737 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6738 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6739 "Unknown broadcast load type");
6740
6741 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6742 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6743 return SDValue();
6744
6745 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6746 TypeSize::getFixed(Offset), DL);
6747 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6748 SDValue Ops[] = {Mem->getChain(), Ptr};
6749 SDValue BcstLd = DAG.getMemIntrinsicNode(
6750 Opcode, DL, Tys, Ops, MemVT,
6751 DAG.getMachineFunction().getMachineMemOperand(
6752 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6753 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6754 return BcstLd;
6755}
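// For example, given a 16-byte constant-pool load, a caller can rebuild it as
// an X86ISD::VBROADCAST_LOAD of the element at byte 'Offset', broadcasting
// that scalar (or subvector) to the full vector width while
// makeEquivalentMemoryOrdering keeps the original chain users valid.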
6756
6757/// Returns the scalar element that will make up the i'th
6758/// element of the result of the vector shuffle.
6759static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6760 SelectionDAG &DAG, unsigned Depth) {
6761 if (Depth >= SelectionDAG::MaxRecursionDepth)
6762 return SDValue(); // Limit search depth.
6763
6764 EVT VT = Op.getValueType();
6765 unsigned Opcode = Op.getOpcode();
6766 unsigned NumElems = VT.getVectorNumElements();
6767
6768 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6769 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6770 int Elt = SV->getMaskElt(Index);
6771
6772 if (Elt < 0)
6773 return DAG.getUNDEF(VT.getVectorElementType());
6774
6775 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6776 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6777 }
6778
6779 // Recurse into target specific vector shuffles to find scalars.
6780 if (isTargetShuffle(Opcode)) {
6781 MVT ShufVT = VT.getSimpleVT();
6782 MVT ShufSVT = ShufVT.getVectorElementType();
6783 int NumElems = (int)ShufVT.getVectorNumElements();
6784 SmallVector<int, 16> ShuffleMask;
6785 SmallVector<SDValue, 16> ShuffleOps;
6786 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6787 return SDValue();
6788
6789 int Elt = ShuffleMask[Index];
6790 if (Elt == SM_SentinelZero)
6791 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6792 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6793 if (Elt == SM_SentinelUndef)
6794 return DAG.getUNDEF(ShufSVT);
6795
6796 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6797 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6798 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6799 }
6800
6801 // Recurse into insert_subvector base/sub vector to find scalars.
6802 if (Opcode == ISD::INSERT_SUBVECTOR) {
6803 SDValue Vec = Op.getOperand(0);
6804 SDValue Sub = Op.getOperand(1);
6805 uint64_t SubIdx = Op.getConstantOperandVal(2);
6806 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6807
6808 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6809 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6810 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6811 }
6812
6813 // Recurse into concat_vectors sub vector to find scalars.
6814 if (Opcode == ISD::CONCAT_VECTORS) {
6815 EVT SubVT = Op.getOperand(0).getValueType();
6816 unsigned NumSubElts = SubVT.getVectorNumElements();
6817 uint64_t SubIdx = Index / NumSubElts;
6818 uint64_t SubElt = Index % NumSubElts;
6819 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6820 }
6821
6822 // Recurse into extract_subvector src vector to find scalars.
6823 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6824 SDValue Src = Op.getOperand(0);
6825 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6826 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6827 }
6828
6829 // We only peek through bitcasts of the same vector width.
6830 if (Opcode == ISD::BITCAST) {
6831 SDValue Src = Op.getOperand(0);
6832 EVT SrcVT = Src.getValueType();
6833 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6834 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6835 return SDValue();
6836 }
6837
6838 // Actual nodes that may contain scalar elements
6839
6840 // For insert_vector_elt - either return the index matching scalar or recurse
6841 // into the base vector.
6842 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6843 isa<ConstantSDNode>(Op.getOperand(2))) {
6844 if (Op.getConstantOperandAPInt(2) == Index)
6845 return Op.getOperand(1);
6846 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6847 }
6848
6849 if (Opcode == ISD::SCALAR_TO_VECTOR)
6850 return (Index == 0) ? Op.getOperand(0)
6851 : DAG.getUNDEF(VT.getVectorElementType());
6852
6853 if (Opcode == ISD::BUILD_VECTOR)
6854 return Op.getOperand(Index);
6855
6856 return SDValue();
6857}
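// For example, asking for element 2 of shufflevector(%a, %b, <6, 1, 5, 0>)
// recurses into %b at index 1 and, if %b is a BUILD_VECTOR, returns its second
// operand; a zero sentinel in a target shuffle instead returns a constant zero
// of the element type.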
6858
6859// Use PINSRB/PINSRW/PINSRD to create a build vector.
6860 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6861 const APInt &NonZeroMask,
6862 unsigned NumNonZero, unsigned NumZero,
6863 SelectionDAG &DAG,
6864 const X86Subtarget &Subtarget) {
6865 MVT VT = Op.getSimpleValueType();
6866 unsigned NumElts = VT.getVectorNumElements();
6867 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6868 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6869 "Illegal vector insertion");
6870
6871 SDValue V;
6872 bool First = true;
6873
6874 for (unsigned i = 0; i < NumElts; ++i) {
6875 bool IsNonZero = NonZeroMask[i];
6876 if (!IsNonZero)
6877 continue;
6878
6879 // If the build vector contains zeros or our first insertion is not the
6880 // first index, then insert into a zero vector to break any register
6881 // dependency; otherwise use SCALAR_TO_VECTOR.
6882 if (First) {
6883 First = false;
6884 if (NumZero || 0 != i)
6885 V = getZeroVector(VT, Subtarget, DAG, DL);
6886 else {
6887 assert(0 == i && "Expected insertion into zero-index");
6888 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6889 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6890 V = DAG.getBitcast(VT, V);
6891 continue;
6892 }
6893 }
6894 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6895 DAG.getVectorIdxConstant(i, DL));
6896 }
6897
6898 return V;
6899}
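// For example, building v8i16 <x, 0, 0, 0, 0, 0, 0, y> starts from a zero
// vector (NumZero != 0) and emits just two element insertions (PINSRW at
// indices 0 and 7) rather than eight scalar inserts.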
6900
6901/// Custom lower build_vector of v16i8.
6902 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6903 const APInt &NonZeroMask,
6904 unsigned NumNonZero, unsigned NumZero,
6905 SelectionDAG &DAG,
6906 const X86Subtarget &Subtarget) {
6907 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6908 return SDValue();
6909
6910 // SSE4.1 - use PINSRB to insert each byte directly.
6911 if (Subtarget.hasSSE41())
6912 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6913 DAG, Subtarget);
6914
6915 SDValue V;
6916
6917 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6918 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6919 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6920 !NonZeroMask.extractBits(2, 2).isZero()) {
6921 for (unsigned I = 0; I != 4; ++I) {
6922 if (!NonZeroMask[I])
6923 continue;
6924 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6925 if (I != 0)
6926 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6927 DAG.getConstant(I * 8, DL, MVT::i8));
6928 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6929 }
6930 assert(V && "Failed to fold v16i8 vector to zero");
6931 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6932 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6933 V = DAG.getBitcast(MVT::v8i16, V);
6934 }
6935 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6936 bool ThisIsNonZero = NonZeroMask[i];
6937 bool NextIsNonZero = NonZeroMask[i + 1];
6938 if (!ThisIsNonZero && !NextIsNonZero)
6939 continue;
6940
6941 SDValue Elt;
6942 if (ThisIsNonZero) {
6943 if (NumZero || NextIsNonZero)
6944 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6945 else
6946 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6947 }
6948
6949 if (NextIsNonZero) {
6950 SDValue NextElt = Op.getOperand(i + 1);
6951 if (i == 0 && NumZero)
6952 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6953 else
6954 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6955 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6956 DAG.getConstant(8, DL, MVT::i8));
6957 if (ThisIsNonZero)
6958 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6959 else
6960 Elt = NextElt;
6961 }
6962
6963 // If our first insertion is not the first index or zeros are needed, then
6964 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6965 // elements undefined).
6966 if (!V) {
6967 if (i != 0 || NumZero)
6968 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6969 else {
6970 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6971 V = DAG.getBitcast(MVT::v8i16, V);
6972 continue;
6973 }
6974 }
6975 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6976 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6977 DAG.getVectorIdxConstant(i / 2, DL));
6978 }
6979
6980 return DAG.getBitcast(MVT::v16i8, V);
6981}
6982
6983/// Custom lower build_vector of v8i16.
6984static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6985 const APInt &NonZeroMask,
6986 unsigned NumNonZero, unsigned NumZero,
6987 SelectionDAG &DAG,
6988 const X86Subtarget &Subtarget) {
6989 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6990 return SDValue();
6991
6992 // Use PINSRW to insert each 16-bit element directly.
6993 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6994 Subtarget);
6995}
6996
6997/// Custom lower build_vector of v4i32 or v4f32.
6998static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6999 SelectionDAG &DAG,
7000 const X86Subtarget &Subtarget) {
7001 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7002 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7003 // Because we're creating a less complicated build vector here, we may enable
7004 // further folding of the MOVDDUP via shuffle transforms.
7005 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7006 Op.getOperand(0) == Op.getOperand(2) &&
7007 Op.getOperand(1) == Op.getOperand(3) &&
7008 Op.getOperand(0) != Op.getOperand(1)) {
7009 MVT VT = Op.getSimpleValueType();
7010 MVT EltVT = VT.getVectorElementType();
7011 // Create a new build vector with the first 2 elements followed by undef
7012 // padding, bitcast to v2f64, duplicate, and bitcast back.
7013 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7014 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7015 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7016 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7017 return DAG.getBitcast(VT, Dup);
7018 }
7019
7020 // Find all zeroable elements.
7021 std::bitset<4> Zeroable, Undefs;
7022 for (int i = 0; i < 4; ++i) {
7023 SDValue Elt = Op.getOperand(i);
7024 Undefs[i] = Elt.isUndef();
7025 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7026 }
7027 assert(Zeroable.size() - Zeroable.count() > 1 &&
7028 "We expect at least two non-zero elements!");
7029
7030 // We only know how to deal with build_vector nodes where elements are either
7031 // zeroable or extract_vector_elt with constant index.
7032 SDValue FirstNonZero;
7033 unsigned FirstNonZeroIdx;
7034 for (unsigned i = 0; i < 4; ++i) {
7035 if (Zeroable[i])
7036 continue;
7037 SDValue Elt = Op.getOperand(i);
7038 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7039 !isa<ConstantSDNode>(Elt.getOperand(1)))
7040 return SDValue();
7041 // Make sure that this node is extracting from a 128-bit vector.
7042 MVT VT = Elt.getOperand(0).getSimpleValueType();
7043 if (!VT.is128BitVector())
7044 return SDValue();
7045 if (!FirstNonZero.getNode()) {
7046 FirstNonZero = Elt;
7047 FirstNonZeroIdx = i;
7048 }
7049 }
7050
7051 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7052 SDValue V1 = FirstNonZero.getOperand(0);
7053 MVT VT = V1.getSimpleValueType();
7054
7055 // See if this build_vector can be lowered as a blend with zero.
7056 SDValue Elt;
7057 unsigned EltMaskIdx, EltIdx;
7058 int Mask[4];
7059 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7060 if (Zeroable[EltIdx]) {
7061 // The zero vector will be on the right hand side.
7062 Mask[EltIdx] = EltIdx+4;
7063 continue;
7064 }
7065
7066 Elt = Op->getOperand(EltIdx);
7067 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7068 EltMaskIdx = Elt.getConstantOperandVal(1);
7069 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7070 break;
7071 Mask[EltIdx] = EltIdx;
7072 }
7073
7074 if (EltIdx == 4) {
7075 // Let the shuffle legalizer deal with blend operations.
7076 SDValue VZeroOrUndef = (Zeroable == Undefs)
7077 ? DAG.getUNDEF(VT)
7078 : getZeroVector(VT, Subtarget, DAG, DL);
7079 if (V1.getSimpleValueType() != VT)
7080 V1 = DAG.getBitcast(VT, V1);
7081 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7082 }
7083
7084 // See if we can lower this build_vector to a INSERTPS.
7085 if (!Subtarget.hasSSE41())
7086 return SDValue();
7087
7088 SDValue V2 = Elt.getOperand(0);
7089 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7090 V1 = SDValue();
7091
7092 bool CanFold = true;
7093 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7094 if (Zeroable[i])
7095 continue;
7096
7097 SDValue Current = Op->getOperand(i);
7098 SDValue SrcVector = Current->getOperand(0);
7099 if (!V1.getNode())
7100 V1 = SrcVector;
7101 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7102 }
7103
7104 if (!CanFold)
7105 return SDValue();
7106
7107 assert(V1.getNode() && "Expected at least two non-zero elements!");
7108 if (V1.getSimpleValueType() != MVT::v4f32)
7109 V1 = DAG.getBitcast(MVT::v4f32, V1);
7110 if (V2.getSimpleValueType() != MVT::v4f32)
7111 V2 = DAG.getBitcast(MVT::v4f32, V2);
7112
7113 // Ok, we can emit an INSERTPS instruction.
7114 unsigned ZMask = Zeroable.to_ulong();
7115
7116 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7117 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7118 SDValue Result =
7119 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7120 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7121 return DAG.getBitcast(VT, Result);
7122}
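// The INSERTPS immediate built above encodes, from MSB to LSB: bits [7:6] the
// source element of V2 (EltMaskIdx), bits [5:4] the destination lane (EltIdx)
// and bits [3:0] the zero mask of lanes to clear.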
7123
7124/// Return a vector logical shift node.
7125static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7126 SelectionDAG &DAG, const TargetLowering &TLI,
7127 const SDLoc &dl) {
7128 assert(VT.is128BitVector() && "Unknown type for VShift");
7129 MVT ShVT = MVT::v16i8;
7130 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7131 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7132 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7133 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7134 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7135}
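// Example: getVShift(/*isLeft=*/true, MVT::v2i64, X, 64, ...) bitcasts X to
// v16i8 and emits VSHLDQ with an immediate of 8, i.e. a whole-register PSLLDQ
// by 8 bytes.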
7136
7137static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7138 SelectionDAG &DAG) {
7139
7140 // Check if the scalar load can be widened into a vector load. And if
7141 // the address is "base + cst" see if the cst can be "absorbed" into
7142 // the shuffle mask.
7143 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7144 SDValue Ptr = LD->getBasePtr();
7145 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7146 return SDValue();
7147 EVT PVT = LD->getValueType(0);
7148 if (PVT != MVT::i32 && PVT != MVT::f32)
7149 return SDValue();
7150
7151 int FI = -1;
7152 int64_t Offset = 0;
7153 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7154 FI = FINode->getIndex();
7155 Offset = 0;
7156 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7157 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7158 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7159 Offset = Ptr.getConstantOperandVal(1);
7160 Ptr = Ptr.getOperand(0);
7161 } else {
7162 return SDValue();
7163 }
7164
7165 // FIXME: 256-bit vector instructions don't require a strict alignment,
7166 // improve this code to support it better.
7167 Align RequiredAlign(VT.getSizeInBits() / 8);
7168 SDValue Chain = LD->getChain();
7169 // Make sure the stack object alignment is at least 16 or 32.
7170 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7171 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7172 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7173 if (MFI.isFixedObjectIndex(FI)) {
7174 // Can't change the alignment. FIXME: It's possible to compute
7175 // the exact stack offset and reference FI + adjust offset instead.
7176 // If someone *really* cares about this. That's the way to implement it.
7177 return SDValue();
7178 } else {
7179 MFI.setObjectAlignment(FI, RequiredAlign);
7180 }
7181 }
7182
7183 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7184 // Ptr + (Offset & ~15).
7185 if (Offset < 0)
7186 return SDValue();
7187 if ((Offset % RequiredAlign.value()) & 3)
7188 return SDValue();
7189 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7190 if (StartOffset) {
7191 SDLoc DL(Ptr);
7192 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7193 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7194 }
7195
7196 int EltNo = (Offset - StartOffset) >> 2;
7197 unsigned NumElems = VT.getVectorNumElements();
7198
7199 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7200 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7201 LD->getPointerInfo().getWithOffset(StartOffset));
7202
7203 SmallVector<int, 8> Mask(NumElems, EltNo);
7204
7205 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7206 }
7207
7208 return SDValue();
7209}
7210
7211// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7212static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7213 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7214 auto *BaseLd = cast<LoadSDNode>(Elt);
7215 if (!BaseLd->isSimple())
7216 return false;
7217 Ld = BaseLd;
7218 ByteOffset = 0;
7219 return true;
7220 }
7221
7222 switch (Elt.getOpcode()) {
7223 case ISD::BITCAST:
7224 case ISD::TRUNCATE:
7225 case ISD::SCALAR_TO_VECTOR:
7226 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7227 case ISD::SRL:
7228 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7229 uint64_t Amt = AmtC->getZExtValue();
7230 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7231 ByteOffset += Amt / 8;
7232 return true;
7233 }
7234 }
7235 break;
7236 case ISD::EXTRACT_VECTOR_ELT:
7237 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7238 SDValue Src = Elt.getOperand(0);
7239 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7240 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7241 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7242 findEltLoadSrc(Src, Ld, ByteOffset)) {
7243 uint64_t Idx = IdxC->getZExtValue();
7244 ByteOffset += Idx * (SrcSizeInBits / 8);
7245 return true;
7246 }
7247 }
7248 break;
7249 }
7250
7251 return false;
7252}
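// Example: (srl (i32 load %p), 16) yields the load with ByteOffset = 2, and
// (extract_vector_elt (v4i32 load %p), 2) yields the load with ByteOffset = 8,
// provided the element and result sizes match.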
7253
7254/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7255/// elements can be replaced by a single large load which has the same value as
7256/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7257///
7258/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7259static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7260 const SDLoc &DL, SelectionDAG &DAG,
7261 const X86Subtarget &Subtarget,
7262 bool IsAfterLegalize) {
7263 if ((VT.getScalarSizeInBits() % 8) != 0)
7264 return SDValue();
7265
7266 unsigned NumElems = Elts.size();
7267
7268 int LastLoadedElt = -1;
7269 APInt LoadMask = APInt::getZero(NumElems);
7270 APInt ZeroMask = APInt::getZero(NumElems);
7271 APInt UndefMask = APInt::getZero(NumElems);
7272
7273 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7274 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7275
7276 // For each element in the initializer, see if we've found a load, zero or an
7277 // undef.
7278 for (unsigned i = 0; i < NumElems; ++i) {
7279 SDValue Elt = peekThroughBitcasts(Elts[i]);
7280 if (!Elt.getNode())
7281 return SDValue();
7282 if (Elt.isUndef()) {
7283 UndefMask.setBit(i);
7284 continue;
7285 }
7286 if (X86::isZeroNode(Elt)) {
7287 ZeroMask.setBit(i);
7288 continue;
7289 }
7290
7291 // Each loaded element must be the correct fractional portion of the
7292 // requested vector load.
7293 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7294 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7295 return SDValue();
7296
7297 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7298 return SDValue();
7299 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7300 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7301 return SDValue();
7302
7303 LoadMask.setBit(i);
7304 LastLoadedElt = i;
7305 }
7306 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7307 NumElems &&
7308 "Incomplete element masks");
7309
7310 // Handle Special Cases - all undef or undef/zero.
7311 if (UndefMask.popcount() == NumElems)
7312 return DAG.getUNDEF(VT);
7313 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7314 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7315 : DAG.getConstantFP(0.0, DL, VT);
7316
7317 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7318 int FirstLoadedElt = LoadMask.countr_zero();
7319 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7320 EVT EltBaseVT = EltBase.getValueType();
7321 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7322 "Register/Memory size mismatch");
7323 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7324 assert(LDBase && "Did not find base load for merging consecutive loads");
7325 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7326 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7327 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7328 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7329 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7330
7331 // TODO: Support offsetting the base load.
7332 if (ByteOffsets[FirstLoadedElt] != 0)
7333 return SDValue();
7334
7335 // Check to see if the element's load is consecutive to the base load
7336 // or offset from a previous (already checked) load.
7337 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7338 LoadSDNode *Ld = Loads[EltIdx];
7339 int64_t ByteOffset = ByteOffsets[EltIdx];
7340 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7341 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7342 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7343 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7344 }
7345 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7346 EltIdx - FirstLoadedElt);
7347 };
7348
7349 // Consecutive loads can contain UNDEFs but not ZERO elements.
7350 // Consecutive loads with UNDEF and ZERO elements require an
7351 // additional shuffle stage to clear the ZERO elements.
7352 bool IsConsecutiveLoad = true;
7353 bool IsConsecutiveLoadWithZeros = true;
7354 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7355 if (LoadMask[i]) {
7356 if (!CheckConsecutiveLoad(LDBase, i)) {
7357 IsConsecutiveLoad = false;
7358 IsConsecutiveLoadWithZeros = false;
7359 break;
7360 }
7361 } else if (ZeroMask[i]) {
7362 IsConsecutiveLoad = false;
7363 }
7364 }
7365
7366 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7367 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7368 assert(LDBase->isSimple() &&
7369 "Cannot merge volatile or atomic loads.");
7370 SDValue NewLd =
7371 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7372 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7373 for (auto *LD : Loads)
7374 if (LD)
7375 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7376 return NewLd;
7377 };
7378
7379 // Check if the base load is entirely dereferenceable.
7380 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7381 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7382
7383 // LOAD - all consecutive load/undefs (must start/end with a load or be
7384 // entirely dereferenceable). If we have found an entire vector of loads and
7385 // undefs, then return a large load of the entire vector width starting at the
7386 // base pointer. If the vector contains zeros, then attempt to shuffle those
7387 // elements.
7388 if (FirstLoadedElt == 0 &&
7389 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7390 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7391 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7392 return SDValue();
7393
7394 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7395 // will lower to regular temporal loads and use the cache.
7396 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7397 VT.is256BitVector() && !Subtarget.hasInt256())
7398 return SDValue();
7399
7400 if (NumElems == 1)
7401 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7402
7403 if (!ZeroMask)
7404 return CreateLoad(VT, LDBase);
7405
7406 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7407 // vector and a zero vector to clear out the zero elements.
7408 if (!IsAfterLegalize && VT.isVector()) {
7409 unsigned NumMaskElts = VT.getVectorNumElements();
7410 if ((NumMaskElts % NumElems) == 0) {
7411 unsigned Scale = NumMaskElts / NumElems;
7412 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7413 for (unsigned i = 0; i < NumElems; ++i) {
7414 if (UndefMask[i])
7415 continue;
7416 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7417 for (unsigned j = 0; j != Scale; ++j)
7418 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7419 }
7420 SDValue V = CreateLoad(VT, LDBase);
7421 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7422 : DAG.getConstantFP(0.0, DL, VT);
7423 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7424 }
7425 }
7426 }
7427
7428 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7429 if (VT.is256BitVector() || VT.is512BitVector()) {
7430 unsigned HalfNumElems = NumElems / 2;
7431 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7432 EVT HalfVT =
7433 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7434 SDValue HalfLD =
7435 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7436 DAG, Subtarget, IsAfterLegalize);
7437 if (HalfLD)
7438 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7439 HalfLD, DAG.getVectorIdxConstant(0, DL));
7440 }
7441 }
7442
7443 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7444 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7445 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7446 LoadSizeInBits == 64) &&
7447 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7448 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7449 : MVT::getIntegerVT(LoadSizeInBits);
7450 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7451 // Allow v4f32 on SSE1 only targets.
7452 // FIXME: Add more isel patterns so we can just use VT directly.
7453 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7454 VecVT = MVT::v4f32;
7455 if (TLI.isTypeLegal(VecVT)) {
7456 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7457 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7458 SDValue ResNode = DAG.getMemIntrinsicNode(
7459 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7460 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7461 for (auto *LD : Loads)
7462 if (LD)
7463 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7464 return DAG.getBitcast(VT, ResNode);
7465 }
7466 }
7467
7468 // BROADCAST - match the smallest possible repetition pattern, load that
7469 // scalar/subvector element and then broadcast to the entire vector.
7470 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7471 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7472 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7473 unsigned RepeatSize = SubElems * BaseSizeInBits;
7474 unsigned ScalarSize = std::min(RepeatSize, 64u);
7475 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7476 continue;
7477
7478 // Don't attempt a 1:N subvector broadcast - it should be caught by
7479 // combineConcatVectorOps, else it will cause infinite loops.
7480 if (RepeatSize > ScalarSize && SubElems == 1)
7481 continue;
7482
7483 bool Match = true;
7484 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7485 for (unsigned i = 0; i != NumElems && Match; ++i) {
7486 if (!LoadMask[i])
7487 continue;
7488 SDValue Elt = peekThroughBitcasts(Elts[i]);
7489 if (RepeatedLoads[i % SubElems].isUndef())
7490 RepeatedLoads[i % SubElems] = Elt;
7491 else
7492 Match &= (RepeatedLoads[i % SubElems] == Elt);
7493 }
7494
7495 // We must have loads at both ends of the repetition.
7496 Match &= !RepeatedLoads.front().isUndef();
7497 Match &= !RepeatedLoads.back().isUndef();
7498 if (!Match)
7499 continue;
7500
7501 EVT RepeatVT =
7502 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7503 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7504 : EVT::getFloatingPointVT(ScalarSize);
7505 if (RepeatSize > ScalarSize)
7506 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7507 RepeatSize / ScalarSize);
7508 EVT BroadcastVT =
7509 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7510 VT.getSizeInBits() / ScalarSize);
7511 if (TLI.isTypeLegal(BroadcastVT)) {
7512 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7513 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7514 SDValue Broadcast = RepeatLoad;
7515 if (RepeatSize > ScalarSize) {
7516 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7517 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7518 } else {
7519 if (!Subtarget.hasAVX2() &&
7520 !X86::mayFoldLoadIntoBroadcastFromMem(
7521 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7522 Subtarget,
7523 /*AssumeSingleUse=*/true))
7524 return SDValue();
7525 Broadcast =
7526 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7527 }
7528 return DAG.getBitcast(VT, Broadcast);
7529 }
7530 }
7531 }
7532 }
7533
7534 return SDValue();
7535}
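// In the consecutive-load-with-zeros case above, the clear mask keeps loaded
// lanes from the wide load and takes zeroed lanes from the zero operand, e.g.
// for a v4i32 pattern <load, load, zero, load> the shuffle mask is <0, 1, 6, 3>.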
7536
7537// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7538// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7539// are consecutive, non-overlapping, and in the right order.
7540static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7541 SelectionDAG &DAG,
7542 const X86Subtarget &Subtarget,
7543 bool IsAfterLegalize) {
7544 SmallVector<SDValue, 64> Elts;
7545 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7546 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7547 Elts.push_back(Elt);
7548 continue;
7549 }
7550 return SDValue();
7551 }
7552 assert(Elts.size() == VT.getVectorNumElements());
7553 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7554 IsAfterLegalize);
7555}
7556
7557static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7558 const APInt &Undefs, LLVMContext &C) {
7559 unsigned ScalarSize = VT.getScalarSizeInBits();
7560 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7561
7562 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7563 if (VT.isFloatingPoint()) {
7564 if (ScalarSize == 16)
7565 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7566 if (ScalarSize == 32)
7567 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7568 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7569 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7570 }
7571 return Constant::getIntegerValue(Ty, Val);
7572 };
7573
7574 SmallVector<Constant *, 32> ConstantVec;
7575 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7576 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7577 : getConstantScalar(Bits[I]));
7578
7579 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7580}
7581
7582static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7583 unsigned SplatBitSize, LLVMContext &C) {
7584 unsigned ScalarSize = VT.getScalarSizeInBits();
7585
7586 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7587 if (VT.isFloatingPoint()) {
7588 if (ScalarSize == 16)
7589 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7590 if (ScalarSize == 32)
7591 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7592 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7593 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7594 }
7595 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7596 };
7597
7598 if (ScalarSize == SplatBitSize)
7599 return getConstantScalar(SplatValue);
7600
7601 unsigned NumElm = SplatBitSize / ScalarSize;
7602 SmallVector<Constant *, 32> ConstantVec;
7603 for (unsigned I = 0; I != NumElm; ++I) {
7604 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7605 ConstantVec.push_back(getConstantScalar(Val));
7606 }
7607 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7608}
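// Example: for VT = v8i32 and a 64-bit splat value 0x0000000100000000 this
// returns the two-element constant <i32 0, i32 1>, which the caller broadcasts
// across the full vector width.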
7609
7610static bool isFoldableUseOfShuffle(SDNode *N) {
7611 for (auto *U : N->users()) {
7612 unsigned Opc = U->getOpcode();
7613 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7614 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7615 return false;
7616 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7617 return false;
7618 if (isTargetShuffle(Opc))
7619 return true;
7620 if (Opc == ISD::BITCAST) // Ignore bitcasts
7621 return isFoldableUseOfShuffle(U);
7622 if (N->hasOneUse()) {
7623 // TODO: there may be some general way to know if an SDNode can
7624 // be folded. For now we only know whether an MI is foldable.
7625 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7626 return false;
7627 return true;
7628 }
7629 }
7630 return false;
7631}
7632
7633// If the node has a single use by a VSELECT then AVX512 targets may be able to
7634// fold as a predicated instruction.
7635static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7636 unsigned SizeInBits = V.getValueSizeInBits();
7637 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7638 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7639 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7640 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7641 return true;
7642 }
7643 }
7644 return false;
7645}
7646
7647/// Attempt to use the vbroadcast instruction to generate a splat value
7648/// from a splat BUILD_VECTOR which uses:
7649/// a. A single scalar load, or a constant.
7650/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7651///
7652/// The VBROADCAST node is returned when a pattern is found,
7653/// or SDValue() otherwise.
7654static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7655 const SDLoc &dl,
7656 const X86Subtarget &Subtarget,
7657 SelectionDAG &DAG) {
7658 // VBROADCAST requires AVX.
7659 // TODO: Splats could be generated for non-AVX CPUs using SSE
7660 // instructions, but there's less potential gain for only 128-bit vectors.
7661 if (!Subtarget.hasAVX())
7662 return SDValue();
7663
7664 MVT VT = BVOp->getSimpleValueType(0);
7665 unsigned NumElts = VT.getVectorNumElements();
7666 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7667 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7668 "Unsupported vector type for broadcast.");
7669
7670 // See if the build vector is a repeating sequence of scalars (inc. splat).
7671 SDValue Ld;
7672 BitVector UndefElements;
7673 SmallVector<SDValue, 16> Sequence;
7674 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7675 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7676 if (Sequence.size() == 1)
7677 Ld = Sequence[0];
7678 }
7679
7680 // Attempt to use VBROADCASTM
7681 // From this pattern:
7682 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7683 // b. t1 = (build_vector t0 t0)
7684 //
7685 // Create (VBROADCASTM v2i1 X)
7686 if (!Sequence.empty() && Subtarget.hasCDI()) {
7687 // If not a splat, are the upper sequence values zeroable?
7688 unsigned SeqLen = Sequence.size();
7689 bool UpperZeroOrUndef =
7690 SeqLen == 1 ||
7691 llvm::all_of(ArrayRef(Sequence).drop_front(),
7692 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7693 SDValue Op0 = Sequence[0];
7694 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7695 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7696 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7697 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7698 ? Op0.getOperand(0)
7699 : Op0.getOperand(0).getOperand(0);
7700 MVT MaskVT = BOperand.getSimpleValueType();
7701 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7702 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7703 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7704 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7705 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7706 unsigned Scale = 512 / VT.getSizeInBits();
7707 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7708 }
7709 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7710 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7711 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7712 return DAG.getBitcast(VT, Bcst);
7713 }
7714 }
7715 }
7716
7717 unsigned NumUndefElts = UndefElements.count();
7718 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7719 APInt SplatValue, Undef;
7720 unsigned SplatBitSize;
7721 bool HasUndef;
7722 // Check if this is a repeated constant pattern suitable for broadcasting.
7723 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7724 SplatBitSize > VT.getScalarSizeInBits() &&
7725 SplatBitSize < VT.getSizeInBits()) {
7726 // Avoid replacing with broadcast when it's a use of a shuffle
7727 // instruction to preserve the present custom lowering of shuffles.
7728 if (isFoldableUseOfShuffle(BVOp))
7729 return SDValue();
7730 // replace BUILD_VECTOR with broadcast of the repeated constants.
7731 LLVMContext *Ctx = DAG.getContext();
7732 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7733 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7734 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7735 // Load the constant scalar/subvector and broadcast it.
7736 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7737 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7738 SDValue CP = DAG.getConstantPool(C, PVT);
7739 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7740
7741 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7742 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7743 SDValue Ops[] = {DAG.getEntryNode(), CP};
7744 MachinePointerInfo MPI =
7745 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7746 SDValue Brdcst =
7747 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7748 MPI, Alignment, MachineMemOperand::MOLoad);
7749 return DAG.getBitcast(VT, Brdcst);
7750 }
7751 if (SplatBitSize > 64) {
7752 // Load the vector of constants and broadcast it.
7753 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7754 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7755 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7756 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7757 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7758 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7759 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7760 MachinePointerInfo MPI =
7761 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7762 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7763 Ops, VVT, MPI, Alignment,
7764 MachineMemOperand::MOLoad);
7765 }
7766 }
7767
7768 // If we are moving a scalar into a vector (Ld must be set and all elements
7769 // but 1 are undef) and that operation is not obviously supported by
7770 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7771 // That's better than general shuffling and may eliminate a load to GPR and
7772 // move from scalar to vector register.
7773 if (!Ld || NumElts - NumUndefElts != 1)
7774 return SDValue();
7775 unsigned ScalarSize = Ld.getValueSizeInBits();
7776 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7777 return SDValue();
7778 }
7779
7780 bool ConstSplatVal =
7781 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7782 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7783
7784 // TODO: Handle broadcasts of non-constant sequences.
7785
7786 // Make sure that all of the users of a non-constant load are from the
7787 // BUILD_VECTOR node.
7788 // FIXME: Is the use count needed for non-constant, non-load case?
7789 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7790 return SDValue();
7791
7792 unsigned ScalarSize = Ld.getValueSizeInBits();
7793 bool IsGE256 = (VT.getSizeInBits() >= 256);
7794
7795 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7796 // instruction to save 8 or more bytes of constant pool data.
7797 // TODO: If multiple splats are generated to load the same constant,
7798 // it may be detrimental to overall size. There needs to be a way to detect
7799 // that condition to know if this is truly a size win.
7800 bool OptForSize = DAG.shouldOptForSize();
7801
7802 // Handle broadcasting a single constant scalar from the constant pool
7803 // into a vector.
7804 // On Sandybridge (no AVX2), it is still better to load a constant vector
7805 // from the constant pool and not to broadcast it from a scalar.
7806 // But override that restriction when optimizing for size.
7807 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7808 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7809 EVT CVT = Ld.getValueType();
7810 assert(!CVT.isVector() && "Must not broadcast a vector type");
7811
7812 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7813 // For size optimization, also splat v2f64 and v2i64, and for size opt
7814 // with AVX2, also splat i8 and i16.
7815 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7816 if (ScalarSize == 32 ||
7817 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7818 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7819 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7820 const Constant *C = nullptr;
7821 if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7822 C = CI->getConstantIntValue();
7823 else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7824 C = CF->getConstantFPValue();
7825
7826 assert(C && "Invalid constant type");
7827
7828 SDValue CP =
7829 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7830 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7831
7832 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7833 SDValue Ops[] = {DAG.getEntryNode(), CP};
7834 MachinePointerInfo MPI =
7835 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7836 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7837 MPI, Alignment, MachineMemOperand::MOLoad);
7838 }
7839 }
7840
7841 // Handle AVX2 in-register broadcasts.
7842 if (!IsLoad && Subtarget.hasInt256() &&
7843 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7844 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7845
7846 // The scalar source must be a normal load.
7847 if (!IsLoad)
7848 return SDValue();
7849
7850 // Make sure the non-chain result is only used by this build vector.
7851 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7852 return SDValue();
7853
7854 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7855 (Subtarget.hasVLX() && ScalarSize == 64)) {
7856 auto *LN = cast<LoadSDNode>(Ld);
7857 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7858 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7859 SDValue BCast =
7860 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7861 LN->getMemoryVT(), LN->getMemOperand());
7862 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7863 return BCast;
7864 }
7865
7866 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7867 // match double, since there is no vbroadcastsd xmm instruction.
7868 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7869 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7870 auto *LN = cast<LoadSDNode>(Ld);
7871 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7872 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7873 SDValue BCast =
7874 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7875 LN->getMemoryVT(), LN->getMemOperand());
7876 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7877 return BCast;
7878 }
7879
7880 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7881 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7882
7883 // Unsupported broadcast.
7884 return SDValue();
7885}
7886
7887/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7888/// underlying vector and index.
7889///
7890/// Modifies \p ExtractedFromVec to the real vector and returns the real
7891/// index.
7892static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7893 SDValue ExtIdx) {
7894 int Idx = ExtIdx->getAsZExtVal();
7895 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7896 return Idx;
7897
7898 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7899 // lowered this:
7900 // (extract_vector_elt (v8f32 %1), Constant<6>)
7901 // to:
7902 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7903 // (extract_subvector (v8f32 %0), Constant<4>),
7904 // undef)
7905 // Constant<0>)
7906 // In this case the vector is the extract_subvector expression and the index
7907 // is 2, as specified by the shuffle.
7908 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7909 SDValue ShuffleVec = SVOp->getOperand(0);
7910 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7911 assert(ShuffleVecVT.getVectorElementType() ==
7912 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7913
7914 int ShuffleIdx = SVOp->getMaskElt(Idx);
7915 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7916 ExtractedFromVec = ShuffleVec;
7917 return ShuffleIdx;
7918 }
7919 return Idx;
7920}
7921
7922static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7923 SelectionDAG &DAG) {
7924 MVT VT = Op.getSimpleValueType();
7925
7926 // Skip if insert_vec_elt is not supported.
7927 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7928 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7929 return SDValue();
7930
7931 unsigned NumElems = Op.getNumOperands();
7932 SDValue VecIn1;
7933 SDValue VecIn2;
7934 SmallVector<unsigned, 4> InsertIndices;
7935 SmallVector<int, 8> Mask(NumElems, -1);
7936
7937 for (unsigned i = 0; i != NumElems; ++i) {
7938 unsigned Opc = Op.getOperand(i).getOpcode();
7939
7940 if (Opc == ISD::UNDEF)
7941 continue;
7942
7943 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7944 // Quit if more than one element needs inserting.
7945 if (InsertIndices.size() > 1)
7946 return SDValue();
7947
7948 InsertIndices.push_back(i);
7949 continue;
7950 }
7951
7952 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7953 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7954
7955 // Quit if non-constant index.
7956 if (!isa<ConstantSDNode>(ExtIdx))
7957 return SDValue();
7958 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7959
7960 // Quit if extracted from vector of different type.
7961 if (ExtractedFromVec.getValueType() != VT)
7962 return SDValue();
7963
7964 if (!VecIn1.getNode())
7965 VecIn1 = ExtractedFromVec;
7966 else if (VecIn1 != ExtractedFromVec) {
7967 if (!VecIn2.getNode())
7968 VecIn2 = ExtractedFromVec;
7969 else if (VecIn2 != ExtractedFromVec)
7970 // Quit if more than 2 vectors to shuffle
7971 return SDValue();
7972 }
7973
7974 if (ExtractedFromVec == VecIn1)
7975 Mask[i] = Idx;
7976 else if (ExtractedFromVec == VecIn2)
7977 Mask[i] = Idx + NumElems;
7978 }
7979
7980 if (!VecIn1.getNode())
7981 return SDValue();
7982
7983 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7984 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7985
7986 for (unsigned Idx : InsertIndices)
7987 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7988 DAG.getVectorIdxConstant(Idx, DL));
7989
7990 return NV;
7991}
7992
7993// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7994static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7995 const X86Subtarget &Subtarget) {
7996 MVT VT = Op.getSimpleValueType();
7997 MVT IVT =
7998 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7999 SmallVector<SDValue, 32> NewOps;
8000 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8001 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8002 Op.getOperand(I)));
8003 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8004 return DAG.getBitcast(VT, Res);
8005}
8006
8007// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8008static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8009 SelectionDAG &DAG,
8010 const X86Subtarget &Subtarget) {
8011
8012 MVT VT = Op.getSimpleValueType();
8013 assert((VT.getVectorElementType() == MVT::i1) &&
8014 "Unexpected type in LowerBUILD_VECTORvXi1!");
8015 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8016 ISD::isBuildVectorAllOnes(Op.getNode()))
8017 return Op;
8018
8019 uint64_t Immediate = 0;
8020 SmallVector<unsigned, 16> NonConstIdx;
8021 bool IsSplat = true;
8022 bool HasConstElts = false;
8023 int SplatIdx = -1;
8024 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8025 SDValue In = Op.getOperand(idx);
8026 if (In.isUndef())
8027 continue;
8028 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8029 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8030 HasConstElts = true;
8031 } else {
8032 NonConstIdx.push_back(idx);
8033 }
8034 if (SplatIdx < 0)
8035 SplatIdx = idx;
8036 else if (In != Op.getOperand(SplatIdx))
8037 IsSplat = false;
8038 }
8039
8040 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
8041 if (IsSplat) {
8042 // The build_vector allows the scalar element to be larger than the vector
8043 // element type. We need to mask it to use as a condition unless we know
8044 // the upper bits are zero.
8045 // FIXME: Use computeKnownBits instead of checking specific opcode?
8046 SDValue Cond = Op.getOperand(SplatIdx);
8047 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8048 if (Cond.getOpcode() != ISD::SETCC)
8049 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8050 DAG.getConstant(1, dl, MVT::i8));
8051
8052 // Perform the select in the scalar domain so we can use cmov.
8053 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8054 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8055 DAG.getAllOnesConstant(dl, MVT::i32),
8056 DAG.getConstant(0, dl, MVT::i32));
8057 Select = DAG.getBitcast(MVT::v32i1, Select);
8058 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8059 } else {
8060 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8061 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8062 DAG.getAllOnesConstant(dl, ImmVT),
8063 DAG.getConstant(0, dl, ImmVT));
8064 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8065 Select = DAG.getBitcast(VecVT, Select);
8066 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8067 DAG.getVectorIdxConstant(0, dl));
8068 }
8069 }
8070
8071 // insert elements one by one
8072 SDValue DstVec;
8073 if (HasConstElts) {
8074 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8075 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8076 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8077 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8078 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8079 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8080 } else {
8081 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8082 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8083 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8084 DstVec = DAG.getBitcast(VecVT, Imm);
8085 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8086 DAG.getVectorIdxConstant(0, dl));
8087 }
8088 } else
8089 DstVec = DAG.getUNDEF(VT);
8090
8091 for (unsigned InsertIdx : NonConstIdx) {
8092 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8093 Op.getOperand(InsertIdx),
8094 DAG.getVectorIdxConstant(InsertIdx, dl));
8095 }
8096 return DstVec;
8097}
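// Example: an all-constant v8i1 build_vector <1,0,1,1,0,0,0,0> produces
// Immediate = 0b00001101, which is materialized as an i8 constant and bitcast
// to v8i1.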
8098
8099LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8100 switch (Opcode) {
8101 case X86ISD::PACKSS:
8102 case X86ISD::PACKUS:
8103 case X86ISD::FHADD:
8104 case X86ISD::FHSUB:
8105 case X86ISD::HADD:
8106 case X86ISD::HSUB:
8107 return true;
8108 }
8109 return false;
8110}
8111
8112/// This is a helper function of LowerToHorizontalOp().
8113/// This function checks that the build_vector \p N in input implements a
8114/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8115/// may not match the layout of an x86 256-bit horizontal instruction.
8116/// In other words, if this returns true, then some extraction/insertion will
8117/// be required to produce a valid horizontal instruction.
8118///
8119/// Parameter \p Opcode defines the kind of horizontal operation to match.
8120/// For example, if \p Opcode is equal to ISD::ADD, then this function
8121/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8122/// is equal to ISD::SUB, then this function checks if this is a horizontal
8123/// arithmetic sub.
8124///
8125/// This function only analyzes elements of \p N whose indices are
8126/// in range [BaseIdx, LastIdx).
8127///
8128/// TODO: This function was originally used to match both real and fake partial
8129/// horizontal operations, but the index-matching logic is incorrect for that.
8130/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8131/// code because it is only used for partial h-op matching now?
8132static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8133 const SDLoc &DL, SelectionDAG &DAG,
8134 unsigned BaseIdx, unsigned LastIdx,
8135 SDValue &V0, SDValue &V1) {
8136 EVT VT = N->getValueType(0);
8137 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8138 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8139 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8140 "Invalid Vector in input!");
8141
8142 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8143 bool CanFold = true;
8144 unsigned ExpectedVExtractIdx = BaseIdx;
8145 unsigned NumElts = LastIdx - BaseIdx;
8146 V0 = DAG.getUNDEF(VT);
8147 V1 = DAG.getUNDEF(VT);
8148
8149 // Check if N implements a horizontal binop.
8150 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8151 SDValue Op = N->getOperand(i + BaseIdx);
8152
8153 // Skip UNDEFs.
8154 if (Op->isUndef()) {
8155 // Update the expected vector extract index.
8156 if (i * 2 == NumElts)
8157 ExpectedVExtractIdx = BaseIdx;
8158 ExpectedVExtractIdx += 2;
8159 continue;
8160 }
8161
8162 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8163
8164 if (!CanFold)
8165 break;
8166
8167 SDValue Op0 = Op.getOperand(0);
8168 SDValue Op1 = Op.getOperand(1);
8169
8170 // Try to match the following pattern:
8171 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8172 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8173 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8174 Op0.getOperand(0) == Op1.getOperand(0) &&
8175 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8176 isa<ConstantSDNode>(Op1.getOperand(1)));
8177 if (!CanFold)
8178 break;
8179
8180 unsigned I0 = Op0.getConstantOperandVal(1);
8181 unsigned I1 = Op1.getConstantOperandVal(1);
8182
8183 if (i * 2 < NumElts) {
8184 if (V0.isUndef()) {
8185 V0 = Op0.getOperand(0);
8186 if (V0.getValueType() != VT)
8187 return false;
8188 }
8189 } else {
8190 if (V1.isUndef()) {
8191 V1 = Op0.getOperand(0);
8192 if (V1.getValueType() != VT)
8193 return false;
8194 }
8195 if (i * 2 == NumElts)
8196 ExpectedVExtractIdx = BaseIdx;
8197 }
8198
8199 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8200 if (I0 == ExpectedVExtractIdx)
8201 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8202 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8203 // Try to match the following dag sequence:
8204 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8205 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8206 } else
8207 CanFold = false;
8208
8209 ExpectedVExtractIdx += 2;
8210 }
8211
8212 return CanFold;
8213}
8214
8215/// Emit a sequence of two 128-bit horizontal add/sub followed by
8216/// a concat_vector.
8217///
8218/// This is a helper function of LowerToHorizontalOp().
8219/// This function expects two 256-bit vectors called V0 and V1.
8220/// At first, each vector is split into two separate 128-bit vectors.
8221/// Then, the resulting 128-bit vectors are used to implement two
8222/// horizontal binary operations.
8223///
8224/// The kind of horizontal binary operation is defined by \p X86Opcode.
8225///
8226/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8227/// the two new horizontal binop.
8228/// When Mode is set, the first horizontal binop dag node would take as input
8229/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8230/// horizontal binop dag node would take as input the lower 128-bit of V1
8231/// and the upper 128-bit of V1.
8232/// Example:
8233/// HADD V0_LO, V0_HI
8234/// HADD V1_LO, V1_HI
8235///
8236/// Otherwise, the first horizontal binop dag node takes as input the lower
8237/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8238/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8239/// Example:
8240/// HADD V0_LO, V1_LO
8241/// HADD V0_HI, V1_HI
8242///
8243/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8244/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8245/// the upper 128-bits of the result.
8246static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8247 const SDLoc &DL, SelectionDAG &DAG,
8248 unsigned X86Opcode, bool Mode,
8249 bool isUndefLO, bool isUndefHI) {
8250 MVT VT = V0.getSimpleValueType();
8251 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8252 "Invalid nodes in input!");
8253
8254 unsigned NumElts = VT.getVectorNumElements();
8255 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8256 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8257 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8258 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8259 MVT NewVT = V0_LO.getSimpleValueType();
8260
8261 SDValue LO = DAG.getUNDEF(NewVT);
8262 SDValue HI = DAG.getUNDEF(NewVT);
8263
8264 if (Mode) {
8265 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8266 if (!isUndefLO && !V0->isUndef())
8267 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8268 if (!isUndefHI && !V1->isUndef())
8269 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8270 } else {
8271 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8272 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8273 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8274
8275 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8276 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8277 }
8278
8279 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8280}
8281
8282/// Returns true iff \p BV builds a vector with the result equivalent to
8283/// the result of ADDSUB/SUBADD operation.
8284/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8285/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8286/// \p Opnd0 and \p Opnd1.
8287static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8288 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8289 SDValue &Opnd0, SDValue &Opnd1,
8290 unsigned &NumExtracts, bool &IsSubAdd,
8291 bool &HasAllowContract) {
8292 using namespace SDPatternMatch;
8293
8294 MVT VT = BV->getSimpleValueType(0);
8295 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8296 return false;
8297
8298 unsigned NumElts = VT.getVectorNumElements();
8299 SDValue InVec0 = DAG.getUNDEF(VT);
8300 SDValue InVec1 = DAG.getUNDEF(VT);
8301
8302 NumExtracts = 0;
8303 HasAllowContract = NumElts != 0;
8304
8305 // Odd-numbered elements in the input build vector are obtained from
8306 // adding/subtracting two integer/float elements.
8307 // Even-numbered elements in the input build vector are obtained from
8308 // subtracting/adding two integer/float elements.
8309 unsigned Opc[2] = {0, 0};
8310 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8311 SDValue Op = BV->getOperand(i);
8312
8313 // Skip 'undef' values.
8314 unsigned Opcode = Op.getOpcode();
8315 if (Opcode == ISD::UNDEF)
8316 continue;
8317
8318 // Early exit if we found an unexpected opcode.
8319 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8320 return false;
8321
8322 SDValue Op0 = Op.getOperand(0);
8323 SDValue Op1 = Op.getOperand(1);
8324
8325 // Try to match the following pattern:
8326 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8327 // Early exit if we cannot match that sequence.
8328 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8329 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8330 return false;
8331
8332 // We found a valid add/sub node; make sure it's the same opcode as previous
8333 // elements for this parity.
8334 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8335 return false;
8336 Opc[i % 2] = Opcode;
8337
8338 // Update InVec0 and InVec1.
8339 if (InVec0.isUndef())
8340 InVec0 = Op0.getOperand(0);
8341 if (InVec1.isUndef())
8342 InVec1 = Op1.getOperand(0);
8343
8344 // Make sure that the operands of each add/sub node always
8345 // come from the same pair of vectors.
8346 if (InVec0 != Op0.getOperand(0)) {
8347 if (Opcode == ISD::FSUB)
8348 return false;
8349
8350 // FADD is commutable. Try to commute the operands
8351 // and then test again.
8352 std::swap(Op0, Op1);
8353 if (InVec0 != Op0.getOperand(0))
8354 return false;
8355 }
8356
8357 if (InVec1 != Op1.getOperand(0))
8358 return false;
8359
8360 // Increment the number of extractions done.
8361 ++NumExtracts;
8362 HasAllowContract &= Op->getFlags().hasAllowContract();
8363 }
8364
8365 // Ensure we have found an opcode for both parities and that they are
8366 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8367 // inputs are undef.
8368 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8369 InVec0.isUndef() || InVec1.isUndef())
8370 return false;
8371
8372 IsSubAdd = Opc[0] == ISD::FADD;
8373
8374 Opnd0 = InVec0;
8375 Opnd1 = InVec1;
8376 return true;
8377}
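// Example: a v4f32 build_vector whose element i is
// (fsub (extract_elt A, i), (extract_elt B, i)) for even i and
// (fadd (extract_elt A, i), (extract_elt B, i)) for odd i is recognized with
// Opnd0 = A, Opnd1 = B and IsSubAdd = false (an ADDSUB candidate); with the
// parities swapped, IsSubAdd is true.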
8378
8379/// Returns true if it is possible to fold MUL and an idiom that has already been
8380/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8381/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8382/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8383///
8384/// Prior to calling this function it should be known that there is some
8385/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8386/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8387/// before replacement of such SDNode with ADDSUB operation. Thus the number
8388/// of \p Opnd0 uses is expected to be equal to 2.
8389/// For example, this function may be called for the following IR:
8390/// %AB = fmul fast <2 x double> %A, %B
8391/// %Sub = fsub fast <2 x double> %AB, %C
8392/// %Add = fadd fast <2 x double> %AB, %C
8393/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8394/// <2 x i32> <i32 0, i32 3>
8395/// There is a def for %Addsub here, which potentially can be replaced by
8396/// X86ISD::ADDSUB operation:
8397/// %Addsub = X86ISD::ADDSUB %AB, %C
8398/// and such ADDSUB can further be replaced with FMADDSUB:
8399/// %Addsub = FMADDSUB %A, %B, %C.
8400///
8401/// The main reason why this method is called before the replacement of the
8402/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8403/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8404/// FMADDSUB is.
8405static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8406 SelectionDAG &DAG, SDValue &Opnd0,
8407 SDValue &Opnd1, SDValue &Opnd2,
8408 unsigned ExpectedUses,
8409 bool AllowSubAddOrAddSubContract) {
8410 if (Opnd0.getOpcode() != ISD::FMUL ||
8411 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8412 return false;
8413
8414 // FIXME: These checks must match the similar ones in
8415 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8416 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8417 // or MUL + ADDSUB to FMADDSUB.
8418 const TargetOptions &Options = DAG.getTarget().Options;
8419 bool AllowFusion =
8420 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8421 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8422 if (!AllowFusion)
8423 return false;
8424
8425 Opnd2 = Opnd1;
8426 Opnd1 = Opnd0.getOperand(1);
8427 Opnd0 = Opnd0.getOperand(0);
8428
8429 return true;
8430}
8431
8432/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8433/// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8434/// X86ISD::FMSUBADD node.
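///
/// As a concrete illustration (the values %A and %B are placeholders), given a
/// v4f32 build_vector whose even elements are (fsub (extractelt %A, i),
/// (extractelt %B, i)) and whose odd elements are the corresponding fadd
/// results, the subtract/add pattern alternates across lanes and can be
/// reduced to (X86ISD::ADDSUB %A, %B), matching the even-subtract/odd-add
/// behaviour of ADDSUBPS.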
8435static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8436                                       const SDLoc &DL,
8437 const X86Subtarget &Subtarget,
8438 SelectionDAG &DAG) {
8439 SDValue Opnd0, Opnd1;
8440 unsigned NumExtracts;
8441 bool IsSubAdd;
8442 bool HasAllowContract;
8443 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8444 HasAllowContract))
8445 return SDValue();
8446
8447 MVT VT = BV->getSimpleValueType(0);
8448
8449 // Try to generate X86ISD::FMADDSUB node here.
8450 SDValue Opnd2;
8451 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8452 HasAllowContract)) {
8453 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8454 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8455 }
8456
8457 // We only support ADDSUB.
8458 if (IsSubAdd)
8459 return SDValue();
8460
8461 // There are no known X86 targets with 512-bit ADDSUB instructions!
8462 // Convert to blend(fsub,fadd).
8463 if (VT.is512BitVector()) {
8464 SmallVector<int> Mask;
8465 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8466 Mask.push_back(I);
8467 Mask.push_back(I + E + 1);
8468 }
8469 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8470 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8471 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8472 }
8473
8474 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8475}
8476
8477static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8478                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8479 // Initialize outputs to known values.
8480 MVT VT = BV->getSimpleValueType(0);
8481 HOpcode = ISD::DELETED_NODE;
8482 V0 = DAG.getUNDEF(VT);
8483 V1 = DAG.getUNDEF(VT);
8484
8485 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8486 // half of the result is calculated independently from the 128-bit halves of
8487 // the inputs, so that makes the index-checking logic below more complicated.
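  // For example (illustrative), (X86ISD::HADD v8i32 A, B) produces
  //   { A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7 },
  // i.e. within each 128-bit half of the result the low 64 bits come from A
  // and the high 64 bits from B, which is what the j < NumEltsIn64Bits split
  // below models.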
8488 unsigned NumElts = VT.getVectorNumElements();
8489 unsigned GenericOpcode = ISD::DELETED_NODE;
8490 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8491 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8492 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8493 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8494 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8495 // Ignore undef elements.
8496 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8497 if (Op.isUndef())
8498 continue;
8499
8500 // If there's an opcode mismatch, we're done.
8501 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8502 return false;
8503
8504 // Initialize horizontal opcode.
8505 if (HOpcode == ISD::DELETED_NODE) {
8506 GenericOpcode = Op.getOpcode();
8507 switch (GenericOpcode) {
8508 // clang-format off
8509 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8510 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8511 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8512 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8513 default: return false;
8514 // clang-format on
8515 }
8516 }
8517
8518 SDValue Op0 = Op.getOperand(0);
8519 SDValue Op1 = Op.getOperand(1);
8520 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8521          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8522          Op0.getOperand(0) != Op1.getOperand(0) ||
8523          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8524          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8525 return false;
8526
8527 // The source vector is chosen based on which 64-bit half of the
8528 // destination vector is being calculated.
8529 if (j < NumEltsIn64Bits) {
8530 if (V0.isUndef())
8531 V0 = Op0.getOperand(0);
8532 } else {
8533 if (V1.isUndef())
8534 V1 = Op0.getOperand(0);
8535 }
8536
8537 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8538 if (SourceVec != Op0.getOperand(0))
8539 return false;
8540
8541 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8542 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8543 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8544 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8545 (j % NumEltsIn64Bits) * 2;
8546 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8547 continue;
8548
8549 // If this is not a commutative op, this does not match.
8550 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8551 return false;
8552
8553 // Addition is commutative, so try swapping the extract indexes.
8554 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8555 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8556 continue;
8557
8558 // Extract indexes do not match horizontal requirement.
8559 return false;
8560 }
8561 }
8562 // We matched. Opcode and operands are returned by reference as arguments.
8563 return true;
8564}
8565
8566static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8567                                    const SDLoc &DL, SelectionDAG &DAG,
8568 unsigned HOpcode, SDValue V0, SDValue V1) {
8569 // If either input vector is not the same size as the build vector,
8570 // extract/insert the low bits to the correct size.
8571 // This is free (examples: zmm --> xmm, xmm --> ymm).
8572 MVT VT = BV->getSimpleValueType(0);
8573 unsigned Width = VT.getSizeInBits();
8574 if (V0.getValueSizeInBits() > Width)
8575 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8576 else if (V0.getValueSizeInBits() < Width)
8577 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8578
8579 if (V1.getValueSizeInBits() > Width)
8580 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8581 else if (V1.getValueSizeInBits() < Width)
8582 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8583
8584 unsigned NumElts = VT.getVectorNumElements();
8585 APInt DemandedElts = APInt::getAllOnes(NumElts);
8586 for (unsigned i = 0; i != NumElts; ++i)
8587 if (BV->getOperand(i).isUndef())
8588 DemandedElts.clearBit(i);
8589
8590 // If we don't need the upper xmm, then perform as a xmm hop.
8591 unsigned HalfNumElts = NumElts / 2;
8592 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8593 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8594 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8595 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8596 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8597 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8598 }
8599
8600 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8601}
8602
8603/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8604static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8605                                   const X86Subtarget &Subtarget,
8606 SelectionDAG &DAG) {
8607 // We need at least 2 non-undef elements to make this worthwhile by default.
8608 unsigned NumNonUndefs =
8609 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8610 if (NumNonUndefs < 2)
8611 return SDValue();
8612
8613 // There are 4 sets of horizontal math operations distinguished by type:
8614 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8615 // subtarget feature. Try to match those "native" patterns first.
8616 MVT VT = BV->getSimpleValueType(0);
8617 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8618 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8619 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8620 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8621 unsigned HOpcode;
8622 SDValue V0, V1;
8623 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8624 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8625 }
8626
8627 // Try harder to match 256-bit ops by using extract/concat.
8628 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8629 return SDValue();
8630
8631  // Count the number of UNDEF operands in the input build_vector.
8632 unsigned NumElts = VT.getVectorNumElements();
8633 unsigned Half = NumElts / 2;
8634 unsigned NumUndefsLO = 0;
8635 unsigned NumUndefsHI = 0;
8636 for (unsigned i = 0, e = Half; i != e; ++i)
8637 if (BV->getOperand(i)->isUndef())
8638 NumUndefsLO++;
8639
8640 for (unsigned i = Half, e = NumElts; i != e; ++i)
8641 if (BV->getOperand(i)->isUndef())
8642 NumUndefsHI++;
8643
8644 SDValue InVec0, InVec1;
8645 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8646 SDValue InVec2, InVec3;
8647 unsigned X86Opcode;
8648 bool CanFold = true;
8649
8650 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8651 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8652 InVec3) &&
8653 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8654 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8655 X86Opcode = X86ISD::HADD;
8656 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8657 InVec1) &&
8658 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8659 InVec3) &&
8660 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8661 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8662 X86Opcode = X86ISD::HSUB;
8663 else
8664 CanFold = false;
8665
8666 if (CanFold) {
8667 // Do not try to expand this build_vector into a pair of horizontal
8668 // add/sub if we can emit a pair of scalar add/sub.
8669 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8670 return SDValue();
8671
8672 // Convert this build_vector into a pair of horizontal binops followed by
8673 // a concat vector. We must adjust the outputs from the partial horizontal
8674 // matching calls above to account for undefined vector halves.
8675 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8676 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8677 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8678 bool isUndefLO = NumUndefsLO == Half;
8679 bool isUndefHI = NumUndefsHI == Half;
8680 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8681 isUndefHI);
8682 }
8683 }
8684
8685 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8686 VT == MVT::v16i16) {
8687 unsigned X86Opcode;
8688 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8689 InVec1))
8690 X86Opcode = X86ISD::HADD;
8691 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8692 InVec1))
8693 X86Opcode = X86ISD::HSUB;
8694 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8695 InVec1))
8696 X86Opcode = X86ISD::FHADD;
8697 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8698 InVec1))
8699 X86Opcode = X86ISD::FHSUB;
8700 else
8701 return SDValue();
8702
8703 // Don't try to expand this build_vector into a pair of horizontal add/sub
8704 // if we can simply emit a pair of scalar add/sub.
8705 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8706 return SDValue();
8707
8708 // Convert this build_vector into two horizontal add/sub followed by
8709 // a concat vector.
8710 bool isUndefLO = NumUndefsLO == Half;
8711 bool isUndefHI = NumUndefsHI == Half;
8712 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8713 isUndefLO, isUndefHI);
8714 }
8715
8716 return SDValue();
8717}
8718
8719static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8720 SelectionDAG &DAG);
8721
8722/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8723/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8724/// just apply the bit to the vectors.
8725/// NOTE: It's not in our interest to start making a general purpose vectorizer
8726/// from this, but enough scalar bit operations are created by the later
8727/// legalization + scalarization stages to need basic support.
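///
/// For example (with illustrative scalars %a..%d), (build_vector (shl %a, 4),
/// (shl %b, 4), (shl %c, 4), (shl %d, 4)) becomes
/// (shl (build_vector %a, %b, %c, %d), (build_vector 4, 4, 4, 4)), which
/// LowerShift can then select as a single vector shift by immediate.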
8728static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8729                                       const X86Subtarget &Subtarget,
8730 SelectionDAG &DAG) {
8731 MVT VT = Op->getSimpleValueType(0);
8732 unsigned NumElems = VT.getVectorNumElements();
8733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8734
8735 // Check that all elements have the same opcode.
8736 // TODO: Should we allow UNDEFS and if so how many?
8737 unsigned Opcode = Op->getOperand(0).getOpcode();
8738 for (unsigned i = 1; i < NumElems; ++i)
8739 if (Opcode != Op->getOperand(i).getOpcode())
8740 return SDValue();
8741
8742 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8743 bool IsShift = false;
8744 switch (Opcode) {
8745 default:
8746 return SDValue();
8747 case ISD::SHL:
8748 case ISD::SRL:
8749 case ISD::SRA:
8750 IsShift = true;
8751 break;
8752 case ISD::AND:
8753 case ISD::XOR:
8754 case ISD::OR:
8755 // Don't do this if the buildvector is a splat - we'd replace one
8756 // constant with an entire vector.
8757 if (Op->getSplatValue())
8758 return SDValue();
8759 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8760 return SDValue();
8761 break;
8762 }
8763
8764 SmallVector<SDValue, 4> LHSElts, RHSElts;
8765 for (SDValue Elt : Op->ops()) {
8766 SDValue LHS = Elt.getOperand(0);
8767 SDValue RHS = Elt.getOperand(1);
8768
8769 // We expect the canonicalized RHS operand to be the constant.
8770    if (!isa<ConstantSDNode>(RHS) && !isa<ConstantFPSDNode>(RHS))
8771      return SDValue();
8772
8773 // Extend shift amounts.
8774 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8775 if (!IsShift)
8776 return SDValue();
8777 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8778 }
8779
8780 LHSElts.push_back(LHS);
8781 RHSElts.push_back(RHS);
8782 }
8783
8784 // Limit to shifts by uniform immediates.
8785 // TODO: Only accept vXi8/vXi64 special cases?
8786 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8787 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8788 return SDValue();
8789
8790 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8791 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8792 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8793
8794 if (!IsShift)
8795 return Res;
8796
8797 // Immediately lower the shift to ensure the constant build vector doesn't
8798 // get converted to a constant pool before the shift is lowered.
8799 return LowerShift(Res, Subtarget, DAG);
8800}
8801
8802static bool isShuffleFoldableLoad(SDValue);
8803
8804/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8805/// representing a blend.
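///
/// For example (with non-constant scalars %a and %b, and assuming AVX2 or
/// shuffle-foldable loads), a v4f64 (build_vector %a, %b, %b, %a) has exactly
/// two unique operands and becomes a shuffle of (splat %a) and (splat %b)
/// with mask <0, 5, 6, 3>.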
8806static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8807                                       X86Subtarget const &Subtarget,
8808 SelectionDAG &DAG) {
8809 MVT VT = BVOp->getSimpleValueType(0u);
8810
8811 if (VT != MVT::v4f64)
8812 return SDValue();
8813
8814 // Collect unique operands.
8815 auto UniqueOps = SmallSet<SDValue, 16u>();
8816 for (SDValue Op : BVOp->ops()) {
8817 if (isIntOrFPConstant(Op) || Op.isUndef())
8818 return SDValue();
8819 UniqueOps.insert(Op);
8820 }
8821
8822 // Candidate BUILD_VECTOR must have 2 unique operands.
8823 if (UniqueOps.size() != 2u)
8824 return SDValue();
8825
8826 SDValue Op0 = BVOp->getOperand(0u);
8827 UniqueOps.erase(Op0);
8828 SDValue Op1 = *UniqueOps.begin();
8829
8830 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8831 isShuffleFoldableLoad(Op1)) {
8832 // Create shuffle mask.
8833 auto const NumElems = VT.getVectorNumElements();
8834 SmallVector<int, 16u> Mask(NumElems);
8835 for (auto I = 0u; I < NumElems; ++I) {
8836 SDValue Op = BVOp->getOperand(I);
8837 Mask[I] = Op == Op0 ? I : I + NumElems;
8838 }
8839 // Create shuffle of splats.
8840 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8841 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8842 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8843 }
8844
8845 return SDValue();
8846}
8847
8848/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8849/// functionality to do this, so it's all zeros, all ones, or some derivation
8850/// that is cheap to calculate.
8851static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8852                                         SelectionDAG &DAG,
8853 const X86Subtarget &Subtarget) {
8854 MVT VT = Op.getSimpleValueType();
8855
8856 // Vectors containing all zeros can be matched by pxor and xorps.
8857 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8858 return Op;
8859
8860 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8861 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8862 // vpcmpeqd on 256-bit vectors.
8863 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8864 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8865 return Op;
8866
8867 return getOnesVector(VT, DAG, DL);
8868 }
8869
8870 return SDValue();
8871}
8872
8873/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8874/// from a vector of source values and a vector of extraction indices.
8875/// The vectors might be manipulated to match the type of the permute op.
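/// For example, a v16i8 permute of SrcVec by IndicesVec maps directly to
/// (X86ISD::PSHUFB SrcVec, IndicesVec) on SSSE3 targets, while wider element
/// types first scale and offset the indices (see the ScaleIndices helper in
/// the implementation) so the permute can run at a narrower granularity.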
8876static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8877 const SDLoc &DL, SelectionDAG &DAG,
8878 const X86Subtarget &Subtarget) {
8879 MVT ShuffleVT = VT;
8880 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8881 unsigned NumElts = VT.getVectorNumElements();
8882 unsigned SizeInBits = VT.getSizeInBits();
8883
8884 // Adjust IndicesVec to match VT size.
8885 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8886 "Illegal variable permute mask size");
8887 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8888 // Narrow/widen the indices vector to the correct size.
8889 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8890 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8891 NumElts * VT.getScalarSizeInBits());
8892 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8893 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8894 SDLoc(IndicesVec), SizeInBits);
8895 // Zero-extend the index elements within the vector.
8896 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8897 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8898 IndicesVT, IndicesVec);
8899 }
8900 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8901
8902 // Handle SrcVec that don't match VT type.
8903 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8904 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8905 // Handle larger SrcVec by treating it as a larger permute.
8906 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8907 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8908 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8909 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8910 Subtarget, DAG, SDLoc(IndicesVec));
8911 SDValue NewSrcVec =
8912 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8913 if (NewSrcVec)
8914 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8915 return SDValue();
8916 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8917 // Widen smaller SrcVec to match VT.
8918 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8919 } else
8920 return SDValue();
8921 }
8922
8923 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8924 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8925 EVT SrcVT = Idx.getValueType();
8926 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8927 uint64_t IndexScale = 0;
8928 uint64_t IndexOffset = 0;
8929
8930 // If we're scaling a smaller permute op, then we need to repeat the
8931 // indices, scaling and offsetting them as well.
8932 // e.g. v4i32 -> v16i8 (Scale = 4)
8933 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8934 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8935 for (uint64_t i = 0; i != Scale; ++i) {
8936 IndexScale |= Scale << (i * NumDstBits);
8937 IndexOffset |= i << (i * NumDstBits);
8938 }
8939
8940 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8941 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8942 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8943 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8944 return Idx;
8945 };
8946
8947 unsigned Opcode = 0;
8948 switch (VT.SimpleTy) {
8949 default:
8950 break;
8951 case MVT::v16i8:
8952 if (Subtarget.hasSSSE3())
8953 Opcode = X86ISD::PSHUFB;
8954 break;
8955 case MVT::v8i16:
8956 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8957 Opcode = X86ISD::VPERMV;
8958 else if (Subtarget.hasSSSE3()) {
8959 Opcode = X86ISD::PSHUFB;
8960 ShuffleVT = MVT::v16i8;
8961 }
8962 break;
8963 case MVT::v4f32:
8964 case MVT::v4i32:
8965 if (Subtarget.hasAVX()) {
8966 Opcode = X86ISD::VPERMILPV;
8967 ShuffleVT = MVT::v4f32;
8968 } else if (Subtarget.hasSSSE3()) {
8969 Opcode = X86ISD::PSHUFB;
8970 ShuffleVT = MVT::v16i8;
8971 }
8972 break;
8973 case MVT::v2f64:
8974 case MVT::v2i64:
8975 if (Subtarget.hasAVX()) {
8976 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8977 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8978 Opcode = X86ISD::VPERMILPV;
8979 ShuffleVT = MVT::v2f64;
8980 } else if (Subtarget.hasSSE41()) {
8981 // SSE41 can compare v2i64 - select between indices 0 and 1.
8982 return DAG.getSelectCC(
8983 DL, IndicesVec,
8984 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8985 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8986 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8987          ISD::SETEQ);
8988    }
8989 break;
8990 case MVT::v32i8:
8991 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8992 Opcode = X86ISD::VPERMV;
8993 else if (Subtarget.hasXOP()) {
8994 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8995 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8996 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8997 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8998 return DAG.getNode(
8999          ISD::CONCAT_VECTORS, DL, VT,
9000          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9001 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9002 } else if (Subtarget.hasAVX()) {
9003 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9004 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9005 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9006 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9007 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9008                              ArrayRef<SDValue> Ops) {
9009        // Permute Lo and Hi and then select based on index range.
9010        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9011        // care about bit[7] as it's just an index vector.
9012 SDValue Idx = Ops[2];
9013 EVT VT = Idx.getValueType();
9014 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9015 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9016 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9017                               ISD::SETGT);
9018      };
9019 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9020 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9021 PSHUFBBuilder);
9022 }
9023 break;
9024 case MVT::v16i16:
9025 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9026 Opcode = X86ISD::VPERMV;
9027 else if (Subtarget.hasAVX()) {
9028 // Scale to v32i8 and perform as v32i8.
9029 IndicesVec = ScaleIndices(IndicesVec, 2);
9030 return DAG.getBitcast(
9031          VT, createVariablePermute(
9032                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9033 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9034 }
9035 break;
9036 case MVT::v8f32:
9037 case MVT::v8i32:
9038 if (Subtarget.hasAVX2())
9039 Opcode = X86ISD::VPERMV;
9040 else if (Subtarget.hasAVX()) {
9041 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9042 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9043 {0, 1, 2, 3, 0, 1, 2, 3});
9044 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9045 {4, 5, 6, 7, 4, 5, 6, 7});
9046 if (Subtarget.hasXOP())
9047 return DAG.getBitcast(
9048 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9049 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9050 // Permute Lo and Hi and then select based on index range.
9051 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9052 SDValue Res = DAG.getSelectCC(
9053 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9054 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9055 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9056          ISD::SETGT);
9057      return DAG.getBitcast(VT, Res);
9058 }
9059 break;
9060 case MVT::v4i64:
9061 case MVT::v4f64:
9062 if (Subtarget.hasAVX512()) {
9063 if (!Subtarget.hasVLX()) {
9064 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9065 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9066 SDLoc(SrcVec));
9067 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9068 DAG, SDLoc(IndicesVec));
9069 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9070 DAG, Subtarget);
9071 return extract256BitVector(Res, 0, DAG, DL);
9072 }
9073 Opcode = X86ISD::VPERMV;
9074 } else if (Subtarget.hasAVX()) {
9075 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9076 SDValue LoLo =
9077 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9078 SDValue HiHi =
9079 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9080 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9081 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9082 if (Subtarget.hasXOP())
9083 return DAG.getBitcast(
9084 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9085 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9086 // Permute Lo and Hi and then select based on index range.
9087 // This works as VPERMILPD only uses index bit[1] to permute elements.
9088 SDValue Res = DAG.getSelectCC(
9089 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9090 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9091 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9092          ISD::SETGT);
9093      return DAG.getBitcast(VT, Res);
9094 }
9095 break;
9096 case MVT::v64i8:
9097 if (Subtarget.hasVBMI())
9098 Opcode = X86ISD::VPERMV;
9099 break;
9100 case MVT::v32i16:
9101 if (Subtarget.hasBWI())
9102 Opcode = X86ISD::VPERMV;
9103 break;
9104 case MVT::v16f32:
9105 case MVT::v16i32:
9106 case MVT::v8f64:
9107 case MVT::v8i64:
9108 if (Subtarget.hasAVX512())
9109 Opcode = X86ISD::VPERMV;
9110 break;
9111 }
9112 if (!Opcode)
9113 return SDValue();
9114
9115 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9116 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9117 "Illegal variable permute shuffle type");
9118
9119 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9120 if (Scale > 1)
9121 IndicesVec = ScaleIndices(IndicesVec, Scale);
9122
9123 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9124 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9125
9126 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9127 SDValue Res = Opcode == X86ISD::VPERMV
9128 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9129 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9130 return DAG.getBitcast(VT, Res);
9131}
9132
9133// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9134// reasoned to be a permutation of a vector by indices in a non-constant vector.
9135// (build_vector (extract_elt V, (extract_elt I, 0)),
9136// (extract_elt V, (extract_elt I, 1)),
9137// ...
9138// ->
9139// (vpermv I, V)
9140//
9141// TODO: Handle undefs
9142// TODO: Utilize pshufb and zero mask blending to support more efficient
9143// construction of vectors with constant-0 elements.
9144static SDValue
9145LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9146                                   SelectionDAG &DAG,
9147 const X86Subtarget &Subtarget) {
9148 SDValue SrcVec, IndicesVec;
9149
9150 auto PeekThroughFreeze = [](SDValue N) {
9151 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9152 return N->getOperand(0);
9153 return N;
9154 };
9155 // Check for a match of the permute source vector and permute index elements.
9156 // This is done by checking that the i-th build_vector operand is of the form:
9157 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9158 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9159 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9160 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9161 return SDValue();
9162
9163 // If this is the first extract encountered in V, set the source vector,
9164 // otherwise verify the extract is from the previously defined source
9165 // vector.
9166 if (!SrcVec)
9167 SrcVec = Op.getOperand(0);
9168 else if (SrcVec != Op.getOperand(0))
9169 return SDValue();
9170 SDValue ExtractedIndex = Op->getOperand(1);
9171 // Peek through extends.
9172 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9173 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9174 ExtractedIndex = ExtractedIndex.getOperand(0);
9175 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9176 return SDValue();
9177
9178 // If this is the first extract from the index vector candidate, set the
9179 // indices vector, otherwise verify the extract is from the previously
9180 // defined indices vector.
9181 if (!IndicesVec)
9182 IndicesVec = ExtractedIndex.getOperand(0);
9183 else if (IndicesVec != ExtractedIndex.getOperand(0))
9184 return SDValue();
9185
9186 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9187 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9188 return SDValue();
9189 }
9190
9191 MVT VT = V.getSimpleValueType();
9192 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9193}
9194
9195SDValue
9196X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9197 SDLoc dl(Op);
9198
9199 MVT VT = Op.getSimpleValueType();
9200 MVT EltVT = VT.getVectorElementType();
9201 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9202 unsigned NumElems = Op.getNumOperands();
9203
9204 // Generate vectors for predicate vectors.
9205 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9206 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9207
9208 if (VT.getVectorElementType() == MVT::bf16 &&
9209 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9210 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9211
9212 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9213 return VectorCst;
9214
9215 unsigned EVTBits = EltVT.getSizeInBits();
9216 APInt UndefMask = APInt::getZero(NumElems);
9217 APInt FrozenUndefMask = APInt::getZero(NumElems);
9218 APInt ZeroMask = APInt::getZero(NumElems);
9219 APInt NonZeroMask = APInt::getZero(NumElems);
9220 bool IsAllConstants = true;
9221 bool OneUseFrozenUndefs = true;
9222 SmallSet<SDValue, 8> Values;
9223 unsigned NumConstants = NumElems;
9224 for (unsigned i = 0; i < NumElems; ++i) {
9225 SDValue Elt = Op.getOperand(i);
9226 if (Elt.isUndef()) {
9227 UndefMask.setBit(i);
9228 continue;
9229 }
9230 if (ISD::isFreezeUndef(Elt.getNode())) {
9231 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9232 FrozenUndefMask.setBit(i);
9233 continue;
9234 }
9235 Values.insert(Elt);
9236 if (!isIntOrFPConstant(Elt)) {
9237 IsAllConstants = false;
9238 NumConstants--;
9239 }
9240 if (X86::isZeroNode(Elt)) {
9241 ZeroMask.setBit(i);
9242 } else {
9243 NonZeroMask.setBit(i);
9244 }
9245 }
9246
9247 // All undef vector. Return an UNDEF.
9248 if (UndefMask.isAllOnes())
9249 return DAG.getUNDEF(VT);
9250
9251 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9252 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9253 return DAG.getFreeze(DAG.getUNDEF(VT));
9254
9255 // All undef/freeze(undef)/zero vector. Return a zero vector.
9256 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9257 return getZeroVector(VT, Subtarget, DAG, dl);
9258
9259 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9260 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9261 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9262 // and blend the FREEZE-UNDEF operands back in.
9263 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9264 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9265 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9266 SmallVector<int, 16> BlendMask(NumElems, -1);
9267 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9268 for (unsigned i = 0; i < NumElems; ++i) {
9269 if (UndefMask[i]) {
9270 BlendMask[i] = -1;
9271 continue;
9272 }
9273 BlendMask[i] = i;
9274 if (!FrozenUndefMask[i])
9275 Elts[i] = Op.getOperand(i);
9276 else
9277 BlendMask[i] += NumElems;
9278 }
9279 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9280 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9281 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9282 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9283 }
9284
9285 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9286
9287 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9288 // be better off lowering to a smaller build vector and padding with
9289 // undef/zero.
9290 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9291      !isFoldableUseOfShuffle(BV)) {
9292    unsigned UpperElems = NumElems / 2;
9293 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9294 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9295 if (NumUpperUndefsOrZeros >= UpperElems) {
9296 if (VT.is512BitVector() &&
9297 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9298 UpperElems = NumElems - (NumElems / 4);
9299 // If freeze(undef) is in any upper elements, force to zero.
9300 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9301 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9302 SDValue NewBV =
9303 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9304 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9305 }
9306 }
9307
9308 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9309 return AddSub;
9310 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9311 return HorizontalOp;
9312 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9313 return Broadcast;
9314 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9315 return BitOp;
9316 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9317 return Blend;
9318
9319 unsigned NumZero = ZeroMask.popcount();
9320 unsigned NumNonZero = NonZeroMask.popcount();
9321
9322 // If we are inserting one variable into a vector of non-zero constants, try
9323 // to avoid loading each constant element as a scalar. Load the constants as a
9324 // vector and then insert the variable scalar element. If insertion is not
9325 // supported, fall back to a shuffle to get the scalar blended with the
9326 // constants. Insertion into a zero vector is handled as a special-case
9327 // somewhere below here.
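  // For example (with a non-constant %x), <4 x i32> <1, 2, %x, 4> becomes a
  // constant-pool load of <1, 2, undef, 4> followed by an insertelement of %x
  // at index 2 (or a shuffle with scalar_to_vector %x when the insertion index
  // lies above the low 128 bits).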
9328 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9329 FrozenUndefMask.isZero() &&
9330      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9331       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9332    // Create an all-constant vector. The variable element in the old
9333 // build vector is replaced by undef in the constant vector. Save the
9334 // variable scalar element and its index for use in the insertelement.
9335 LLVMContext &Context = *DAG.getContext();
9336 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9337 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9338 SDValue VarElt;
9339 SDValue InsIndex;
9340 for (unsigned i = 0; i != NumElems; ++i) {
9341 SDValue Elt = Op.getOperand(i);
9342 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9343 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9344 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9345 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9346 else if (!Elt.isUndef()) {
9347 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9348 "Expected one variable element in this vector");
9349 VarElt = Elt;
9350 InsIndex = DAG.getVectorIdxConstant(i, dl);
9351 }
9352 }
9353 Constant *CV = ConstantVector::get(ConstVecOps);
9354 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9355
9356 // The constants we just created may not be legal (eg, floating point). We
9357 // must lower the vector right here because we can not guarantee that we'll
9358 // legalize it before loading it. This is also why we could not just create
9359 // a new build vector here. If the build vector contains illegal constants,
9360 // it could get split back up into a series of insert elements.
9361 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9362 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9363 MachineFunction &MF = DAG.getMachineFunction();
9364 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9365 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9366 unsigned InsertC = InsIndex->getAsZExtVal();
9367 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9368 if (InsertC < NumEltsInLow128Bits)
9369 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9370
9371 // There's no good way to insert into the high elements of a >128-bit
9372 // vector, so use shuffles to avoid an extract/insert sequence.
9373 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9374 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9375 SmallVector<int, 8> ShuffleMask;
9376 unsigned NumElts = VT.getVectorNumElements();
9377 for (unsigned i = 0; i != NumElts; ++i)
9378 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9379 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9380 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9381 }
9382
9383 // Special case for single non-zero, non-undef, element.
9384 if (NumNonZero == 1) {
9385 unsigned Idx = NonZeroMask.countr_zero();
9386 SDValue Item = Op.getOperand(Idx);
9387
9388 // If we have a constant or non-constant insertion into the low element of
9389 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9390 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9391 // depending on what the source datatype is.
9392 if (Idx == 0) {
9393 if (NumZero == 0)
9394 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9395
9396 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9397 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9398 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9399 assert((VT.is128BitVector() || VT.is256BitVector() ||
9400 VT.is512BitVector()) &&
9401 "Expected an SSE value type!");
9402 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9403 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9404 // zero vector.
9405 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9406 }
9407
9408 // We can't directly insert an i8 or i16 into a vector, so zero extend
9409 // it to i32 first.
9410 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9411 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9412 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9413 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9414 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9415 return DAG.getBitcast(VT, Item);
9416 }
9417 }
9418
9419 // Is it a vector logical left shift?
9420 if (NumElems == 2 && Idx == 1 &&
9421 X86::isZeroNode(Op.getOperand(0)) &&
9422 !X86::isZeroNode(Op.getOperand(1))) {
9423 unsigned NumBits = VT.getSizeInBits();
9424 return getVShift(true, VT,
9425                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9426                                   VT, Op.getOperand(1)),
9427 NumBits/2, DAG, *this, dl);
9428 }
9429
9430 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9431 return SDValue();
9432
9433 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9434 // is a non-constant being inserted into an element other than the low one,
9435 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9436 // movd/movss) to move this into the low element, then shuffle it into
9437 // place.
9438 if (EVTBits == 32) {
9439 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9440 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9441 }
9442 }
9443
9444 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9445 if (Values.size() == 1) {
9446 if (EVTBits == 32) {
9447 // Instead of a shuffle like this:
9448 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9449 // Check if it's possible to issue this instead.
9450 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9451 unsigned Idx = NonZeroMask.countr_zero();
9452 SDValue Item = Op.getOperand(Idx);
9453 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9454 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9455 }
9456 return SDValue();
9457 }
9458
9459 // A vector full of immediates; various special cases are already
9460 // handled, so this is best done with a single constant-pool load.
9461 if (IsAllConstants)
9462 return SDValue();
9463
9464 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9465 return V;
9466
9467 // See if we can use a vector load to get all of the elements.
9468 {
9469 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9470 if (SDValue LD =
9471 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9472 return LD;
9473 }
9474
9475 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9476 // build_vector and broadcast it.
9477 // TODO: We could probably generalize this more.
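  // For example, a v8f32 build_vector <a, b, a, b, a, b, a, b> is rebuilt as
  // the narrower v4f32 <a, b, undef, undef>, bitcast to v2f64, broadcast to
  // v4f64 with X86ISD::VBROADCAST, and bitcast back to v8f32.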
9478 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9479 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9480 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9481 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9482 // Make sure all the even/odd operands match.
9483 for (unsigned i = 2; i != NumElems; ++i)
9484 if (Ops[i % 2] != Op.getOperand(i))
9485 return false;
9486 return true;
9487 };
9488 if (CanSplat(Op, NumElems, Ops)) {
9489 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9490 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9491 // Create a new build vector and cast to v2i64/v2f64.
9492 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9493 DAG.getBuildVector(NarrowVT, dl, Ops));
9494 // Broadcast from v2i64/v2f64 and cast to final VT.
9495 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9496 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9497 NewBV));
9498 }
9499 }
9500
9501 // For AVX-length vectors, build the individual 128-bit pieces and use
9502 // shuffles to put them in place.
9503 if (VT.getSizeInBits() > 128) {
9504 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9505
9506 // Build both the lower and upper subvector.
9507 SDValue Lower =
9508 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9509    SDValue Upper = DAG.getBuildVector(
9510        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9511
9512 // Recreate the wider vector with the lower and upper part.
9513 return concatSubVectors(Lower, Upper, DAG, dl);
9514 }
9515
9516 // Let legalizer expand 2-wide build_vectors.
9517 if (EVTBits == 64) {
9518 if (NumNonZero == 1) {
9519 // One half is zero or undef.
9520 unsigned Idx = NonZeroMask.countr_zero();
9521 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9522 Op.getOperand(Idx));
9523 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9524 }
9525 return SDValue();
9526 }
9527
9528 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9529 if (EVTBits == 8 && NumElems == 16)
9530 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9531 NumZero, DAG, Subtarget))
9532 return V;
9533
9534 if (EltVT == MVT::i16 && NumElems == 8)
9535 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9536 NumZero, DAG, Subtarget))
9537 return V;
9538
9539 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9540 if (EVTBits == 32 && NumElems == 4)
9541 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9542 return V;
9543
9544 // If element VT is == 32 bits, turn it into a number of shuffles.
9545 if (NumElems == 4 && NumZero > 0) {
9546 SmallVector<SDValue, 8> Ops(NumElems);
9547 for (unsigned i = 0; i < 4; ++i) {
9548 bool isZero = !NonZeroMask[i];
9549 if (isZero)
9550 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9551 else
9552 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9553 }
9554
9555 for (unsigned i = 0; i < 2; ++i) {
9556 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9557 default: llvm_unreachable("Unexpected NonZero count");
9558 case 0:
9559 Ops[i] = Ops[i*2]; // Must be a zero vector.
9560 break;
9561 case 1:
9562 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9563 break;
9564 case 2:
9565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9566 break;
9567 case 3:
9568 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9569 break;
9570 }
9571 }
9572
9573 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9574 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9575 int MaskVec[] = {
9576 Reverse1 ? 1 : 0,
9577 Reverse1 ? 0 : 1,
9578 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9579 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9580 };
9581 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9582 }
9583
9584 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9585
9586 // Check for a build vector from mostly shuffle plus few inserting.
9587 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9588 return Sh;
9589
9590 // For SSE 4.1, use insertps to put the high elements into the low element.
9591 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9592    SDValue Result;
9593    if (!Op.getOperand(0).isUndef())
9594 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9595 else
9596 Result = DAG.getUNDEF(VT);
9597
9598 for (unsigned i = 1; i < NumElems; ++i) {
9599 if (Op.getOperand(i).isUndef()) continue;
9600 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9601 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9602 }
9603 return Result;
9604 }
9605
9606 // Otherwise, expand into a number of unpckl*, start by extending each of
9607 // our (non-undef) elements to the full vector width with the element in the
9608 // bottom slot of the vector (which generates no code for SSE).
9609 SmallVector<SDValue, 8> Ops(NumElems);
9610 for (unsigned i = 0; i < NumElems; ++i) {
9611 if (!Op.getOperand(i).isUndef())
9612 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9613 else
9614 Ops[i] = DAG.getUNDEF(VT);
9615 }
9616
9617 // Next, we iteratively mix elements, e.g. for v4f32:
9618 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9619 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9620 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9621 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9622 // Generate scaled UNPCKL shuffle mask.
9623 SmallVector<int, 16> Mask;
9624    for (unsigned i = 0; i != Scale; ++i)
9625 Mask.push_back(i);
9626 for (unsigned i = 0; i != Scale; ++i)
9627 Mask.push_back(NumElems+i);
9628 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9629
9630 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9631 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9632 }
9633 return Ops[0];
9634}
9635
9636// 256-bit AVX can use the vinsertf128 instruction
9637// to create 256-bit vectors from two other 128-bit ones.
9638// TODO: Detect subvector broadcast here instead of DAG combine?
9639static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9640                                      SelectionDAG &DAG,
9641 const X86Subtarget &Subtarget) {
9642 MVT ResVT = Op.getSimpleValueType();
9643 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9644 "Value type must be 256-/512-bit wide");
9645
9646 unsigned NumOperands = Op.getNumOperands();
9647 unsigned NumFreezeUndef = 0;
9648 unsigned NumZero = 0;
9649 unsigned NumNonZero = 0;
9650 unsigned NonZeros = 0;
9651 SmallSet<SDValue, 4> Undefs;
9652 for (unsigned i = 0; i != NumOperands; ++i) {
9653 SDValue SubVec = Op.getOperand(i);
9654 if (SubVec.isUndef())
9655 continue;
9656 if (ISD::isFreezeUndef(SubVec.getNode())) {
9657 // If the freeze(undef) has multiple uses then we must fold to zero.
9658 if (SubVec.hasOneUse()) {
9659 ++NumFreezeUndef;
9660 } else {
9661 ++NumZero;
9662 Undefs.insert(SubVec);
9663 }
9664 }
9665 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9666 ++NumZero;
9667 else {
9668 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9669 NonZeros |= 1 << i;
9670 ++NumNonZero;
9671 }
9672 }
9673
9674 // If we have more than 2 non-zeros, build each half separately.
9675 if (NumNonZero > 2) {
9676 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9677 ArrayRef<SDUse> Ops = Op->ops();
9678 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9679 Ops.slice(0, NumOperands/2));
9680 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9681 Ops.slice(NumOperands/2));
9682 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9683 }
9684
9685 // Otherwise, build it up through insert_subvectors.
9686 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9687 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9688 : DAG.getUNDEF(ResVT));
9689
9690 // Replace Undef operands with ZeroVector.
9691 for (SDValue U : Undefs)
9692    DAG.ReplaceAllUsesWith(
9693        U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9694
9695 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9696 unsigned NumSubElems = SubVT.getVectorNumElements();
9697 for (unsigned i = 0; i != NumOperands; ++i) {
9698 if ((NonZeros & (1 << i)) == 0)
9699 continue;
9700
9701 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9702 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9703 }
9704
9705 return Vec;
9706}
9707
9708// Returns true if the given node is a type promotion (by concatenating i1
9709// zeros) of the result of a node that already zeros all upper bits of
9710// k-register.
9711// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9712static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9713                                       const X86Subtarget &Subtarget,
9714 SelectionDAG & DAG) {
9715 MVT ResVT = Op.getSimpleValueType();
9716 unsigned NumOperands = Op.getNumOperands();
9717 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9718 "Unexpected number of operands in CONCAT_VECTORS");
9719
9720 uint64_t Zeros = 0;
9721 uint64_t NonZeros = 0;
9722 for (unsigned i = 0; i != NumOperands; ++i) {
9723 SDValue SubVec = Op.getOperand(i);
9724 if (SubVec.isUndef())
9725 continue;
9726 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9727 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9728 Zeros |= (uint64_t)1 << i;
9729 else
9730 NonZeros |= (uint64_t)1 << i;
9731 }
9732
9733 unsigned NumElems = ResVT.getVectorNumElements();
9734
9735 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9736 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9737 // insert_subvector will give us two kshifts.
9738 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9739 Log2_64(NonZeros) != NumOperands - 1) {
9740 unsigned Idx = Log2_64(NonZeros);
9741 SDValue SubVec = Op.getOperand(Idx);
9742 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9743 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9744 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9745 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9746 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9747 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9748 DAG.getVectorIdxConstant(0, dl));
9749 }
9750
9751 // If there are zero or one non-zeros we can handle this very simply.
9752 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9753 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9754 if (!NonZeros)
9755 return Vec;
9756 unsigned Idx = Log2_64(NonZeros);
9757 SDValue SubVec = Op.getOperand(Idx);
9758 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9759 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9760 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9761 }
9762
9763 if (NumOperands > 2) {
9764 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9765 ArrayRef<SDUse> Ops = Op->ops();
9766 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9767 Ops.slice(0, NumOperands / 2));
9768 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9769 Ops.slice(NumOperands / 2));
9770 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9771 }
9772
9773 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9774
9775 if (ResVT.getVectorNumElements() >= 16)
9776 return Op; // The operation is legal with KUNPCK
9777
9778 SDValue Vec =
9779 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9780 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9781 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9782 DAG.getVectorIdxConstant(NumElems / 2, dl));
9783}
9784
9785static SDValue LowerCONCAT_VECTORS(SDValue Op,
9786                                   const X86Subtarget &Subtarget,
9787 SelectionDAG &DAG) {
9788 SDLoc DL(Op);
9789 MVT VT = Op.getSimpleValueType();
9790 if (VT.getVectorElementType() == MVT::i1)
9791 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9792
9793 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9794 // from two other 128-bit ones.
9795 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9796 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9797 (VT.is512BitVector() &&
9798 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9799 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9800}
9801
9802//===----------------------------------------------------------------------===//
9803// Vector shuffle lowering
9804//
9805// This is an experimental code path for lowering vector shuffles on x86. It is
9806// designed to handle arbitrary vector shuffles and blends, gracefully
9807// degrading performance as necessary. It works hard to recognize idiomatic
9808// shuffles and lower them to optimal instruction patterns without leaving
9809// a framework that allows reasonably efficient handling of all vector shuffle
9810// patterns.
9811//===----------------------------------------------------------------------===//
9812
9813/// Checks whether the vector elements referenced by two shuffle masks are
9814/// equivalent.
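/// For example, if both operands are the same (build_vector %x, %y, %x, %y),
/// indices 0 and 2 refer to the same scalar %x, so shuffle masks selecting
/// either index describe the same result element.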
9815static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9816 int Idx, int ExpectedIdx) {
9817 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9818 ExpectedIdx < MaskSize && "Out of range element index");
9819 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9820 return false;
9821
9822 EVT VT = Op.getValueType();
9823 EVT ExpectedVT = ExpectedOp.getValueType();
9824
9825 // Sources must be vectors and match the mask's element count.
9826 if (!VT.isVector() || !ExpectedVT.isVector() ||
9827 (int)VT.getVectorNumElements() != MaskSize ||
9828 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9829 return false;
9830
9831 // Exact match.
9832 if (Idx == ExpectedIdx && Op == ExpectedOp)
9833 return true;
9834
9835 switch (Op.getOpcode()) {
9836 case ISD::BUILD_VECTOR:
9837 // If the values are build vectors, we can look through them to find
9838 // equivalent inputs that make the shuffles equivalent.
9839 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9840 case ISD::BITCAST: {
9841    SDValue Src = Op.getOperand(0);
9842    EVT SrcVT = Src.getValueType();
9843 if (Op == ExpectedOp && SrcVT.isVector()) {
9844 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9845 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9846 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9847 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9848 Idx / Scale, ExpectedIdx / Scale);
9849 }
9850 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9851 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9852 for (unsigned I = 0; I != Scale; ++I)
9853 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9854 (Idx * Scale) + I,
9855 (ExpectedIdx * Scale) + I))
9856 return false;
9857 return true;
9858 }
9859 }
9860 break;
9861 }
9862 case ISD::VECTOR_SHUFFLE: {
9863 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9864 return Op == ExpectedOp &&
9865 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9866 }
9867 case X86ISD::VBROADCAST:
9868  case X86ISD::VBROADCAST_LOAD:
9869    return Op == ExpectedOp;
9870  case X86ISD::SUBV_BROADCAST_LOAD:
9871    if (Op == ExpectedOp) {
9872 auto *MemOp = cast<MemSDNode>(Op);
9873 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9874 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9875 }
9876 break;
9877 case X86ISD::VPERMI: {
9878 if (Op == ExpectedOp) {
9879      SmallVector<int, 8> Mask;
9880      DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9881 SDValue Src = Op.getOperand(0);
9882 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9883 Mask[ExpectedIdx]);
9884 }
9885 break;
9886 }
9887 case X86ISD::HADD:
9888 case X86ISD::HSUB:
9889 case X86ISD::FHADD:
9890 case X86ISD::FHSUB:
9891 case X86ISD::PACKSS:
9892 case X86ISD::PACKUS:
9893 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9894 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9895 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9896 int NumElts = VT.getVectorNumElements();
9897 int NumLanes = VT.getSizeInBits() / 128;
9898 int NumEltsPerLane = NumElts / NumLanes;
9899 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9900 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9901 bool SameElt =
9902 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9903 return SameLane && SameElt;
9904 }
9905 break;
9906 }
9907
9908 return false;
9909}
9910
9911/// Tiny helper function to identify a no-op mask.
9912///
9913/// This is a somewhat boring predicate function. It checks whether the mask
9914/// array input, which is assumed to be a single-input shuffle mask of the kind
9915/// used by the X86 shuffle instructions (not a fully general
9916/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
9917/// in-place shuffle are 'no-op's.
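/// For example (illustrative): the mask <-1, 1, 2, -1> is a no-op, since every
/// defined element stays in place, while <0, 0, 2, 3> is not, because element 1
/// is sourced from element 0.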
9918static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9919 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9920 assert(Mask[i] >= -1 && "Out of bound mask element!");
9921 if (Mask[i] >= 0 && Mask[i] != i)
9922 return false;
9923 }
9924 return true;
9925}
9926
9927/// Test whether there are elements crossing LaneSizeInBits lanes in this
9928/// shuffle mask.
9929///
9930/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9931/// and we routinely test for these.
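/// For example (illustrative, v8f32 with 128-bit lanes of 4 elements): the mask
/// <4, 5, 6, 7, 0, 1, 2, 3> is lane-crossing, while <1, 0, 3, 2, 5, 4, 7, 6> is
/// not, since every element stays within its own 128-bit lane.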
9932static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9933 unsigned ScalarSizeInBits,
9934 ArrayRef<int> Mask) {
9935 assert(LaneSizeInBits && ScalarSizeInBits &&
9936 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9937 "Illegal shuffle lane size");
9938 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9939 int Size = Mask.size();
9940 for (int i = 0; i < Size; ++i)
9941 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9942 return true;
9943 return false;
9944}
9945
9946/// Test whether there are elements crossing 128-bit lanes in this
9947/// shuffle mask.
9948static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9949 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9950}
9951
9952/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9953/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9954/// better support 'repeated mask + lane permute' style shuffles.
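/// For example (illustrative, v8f32): <4, 5, 6, 7, 0, 1, 2, 3> crosses lanes
/// but is not multi-lane (each destination lane reads from a single source
/// lane), whereas <0, 4, 1, 5, -1, -1, -1, -1> is multi-lane because its first
/// lane mixes elements from source lanes 0 and 1.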
9955static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9956 unsigned ScalarSizeInBits,
9957 ArrayRef<int> Mask) {
9958 assert(LaneSizeInBits && ScalarSizeInBits &&
9959 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9960 "Illegal shuffle lane size");
9961 int NumElts = Mask.size();
9962 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9963 int NumLanes = NumElts / NumEltsPerLane;
9964 if (NumLanes > 1) {
9965 for (int i = 0; i != NumLanes; ++i) {
9966 int SrcLane = -1;
9967 for (int j = 0; j != NumEltsPerLane; ++j) {
9968 int M = Mask[(i * NumEltsPerLane) + j];
9969 if (M < 0)
9970 continue;
9971 int Lane = (M % NumElts) / NumEltsPerLane;
9972 if (SrcLane >= 0 && SrcLane != Lane)
9973 return true;
9974 SrcLane = Lane;
9975 }
9976 }
9977 }
9978 return false;
9979}
9980
9981/// Test whether a shuffle mask is equivalent within each sub-lane.
9982///
9983/// This checks a shuffle mask to see if it is performing the same
9984/// lane-relative shuffle in each sub-lane. This trivially implies
9985/// that it is also not lane-crossing. It may however involve a blend from the
9986/// same lane of a second vector.
9987///
9988/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9989/// non-trivial to compute in the face of undef lanes. The representation is
9990/// suitable for use with existing 128-bit shuffles as entries from the second
9991/// vector have been remapped to [LaneSize, 2*LaneSize).
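/// For example (illustrative, v8f32 with 128-bit lanes): the mask
/// <0, 8, 1, 9, 4, 12, 5, 13> repeats the same lane-relative pattern in both
/// lanes, and RepeatedMask is populated with <0, 4, 1, 5> (second-vector
/// entries remapped to start at LaneSize = 4).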
9992static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9993 ArrayRef<int> Mask,
9994 SmallVectorImpl<int> &RepeatedMask) {
9995 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9996 RepeatedMask.assign(LaneSize, -1);
9997 int Size = Mask.size();
9998 for (int i = 0; i < Size; ++i) {
9999 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10000 if (Mask[i] < 0)
10001 continue;
10002 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10003 // This entry crosses lanes, so there is no way to model this shuffle.
10004 return false;
10005
10006 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10007 // Adjust second vector indices to start at LaneSize instead of Size.
10008 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10009 : Mask[i] % LaneSize + LaneSize;
10010 if (RepeatedMask[i % LaneSize] < 0)
10011 // This is the first non-undef entry in this slot of a 128-bit lane.
10012 RepeatedMask[i % LaneSize] = LocalM;
10013 else if (RepeatedMask[i % LaneSize] != LocalM)
10014 // Found a mismatch with the repeated mask.
10015 return false;
10016 }
10017 return true;
10018}
10019
10020/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10021static bool
10022is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10023 SmallVectorImpl<int> &RepeatedMask) {
10024 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10025}
10026
10027static bool
10028is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10029 SmallVector<int, 32> RepeatedMask;
10030 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10031}
10032
10033/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10034static bool
10035is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10036 SmallVectorImpl<int> &RepeatedMask) {
10037 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10038}
10039
10040/// Test whether a target shuffle mask is equivalent within each sub-lane.
10041/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10042static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10043 unsigned EltSizeInBits,
10044 ArrayRef<int> Mask,
10045 SmallVectorImpl<int> &RepeatedMask) {
10046 int LaneSize = LaneSizeInBits / EltSizeInBits;
10047 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10048 int Size = Mask.size();
10049 for (int i = 0; i < Size; ++i) {
10050 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10051 if (Mask[i] == SM_SentinelUndef)
10052 continue;
10053 if (Mask[i] == SM_SentinelZero) {
10054 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10055 return false;
10056 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10057 continue;
10058 }
10059 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10060 // This entry crosses lanes, so there is no way to model this shuffle.
10061 return false;
10062
10063 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10064 // later vector indices to start at multiples of LaneSize instead of Size.
10065 int LaneM = Mask[i] / Size;
10066 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10067 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10068 // This is the first non-undef entry in this slot of a 128-bit lane.
10069 RepeatedMask[i % LaneSize] = LocalM;
10070 else if (RepeatedMask[i % LaneSize] != LocalM)
10071 // Found a mismatch with the repeated mask.
10072 return false;
10073 }
10074 return true;
10075}
10076
10077/// Test whether a target shuffle mask is equivalent within each sub-lane.
10078/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10079static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10080 ArrayRef<int> Mask,
10081 SmallVectorImpl<int> &RepeatedMask) {
10082 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10083 Mask, RepeatedMask);
10084}
10085
10086/// Checks whether a shuffle mask is equivalent to an explicit list of
10087/// arguments.
10088///
10089/// This is a fast way to test a shuffle mask against a fixed pattern:
10090///
10091/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10092///
10093/// It returns true if the mask is exactly as wide as ExpectedMask, and each
10094/// element of the mask is either -1 (signifying undef) or matches the
10095/// corresponding value in ExpectedMask.
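/// Undef elements are treated as wildcards; e.g. (illustrative) the mask
/// <-1, 2, -1, 0> is considered equivalent to the expected mask {3, 2, 1, 0}.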
10096static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10097 SDValue V1 = SDValue(),
10098 SDValue V2 = SDValue()) {
10099 int Size = Mask.size();
10100 if (Size != (int)ExpectedMask.size())
10101 return false;
10102
10103 for (int i = 0; i < Size; ++i) {
10104 assert(Mask[i] >= -1 && "Out of bound mask element!");
10105 int MaskIdx = Mask[i];
10106 int ExpectedIdx = ExpectedMask[i];
10107 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10108 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10109 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10110 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10111 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10112 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10113 return false;
10114 }
10115 }
10116 return true;
10117}
10118
10119/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10120///
10121/// The masks must be exactly the same width.
10122///
10123/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10124/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10125///
10126/// SM_SentinelZero is accepted as a valid negative index but must match in
10127/// both, or via a known bits test.
10128static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10129 ArrayRef<int> ExpectedMask,
10130 const SelectionDAG &DAG,
10131 SDValue V1 = SDValue(),
10132 SDValue V2 = SDValue()) {
10133 int Size = Mask.size();
10134 if (Size != (int)ExpectedMask.size())
10135 return false;
10136 assert(llvm::all_of(ExpectedMask,
10137 [Size](int M) {
10138 return M == SM_SentinelZero ||
10139 isInRange(M, 0, 2 * Size);
10140 }) &&
10141 "Illegal target shuffle mask");
10142
10143 // Check for out-of-range target shuffle mask indices.
10144 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10145 return false;
10146
10147 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10148 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10149 !V1.getValueType().isVector()))
10150 V1 = SDValue();
10151 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10152 !V2.getValueType().isVector()))
10153 V2 = SDValue();
10154
10155 APInt ZeroV1 = APInt::getZero(Size);
10156 APInt ZeroV2 = APInt::getZero(Size);
10157
10158 for (int i = 0; i < Size; ++i) {
10159 int MaskIdx = Mask[i];
10160 int ExpectedIdx = ExpectedMask[i];
10161 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10162 continue;
10163 // If we failed to match an expected SM_SentinelZero then early out.
10164 if (ExpectedIdx < 0)
10165 return false;
10166 if (MaskIdx == SM_SentinelZero) {
10167 // If we need this expected index to be a zero element, then update the
10168 // relevant zero mask and perform the known bits at the end to minimize
10169 // repeated computes.
10170 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10171 if (ExpectedV &&
10172 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10173 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10174 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10175 ZeroMask.setBit(BitIdx);
10176 continue;
10177 }
10178 }
10179 if (MaskIdx >= 0) {
10180 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10181 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10182 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10183 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10184 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10185 continue;
10186 }
10187 return false;
10188 }
10189 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10190 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10191}
10192
10193// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10194// instructions.
10195static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10196 const SelectionDAG &DAG) {
10197 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10198 return false;
10199
10200 SmallVector<int, 8> Unpcklwd;
10201 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10202 /* Unary = */ false);
10203 SmallVector<int, 8> Unpckhwd;
10204 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10205 /* Unary = */ false);
10206 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10207 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10208 return IsUnpackwdMask;
10209}
10210
10211static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10212 const SelectionDAG &DAG) {
10213 // Create 128-bit vector type based on mask size.
10214 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10215 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10216
10217 // We can't assume a canonical shuffle mask, so try the commuted version too.
10218 SmallVector<int, 4> CommutedMask(Mask);
10219 ShuffleVectorSDNode::commuteMask(CommutedMask);
10220
10221 // Match any of unary/binary or low/high.
10222 for (unsigned i = 0; i != 4; ++i) {
10223 SmallVector<int, 16> UnpackMask;
10224 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10225 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10226 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10227 return true;
10228 }
10229 return false;
10230}
10231
10232/// Return true if a shuffle mask chooses elements identically in its top and
10233/// bottom halves. For example, any splat mask has the same top and bottom
10234/// halves. If an element is undefined in only one half of the mask, the halves
10235/// are not considered identical.
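/// For example (illustrative): <0, 5, 0, 5> and the splat <2, 2, 2, 2> have
/// identical halves; <0, 1, 0, -1> does not, because the undef in one half has
/// no matching undef in the other.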
10236static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10237 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10238 unsigned HalfSize = Mask.size() / 2;
10239 for (unsigned i = 0; i != HalfSize; ++i) {
10240 if (Mask[i] != Mask[i + HalfSize])
10241 return false;
10242 }
10243 return true;
10244}
10245
10246/// Get a 4-lane 8-bit shuffle immediate for a mask.
10247///
10248/// This helper function produces an 8-bit shuffle immediate corresponding to
10249/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10250/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10251/// example.
10252///
10253/// NB: We rely heavily on "undef" masks preserving the input lane.
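/// As a worked example (illustrative): the mask <1, 0, 3, 2> encodes to
/// 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xB1, and a mask with only one defined
/// element, such as <-1, 2, -1, -1>, is splatted to 0xAA (2 in every 2-bit
/// field) to improve later broadcast matching.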
10254static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10255 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10256 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10257 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10258 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10259 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10260
10261 // If the mask only uses one non-undef element, then fully 'splat' it to
10262 // improve later broadcast matching.
10263 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10264 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10265
10266 int FirstElt = Mask[FirstIndex];
10267 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10268 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10269
10270 unsigned Imm = 0;
10271 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10272 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10273 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10274 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10275 return Imm;
10276}
10277
10278static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10279 SelectionDAG &DAG) {
10280 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10281}
10282
10283// Canonicalize SHUFPD mask to improve chances of further folding.
10284// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
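// As a worked example (illustrative, 4-element mask): <-1, 1, 0, -1> keeps the
// undefs in place and yields 0b1010 (bit 0 = 0 and bit 3 = 1 come from the
// element parity of the undef slots, bits 1 and 2 from the mask itself).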
10285static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10286 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10287 "Unexpected SHUFPD mask size");
10288 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10289 "Unexpected SHUFPD mask elements");
10290
10291 // If the mask only uses one non-undef element, then fully 'splat' it to
10292 // improve later broadcast matching.
10293 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10294 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10295 "All undef shuffle mask");
10296
10297 int FirstElt = Mask[FirstIndex];
10298 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10299 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10300 unsigned Imm = 0;
10301 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10302 Imm |= FirstElt << I;
10303 return Imm;
10304 }
10305
10306 // Attempt to keep any undef elements in place to improve chances of the
10307 // shuffle becoming a (commutative) blend.
10308 unsigned Imm = 0;
10309 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10310 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10311
10312 return Imm;
10313}
10314
10315static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10316 SelectionDAG &DAG) {
10317 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10318}
10319
10320// The shuffle result takes the form:
10321// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
10322// ascending order. Each element of Zeroable corresponds to a particular
10323// element of Mask, as described in computeZeroableShuffleElements.
10324//
10325// The function looks for a sub-mask whose nonzero elements are in
10326// increasing order. If such a sub-mask exists, the function returns true.
10327static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10328 ArrayRef<int> Mask, const EVT &VectorType,
10329 bool &IsZeroSideLeft) {
10330 int NextElement = -1;
10331 // Check if the Mask's nonzero elements are in increasing order.
10332 for (int i = 0, e = Mask.size(); i < e; i++) {
10333 // Check that the mask's zero elements are built only from zeros.
10334 assert(Mask[i] >= -1 && "Out of bound mask element!");
10335 if (Mask[i] < 0)
10336 return false;
10337 if (Zeroable[i])
10338 continue;
10339 // Find the lowest non-zero element
10340 if (NextElement < 0) {
10341 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10342 IsZeroSideLeft = NextElement != 0;
10343 }
10344 // Exit if the mask's non-zero elements are not in increasing order.
10345 if (NextElement != Mask[i])
10346 return false;
10347 NextElement++;
10348 }
10349 return true;
10350}
10351
10352static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10353 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10354 const X86Subtarget &Subtarget,
10355 unsigned Depth = 0);
10356
10357/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
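/// For example (illustrative): a single-input v16i8 reverse shuffle
/// <15, 14, ..., 1, 0> can be lowered to one PSHUFB with the control vector
/// <15, 14, ..., 1, 0>; zeroable elements are encoded by setting the sign bit
/// (0x80) of the corresponding control byte.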
10358static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10359 ArrayRef<int> Mask, SDValue V1,
10360 SDValue V2, const APInt &Zeroable,
10361 const X86Subtarget &Subtarget,
10362 SelectionDAG &DAG) {
10363 int Size = Mask.size();
10364 int LaneSize = 128 / VT.getScalarSizeInBits();
10365 const int NumBytes = VT.getSizeInBits() / 8;
10366 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10367
10368 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10369 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10370 (Subtarget.hasBWI() && VT.is512BitVector()));
10371
10372 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10373 // Sign bit set in i8 mask means zero element.
10374 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10375
10376 SDValue V;
10377 for (int i = 0; i < NumBytes; ++i) {
10378 int M = Mask[i / NumEltBytes];
10379 if (M < 0) {
10380 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10381 continue;
10382 }
10383 if (Zeroable[i / NumEltBytes]) {
10384 PSHUFBMask[i] = ZeroMask;
10385 continue;
10386 }
10387
10388 // We can only use a single input of V1 or V2.
10389 SDValue SrcV = (M >= Size ? V2 : V1);
10390 if (V && V != SrcV)
10391 return SDValue();
10392 V = SrcV;
10393 M %= Size;
10394
10395 // PSHUFB can't cross lanes, ensure this doesn't happen.
10396 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10397 return SDValue();
10398
10399 M = M % LaneSize;
10400 M = M * NumEltBytes + (i % NumEltBytes);
10401 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10402 }
10403 assert(V && "Failed to find a source input");
10404
10405 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10406 return DAG.getBitcast(
10407 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10408 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10409}
10410
10411static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10412 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10413 const SDLoc &dl);
10414
10415// X86 has dedicated shuffle that can be lowered to VEXPAND
10416static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10417 SDValue V2, ArrayRef<int> Mask,
10418 const APInt &Zeroable,
10419 const X86Subtarget &Subtarget,
10420 SelectionDAG &DAG) {
10421 bool IsLeftZeroSide = true;
10422 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10423 IsLeftZeroSide))
10424 return SDValue();
10425 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10426 MVT IntegerType =
10427 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10428 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10429 unsigned NumElts = VT.getVectorNumElements();
10430 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10431 "Unexpected number of vector elements");
10432 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10433 Subtarget, DAG, DL);
10434 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10435 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10436 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10437}
10438
10439static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10440 unsigned &UnpackOpcode, bool IsUnary,
10441 ArrayRef<int> TargetMask, const SDLoc &DL,
10442 SelectionDAG &DAG,
10443 const X86Subtarget &Subtarget) {
10444 int NumElts = VT.getVectorNumElements();
10445
10446 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10447 for (int i = 0; i != NumElts; i += 2) {
10448 int M1 = TargetMask[i + 0];
10449 int M2 = TargetMask[i + 1];
10450 Undef1 &= (SM_SentinelUndef == M1);
10451 Undef2 &= (SM_SentinelUndef == M2);
10452 Zero1 &= isUndefOrZero(M1);
10453 Zero2 &= isUndefOrZero(M2);
10454 }
10455 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10456 "Zeroable shuffle detected");
10457
10458 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10459 SmallVector<int, 64> Unpckl, Unpckh;
10460 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10461 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10462 (IsUnary ? V1 : V2))) {
10463 UnpackOpcode = X86ISD::UNPCKL;
10464 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10465 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10466 return true;
10467 }
10468
10469 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10470 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10471 (IsUnary ? V1 : V2))) {
10472 UnpackOpcode = X86ISD::UNPCKH;
10473 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10474 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10475 return true;
10476 }
10477
10478 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
10479 if (IsUnary && (Zero1 || Zero2)) {
10480 // Don't bother if we can blend instead.
10481 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10482 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10483 return false;
10484
10485 bool MatchLo = true, MatchHi = true;
10486 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10487 int M = TargetMask[i];
10488
10489 // Ignore if the input is known to be zero or the index is undef.
10490 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10491 (M == SM_SentinelUndef))
10492 continue;
10493
10494 MatchLo &= (M == Unpckl[i]);
10495 MatchHi &= (M == Unpckh[i]);
10496 }
10497
10498 if (MatchLo || MatchHi) {
10499 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10500 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10501 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10502 return true;
10503 }
10504 }
10505
10506 // If a binary shuffle, commute and try again.
10507 if (!IsUnary) {
10508 ShuffleVectorSDNode::commuteMask(Unpckl);
10509 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10510 UnpackOpcode = X86ISD::UNPCKL;
10511 std::swap(V1, V2);
10512 return true;
10513 }
10514
10515 ShuffleVectorSDNode::commuteMask(Unpckh);
10516 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10517 UnpackOpcode = X86ISD::UNPCKH;
10518 std::swap(V1, V2);
10519 return true;
10520 }
10521 }
10522
10523 return false;
10524}
10525
10526// X86 has dedicated unpack instructions that can handle specific blend
10527// operations: UNPCKH and UNPCKL.
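// For example (illustrative, v4i32): UNPCKL interleaves the low halves of its
// inputs and matches the mask <0, 4, 1, 5>, while UNPCKH interleaves the high
// halves and matches <2, 6, 3, 7>.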
10528static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10529 SDValue V2, ArrayRef<int> Mask,
10530 SelectionDAG &DAG) {
10531 SmallVector<int, 8> Unpckl;
10532 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10533 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10534 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10535
10536 SmallVector<int, 8> Unpckh;
10537 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10538 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10539 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10540
10541 // Commute and try again.
10542 ShuffleVectorSDNode::commuteMask(Unpckl);
10543 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10544 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10545
10546 ShuffleVectorSDNode::commuteMask(Unpckh);
10547 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10548 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10549
10550 return SDValue();
10551}
10552
10553/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10554/// followed by unpack 256-bit.
10555static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10556 SDValue V2, ArrayRef<int> Mask,
10557 SelectionDAG &DAG) {
10558 SmallVector<int, 32> Unpckl, Unpckh;
10559 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10560 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10561
10562 unsigned UnpackOpcode;
10563 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10564 UnpackOpcode = X86ISD::UNPCKL;
10565 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10566 UnpackOpcode = X86ISD::UNPCKH;
10567 else
10568 return SDValue();
10569
10570 // This is a "natural" unpack operation (rather than the 128-bit sectored
10571 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10572 // input in order to use the x86 instruction.
10573 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10574 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10575 V1 = DAG.getBitcast(VT, V1);
10576 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10577}
10578
10579// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10580// source into the lower elements and zeroing the upper elements.
10581static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10582 ArrayRef<int> Mask, const APInt &Zeroable,
10583 const X86Subtarget &Subtarget) {
10584 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10585 return false;
10586
10587 unsigned NumElts = Mask.size();
10588 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10589 unsigned MaxScale = 64 / EltSizeInBits;
10590
10591 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10592 unsigned SrcEltBits = EltSizeInBits * Scale;
10593 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10594 continue;
10595 unsigned NumSrcElts = NumElts / Scale;
10596 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10597 continue;
10598 unsigned UpperElts = NumElts - NumSrcElts;
10599 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10600 continue;
10601 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10602 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10603 DstVT = MVT::getIntegerVT(EltSizeInBits);
10604 if ((NumSrcElts * EltSizeInBits) >= 128) {
10605 // ISD::TRUNCATE
10606 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10607 } else {
10608 // X86ISD::VTRUNC
10609 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10610 }
10611 return true;
10612 }
10613
10614 return false;
10615}
10616
10617// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10618// element padding to the final DstVT.
10619static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10620 const X86Subtarget &Subtarget,
10621 SelectionDAG &DAG, bool ZeroUppers) {
10622 MVT SrcVT = Src.getSimpleValueType();
10623 MVT DstSVT = DstVT.getScalarType();
10624 unsigned NumDstElts = DstVT.getVectorNumElements();
10625 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10626 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10627
10628 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10629 return SDValue();
10630
10631 // Perform a direct ISD::TRUNCATE if possible.
10632 if (NumSrcElts == NumDstElts)
10633 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10634
10635 if (NumSrcElts > NumDstElts) {
10636 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10637 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10638 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10639 }
10640
10641 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10642 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10643 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10644 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10645 DstVT.getSizeInBits());
10646 }
10647
10648 // Non-VLX targets must truncate from a 512-bit type, so we need to
10649 // widen, truncate and then possibly extract the original subvector.
10650 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10651 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10652 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10653 }
10654
10655 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10656 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10657 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10658 if (DstVT != TruncVT)
10659 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10660 DstVT.getSizeInBits());
10661 return Trunc;
10662}
10663
10664// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10665//
10666// An example is the following:
10667//
10668// t0: ch = EntryToken
10669// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10670// t25: v4i32 = truncate t2
10671// t41: v8i16 = bitcast t25
10672// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10673// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10674// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10675// t18: v2i64 = bitcast t51
10676//
10677// One can just use a single vpmovdw instruction; without avx512vl we need to
10678// use the zmm variant and extract the lower subvector, padding with zeroes.
10679// TODO: Merge with lowerShuffleAsVTRUNC.
10680static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10681 SDValue V2, ArrayRef<int> Mask,
10682 const APInt &Zeroable,
10683 const X86Subtarget &Subtarget,
10684 SelectionDAG &DAG) {
10685 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10686 if (!Subtarget.hasAVX512())
10687 return SDValue();
10688
10689 unsigned NumElts = VT.getVectorNumElements();
10690 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10691 unsigned MaxScale = 64 / EltSizeInBits;
10692 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10693 unsigned SrcEltBits = EltSizeInBits * Scale;
10694 unsigned NumSrcElts = NumElts / Scale;
10695 unsigned UpperElts = NumElts - NumSrcElts;
10696 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10697 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10698 continue;
10699
10700 // Attempt to find a matching source truncation, but as a fallback VLX
10701 // cases can use the VPMOV directly.
10702 SDValue Src = peekThroughBitcasts(V1);
10703 if (Src.getOpcode() == ISD::TRUNCATE &&
10704 Src.getScalarValueSizeInBits() == SrcEltBits) {
10705 Src = Src.getOperand(0);
10706 } else if (Subtarget.hasVLX()) {
10707 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10708 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10709 Src = DAG.getBitcast(SrcVT, Src);
10710 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10711 if (Scale == 2 &&
10712 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10713 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10714 return SDValue();
10715 } else
10716 return SDValue();
10717
10718 // VPMOVWB is only available with avx512bw.
10719 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10720 return SDValue();
10721
10722 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10723 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10724 }
10725
10726 return SDValue();
10727}
10728
10729// Attempt to match binary shuffle patterns as a truncate.
10730static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10731 SDValue V2, ArrayRef<int> Mask,
10732 const APInt &Zeroable,
10733 const X86Subtarget &Subtarget,
10734 SelectionDAG &DAG) {
10735 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10736 "Unexpected VTRUNC type");
10737 if (!Subtarget.hasAVX512() ||
10738 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10739 return SDValue();
10740
10741 unsigned NumElts = VT.getVectorNumElements();
10742 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10743 unsigned MaxScale = 64 / EltSizeInBits;
10744 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10745 // TODO: Support non-BWI VPMOVWB truncations?
10746 unsigned SrcEltBits = EltSizeInBits * Scale;
10747 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10748 continue;
10749
10750 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10751 // Bail if the V2 elements are undef.
10752 unsigned NumHalfSrcElts = NumElts / Scale;
10753 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10754 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10755 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10756 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10757 continue;
10758
10759 // The elements beyond the truncation must be undef/zero.
10760 unsigned UpperElts = NumElts - NumSrcElts;
10761 if (UpperElts > 0 &&
10762 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10763 continue;
10764 bool UndefUppers =
10765 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10766
10767 // As we're using both sources then we need to concat them together
10768 // and truncate from the double-sized src.
10769 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10770
10771 // For offset truncations, ensure that the concat is cheap.
10772 SDValue Src =
10773 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10774 if (!Src) {
10775 if (Offset)
10776 continue;
10777 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10778 }
10779
10780 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10781 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10782 Src = DAG.getBitcast(SrcVT, Src);
10783
10784 // Shift the offset'd elements into place for the truncation.
10785 // TODO: Use getTargetVShiftByConstNode.
10786 if (Offset)
10787 Src = DAG.getNode(
10788 X86ISD::VSRLI, DL, SrcVT, Src,
10789 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10790
10791 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10792 }
10793 }
10794
10795 return SDValue();
10796}
10797
10798/// Check whether a compaction lowering can be done by dropping even/odd
10799/// elements and compute how many times even/odd elements must be dropped.
10800///
10801/// This handles shuffles which take every Nth element where N is a power of
10802/// two. Example shuffle masks:
10803///
10804/// (even)
10805/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10806/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10807/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10808/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10809/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10810/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10811///
10812/// (odd)
10813/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10814/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10815///
10816/// Any of these lanes can of course be undef.
10817///
10818/// This routine only supports N <= 3.
10819/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10820/// for larger N.
10821///
10822/// \returns N above, or the number of times even/odd elements must be dropped
10823/// if there is such a number. Otherwise returns zero.
10824static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10825 bool IsSingleInput) {
10826 // The modulus for the shuffle vector entries is based on whether this is
10827 // a single input or not.
10828 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10829 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10830 "We should only be called with masks with a power-of-2 size!");
10831
10832 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10833 int Offset = MatchEven ? 0 : 1;
10834
10835 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10836 // and 2^3 simultaneously. This is because we may have ambiguity with
10837 // partially undef inputs.
10838 bool ViableForN[3] = {true, true, true};
10839
10840 for (int i = 0, e = Mask.size(); i < e; ++i) {
10841 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10842 // want.
10843 if (Mask[i] < 0)
10844 continue;
10845
10846 bool IsAnyViable = false;
10847 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10848 if (ViableForN[j]) {
10849 uint64_t N = j + 1;
10850
10851 // The shuffle mask must be equal to (i * 2^N) % M.
10852 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10853 IsAnyViable = true;
10854 else
10855 ViableForN[j] = false;
10856 }
10857 // Early exit if we exhaust the possible powers of two.
10858 if (!IsAnyViable)
10859 break;
10860 }
10861
10862 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10863 if (ViableForN[j])
10864 return j + 1;
10865
10866 // Return 0 as there is no viable power of two.
10867 return 0;
10868}
10869
10870// X86 has dedicated pack instructions that can handle specific truncation
10871// operations: PACKSS and PACKUS.
10872// Checks for compaction shuffle masks if MaxStages > 1.
10873// TODO: Add support for matching multiple PACKSS/PACKUS stages.
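// For example (illustrative, v16i8): the single-stage binary compaction mask
// <0, 2, 4, ..., 30> can be matched as PACKUS of two v8i16 inputs whose upper
// bytes are known zero, or as PACKSS when both inputs have at least 9 sign
// bits.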
10874static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10875 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10876 const SelectionDAG &DAG,
10877 const X86Subtarget &Subtarget,
10878 unsigned MaxStages = 1) {
10879 unsigned NumElts = VT.getVectorNumElements();
10880 unsigned BitSize = VT.getScalarSizeInBits();
10881 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10882 "Illegal maximum compaction");
10883
10884 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10885 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10886 unsigned NumPackedBits = NumSrcBits - BitSize;
10887 N1 = peekThroughBitcasts(N1);
10888 N2 = peekThroughBitcasts(N2);
10889 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10890 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10891 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10892 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10893 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10894 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10895 return false;
10896 if (Subtarget.hasSSE41() || BitSize == 8) {
10897 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10898 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10899 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10900 V1 = N1;
10901 V2 = N2;
10902 SrcVT = PackVT;
10903 PackOpcode = X86ISD::PACKUS;
10904 return true;
10905 }
10906 }
10907 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10908 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10909 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10910 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10911 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10912 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10913 V1 = N1;
10914 V2 = N2;
10915 SrcVT = PackVT;
10916 PackOpcode = X86ISD::PACKSS;
10917 return true;
10918 }
10919 return false;
10920 };
10921
10922 // Attempt to match against wider and wider compaction patterns.
10923 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10924 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10925 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10926
10927 // Try binary shuffle.
10928 SmallVector<int, 32> BinaryMask;
10929 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10930 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10931 if (MatchPACK(V1, V2, PackVT))
10932 return true;
10933
10934 // Try unary shuffle.
10935 SmallVector<int, 32> UnaryMask;
10936 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10937 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10938 if (MatchPACK(V1, V1, PackVT))
10939 return true;
10940 }
10941
10942 return false;
10943}
10944
10945static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10946 SDValue V2, ArrayRef<int> Mask,
10947 const X86Subtarget &Subtarget,
10948 SelectionDAG &DAG) {
10949 MVT PackVT;
10950 unsigned PackOpcode;
10951 unsigned SizeBits = VT.getSizeInBits();
10952 unsigned EltBits = VT.getScalarSizeInBits();
10953 unsigned MaxStages = Log2_32(64 / EltBits);
10954 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10955 Subtarget, MaxStages))
10956 return SDValue();
10957
10958 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10959 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10960
10961 // Don't lower multi-stage packs on AVX512, truncation is better.
10962 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10963 return SDValue();
10964
10965 // Pack to the largest type possible:
10966 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10967 unsigned MaxPackBits = 16;
10968 if (CurrentEltBits > 16 &&
10969 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10970 MaxPackBits = 32;
10971
10972 // Repeatedly pack down to the target size.
10973 SDValue Res;
10974 for (unsigned i = 0; i != NumStages; ++i) {
10975 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10976 unsigned NumSrcElts = SizeBits / SrcEltBits;
10977 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10978 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10979 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10980 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10981 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10982 DAG.getBitcast(SrcVT, V2));
10983 V1 = V2 = Res;
10984 CurrentEltBits /= 2;
10985 }
10986 assert(Res && Res.getValueType() == VT &&
10987 "Failed to lower compaction shuffle");
10988 return Res;
10989}
10990
10991/// Try to emit a bitmask instruction for a shuffle.
10992///
10993/// This handles cases where we can model a blend exactly as a bitmask due to
10994/// one of the inputs being zeroable.
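/// For example (illustrative, v4i32): if the mask is <0, 5, 2, 7> and elements
/// 1 and 3 are zeroable (e.g. V2 is known to be zero), the shuffle can be
/// emitted as a single AND of V1 with the constant mask <-1, 0, -1, 0>.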
10995static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10996 SDValue V2, ArrayRef<int> Mask,
10997 const APInt &Zeroable,
10998 const X86Subtarget &Subtarget,
10999 SelectionDAG &DAG) {
11000 MVT MaskVT = VT;
11001 MVT EltVT = VT.getVectorElementType();
11002 SDValue Zero, AllOnes;
11003 // Use f64 if i64 isn't legal.
11004 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11005 EltVT = MVT::f64;
11006 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11007 }
11008
11009 MVT LogicVT = VT;
11010 if (EltVT.isFloatingPoint()) {
11011 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11012 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11013 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11014 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11015 } else {
11016 Zero = DAG.getConstant(0, DL, EltVT);
11017 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11018 }
11019
11020 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11021 SDValue V;
11022 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11023 if (Zeroable[i])
11024 continue;
11025 if (Mask[i] % Size != i)
11026 return SDValue(); // Not a blend.
11027 if (!V)
11028 V = Mask[i] < Size ? V1 : V2;
11029 else if (V != (Mask[i] < Size ? V1 : V2))
11030 return SDValue(); // Can only let one input through the mask.
11031
11032 VMaskOps[i] = AllOnes;
11033 }
11034 if (!V)
11035 return SDValue(); // No non-zeroable elements!
11036
11037 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11038 VMask = DAG.getBitcast(LogicVT, VMask);
11039 V = DAG.getBitcast(LogicVT, V);
11040 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11041 return DAG.getBitcast(VT, And);
11042}
11043
11044/// Try to emit a blend instruction for a shuffle using bit math.
11045///
11046/// This is used as a fallback approach when first class blend instructions are
11047/// unavailable. Currently it is only suitable for integer vectors, but could
11048/// be generalized for floating point vectors if desirable.
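/// For example (illustrative, v4i32): the blend mask <4, 1, 6, 3> takes V2 in
/// elements 0 and 2 and V1 in elements 1 and 3; the select mask built here is
/// <0, -1, 0, -1> (all-ones where V1 is chosen), which feeds getBitSelect.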
11049static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11050 SDValue V2, ArrayRef<int> Mask,
11051 SelectionDAG &DAG) {
11052 assert(VT.isInteger() && "Only supports integer vector types!");
11053 MVT EltVT = VT.getVectorElementType();
11054 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11055 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11057 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11058 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11059 return SDValue(); // Shuffled input!
11060 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11061 }
11062
11063 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11064 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11065}
11066
11067static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11068 SDValue PreservedSrc,
11069 const X86Subtarget &Subtarget,
11070 SelectionDAG &DAG);
11071
11072static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11073 MutableArrayRef<int> Mask,
11074 const APInt &Zeroable, bool &ForceV1Zero,
11075 bool &ForceV2Zero, uint64_t &BlendMask) {
11076 bool V1IsZeroOrUndef =
11077 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11078 bool V2IsZeroOrUndef =
11079 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11080
11081 BlendMask = 0;
11082 ForceV1Zero = false, ForceV2Zero = false;
11083 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11084
11085 int NumElts = Mask.size();
11086 int NumLanes = VT.getSizeInBits() / 128;
11087 int NumEltsPerLane = NumElts / NumLanes;
11088 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11089
11090 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11091 // then ensure the blend mask part for that lane just references that input.
11092 bool ForceWholeLaneMasks =
11093 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11094
11095 // Attempt to generate the binary blend mask. If an input is zero then
11096 // we can use any lane.
11097 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11098 // Keep track of the inputs used per lane.
11099 bool LaneV1InUse = false;
11100 bool LaneV2InUse = false;
11101 uint64_t LaneBlendMask = 0;
11102 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11103 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11104 int M = Mask[Elt];
11105 if (M == SM_SentinelUndef)
11106 continue;
11107 if (M == Elt || (0 <= M && M < NumElts &&
11108 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11109 Mask[Elt] = Elt;
11110 LaneV1InUse = true;
11111 continue;
11112 }
11113 if (M == (Elt + NumElts) ||
11114 (NumElts <= M &&
11115 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11116 LaneBlendMask |= 1ull << LaneElt;
11117 Mask[Elt] = Elt + NumElts;
11118 LaneV2InUse = true;
11119 continue;
11120 }
11121 if (Zeroable[Elt]) {
11122 if (V1IsZeroOrUndef) {
11123 ForceV1Zero = true;
11124 Mask[Elt] = Elt;
11125 LaneV1InUse = true;
11126 continue;
11127 }
11128 if (V2IsZeroOrUndef) {
11129 ForceV2Zero = true;
11130 LaneBlendMask |= 1ull << LaneElt;
11131 Mask[Elt] = Elt + NumElts;
11132 LaneV2InUse = true;
11133 continue;
11134 }
11135 }
11136 return false;
11137 }
11138
11139 // If we only used V2 then splat the lane blend mask to avoid any demanded
11140 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11141 // blend mask bit).
11142 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11143 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11144
11145 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11146 }
11147 return true;
11148}
11149
11150/// Try to emit a blend instruction for a shuffle.
11151///
11152/// This doesn't do any checks for the availability of instructions for blending
11153/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11154/// be matched in the backend with the type given. What it does check for is
11155/// that the shuffle mask is a blend, or convertible into a blend with zero.
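/// For example (illustrative, v4i32 with SSE4.1): the mask <0, 5, 2, 7> is a
/// blend taking elements 1 and 3 from V2, so it becomes an X86ISD::BLENDI node
/// with immediate 0b1010.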
11156static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11157 SDValue V2, ArrayRef<int> Original,
11158 const APInt &Zeroable,
11159 const X86Subtarget &Subtarget,
11160 SelectionDAG &DAG) {
11161 uint64_t BlendMask = 0;
11162 bool ForceV1Zero = false, ForceV2Zero = false;
11163 SmallVector<int, 64> Mask(Original);
11164 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11165 BlendMask))
11166 return SDValue();
11167
11168 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11169 if (ForceV1Zero)
11170 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11171 if (ForceV2Zero)
11172 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11173
11174 unsigned NumElts = VT.getVectorNumElements();
11175
11176 switch (VT.SimpleTy) {
11177 case MVT::v4i64:
11178 case MVT::v8i32:
11179 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11180 [[fallthrough]];
11181 case MVT::v4f64:
11182 case MVT::v8f32:
11183 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11184 [[fallthrough]];
11185 case MVT::v2f64:
11186 case MVT::v2i64:
11187 case MVT::v4f32:
11188 case MVT::v4i32:
11189 case MVT::v8i16:
11190 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11191 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11192 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11193 case MVT::v16i16: {
11194 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11195 SmallVector<int, 8> RepeatedMask;
11196 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11197 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11198 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11199 BlendMask = 0;
11200 for (int i = 0; i < 8; ++i)
11201 if (RepeatedMask[i] >= 8)
11202 BlendMask |= 1ull << i;
11203 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11204 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11205 }
11206 // Use PBLENDW for lower/upper lanes and then blend lanes.
11207 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11208 // merge to VSELECT where useful.
11209 uint64_t LoMask = BlendMask & 0xFF;
11210 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11211 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11212 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11213 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11214 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11215 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11216 return DAG.getVectorShuffle(
11217 MVT::v16i16, DL, Lo, Hi,
11218 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11219 }
11220 [[fallthrough]];
11221 }
11222 case MVT::v32i8:
11223 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11224 [[fallthrough]];
11225 case MVT::v16i8: {
11226 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11227
11228 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11229 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11230 Subtarget, DAG))
11231 return Masked;
11232
11233 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11234 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11235 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11236 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11237 }
11238
11239 // If we have VPTERNLOG, we can use that as a bit blend.
11240 if (Subtarget.hasVLX())
11241 if (SDValue BitBlend =
11242 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11243 return BitBlend;
11244
11245 // Scale the blend by the number of bytes per element.
11246 int Scale = VT.getScalarSizeInBits() / 8;
11247
11248 // This form of blend is always done on bytes. Compute the byte vector
11249 // type.
11250 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11251
11252 // x86 allows load folding with blendvb from the 2nd source operand. But
11253 // we are still using LLVM select here (see comment below), so that's V1.
11254 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11255 // allow that load-folding possibility.
11256 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11257 ShuffleVectorSDNode::commuteMask(Mask);
11258 std::swap(V1, V2);
11259 }
11260
11261 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11262 // mix of LLVM's code generator and the x86 backend. We tell the code
11263 // generator that boolean values in the elements of an x86 vector register
11264 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11265 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11266 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11267 // of the element (the remaining are ignored) and 0 in that high bit would
11268 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11269 // the LLVM model for boolean values in vector elements gets the relevant
11270 // bit set, it is set backwards and over constrained relative to x86's
11271 // actual model.
11272 SmallVector<SDValue, 32> VSELECTMask;
11273 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11274 for (int j = 0; j < Scale; ++j)
11275 VSELECTMask.push_back(
11276 Mask[i] < 0
11277 ? DAG.getUNDEF(MVT::i8)
11278 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11279
11280 V1 = DAG.getBitcast(BlendVT, V1);
11281 V2 = DAG.getBitcast(BlendVT, V2);
11282 return DAG.getBitcast(
11283 VT,
11284 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11285 V1, V2));
11286 }
11287 case MVT::v16f32:
11288 case MVT::v8f64:
11289 case MVT::v8i64:
11290 case MVT::v16i32:
11291 case MVT::v32i16:
11292 case MVT::v64i8: {
11293 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11294 bool OptForSize = DAG.shouldOptForSize();
11295 if (!OptForSize) {
11296 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11297 Subtarget, DAG))
11298 return Masked;
11299 }
11300
11301 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11302 // masked move.
11303 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11304 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11305 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11306 }
11307 default:
11308 llvm_unreachable("Not a supported integer vector type!");
11309 }
11310}
11311
11312/// Try to lower as a blend of elements from two inputs followed by
11313/// a single-input permutation.
11314///
11315/// This matches the pattern where we can blend elements from two inputs and
11316/// then reduce the shuffle to a single-input permutation.
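/// For example (illustrative, v4i32): the mask <1, 4, 3, 6> can be lowered as
/// a blend with mask <4, 1, 6, 3> followed by the single-input permute
/// <1, 0, 3, 2>.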
11317static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11318 SDValue V1, SDValue V2,
11319 ArrayRef<int> Mask,
11320 SelectionDAG &DAG,
11321 bool ImmBlends = false) {
11322 // We build up the blend mask while checking whether a blend is a viable way
11323 // to reduce the shuffle.
11324 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11325 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11326
11327 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11328 if (Mask[i] < 0)
11329 continue;
11330
11331 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11332
11333 if (BlendMask[Mask[i] % Size] < 0)
11334 BlendMask[Mask[i] % Size] = Mask[i];
11335 else if (BlendMask[Mask[i] % Size] != Mask[i])
11336 return SDValue(); // Can't blend in the needed input!
11337
11338 PermuteMask[i] = Mask[i] % Size;
11339 }
11340
11341 // If only immediate blends, then bail if the blend mask can't be widened to
11342 // i16.
11343 unsigned EltSize = VT.getScalarSizeInBits();
11344 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11345 return SDValue();
11346
11347 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11348 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11349}
11350
11351/// Try to lower as an unpack of elements from two inputs followed by
11352/// a single-input permutation.
11353///
11354/// This matches the pattern where we can unpack elements from two inputs and
11355/// then reduce the shuffle to a single-input (wider) permutation.
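/// For example, a v4i32 mask <1,5,0,4> can be lowered as UNPCKL(V1, V2)
/// followed by the single-input permute mask <2,3,0,1>.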
11356 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11357 SDValue V1, SDValue V2,
11358 ArrayRef<int> Mask,
11359 SelectionDAG &DAG) {
11360 int NumElts = Mask.size();
11361 int NumLanes = VT.getSizeInBits() / 128;
11362 int NumLaneElts = NumElts / NumLanes;
11363 int NumHalfLaneElts = NumLaneElts / 2;
11364
11365 bool MatchLo = true, MatchHi = true;
11366 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11367
11368 // Determine UNPCKL/UNPCKH type and operand order.
11369 for (int Elt = 0; Elt != NumElts; ++Elt) {
11370 int M = Mask[Elt];
11371 if (M < 0)
11372 continue;
11373
11374 // Normalize the mask value depending on whether it's V1 or V2.
11375 int NormM = M;
11376 SDValue &Op = Ops[Elt & 1];
11377 if (M < NumElts && (Op.isUndef() || Op == V1))
11378 Op = V1;
11379 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11380 Op = V2;
11381 NormM -= NumElts;
11382 } else
11383 return SDValue();
11384
11385 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11386 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11387 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11388 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11389 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11390 if (MatchLoAnyLane || MatchHiAnyLane) {
11391 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11392 "Failed to match UNPCKLO/UNPCKHI");
11393 break;
11394 }
11395 }
11396 MatchLo &= MatchLoAnyLane;
11397 MatchHi &= MatchHiAnyLane;
11398 if (!MatchLo && !MatchHi)
11399 return SDValue();
11400 }
11401 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11402
11403 // Element indices have changed after unpacking. Calculate permute mask
11404 // so that they will be put back to the position as dictated by the
11405 // original shuffle mask indices.
11406 SmallVector<int, 32> PermuteMask(NumElts, -1);
11407 for (int Elt = 0; Elt != NumElts; ++Elt) {
11408 int M = Mask[Elt];
11409 if (M < 0)
11410 continue;
11411 int NormM = M;
11412 if (NumElts <= M)
11413 NormM -= NumElts;
11414 bool IsFirstOp = M < NumElts;
11415 int BaseMaskElt =
11416 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11417 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11418 PermuteMask[Elt] = BaseMaskElt;
11419 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11420 PermuteMask[Elt] = BaseMaskElt + 1;
11421 assert(PermuteMask[Elt] != -1 &&
11422 "Input mask element is defined but failed to assign permute mask");
11423 }
11424
11425 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11426 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11427 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11428}
11429
11430/// Try to lower a shuffle as a permute of the inputs followed by an
11431/// UNPCK instruction.
11432///
11433/// This specifically targets cases where we end up with alternating between
11434/// the two inputs, and so can permute them into something that feeds a single
11435/// UNPCK instruction. Note that this routine only targets integer vectors
11436/// because for floating point vectors we have a generalized SHUFPS lowering
11437/// strategy that handles everything that doesn't *exactly* match an unpack,
11438/// making this clever lowering unnecessary.
11439 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11440 SDValue V1, SDValue V2,
11441 ArrayRef<int> Mask,
11442 const X86Subtarget &Subtarget,
11443 SelectionDAG &DAG) {
11444 int Size = Mask.size();
11445 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11446
11447 // This routine only supports 128-bit integer dual input vectors.
11448 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11449 return SDValue();
11450
11451 int NumLoInputs =
11452 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11453 int NumHiInputs =
11454 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11455
11456 bool UnpackLo = NumLoInputs >= NumHiInputs;
11457
11458 auto TryUnpack = [&](int ScalarSize, int Scale) {
11459 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11460 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11461
11462 for (int i = 0; i < Size; ++i) {
11463 if (Mask[i] < 0)
11464 continue;
11465
11466 // Each element of the unpack contains Scale elements from this mask.
11467 int UnpackIdx = i / Scale;
11468
11469 // We only handle the case where V1 feeds the first slots of the unpack.
11470 // We rely on canonicalization to ensure this is the case.
11471 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11472 return SDValue();
11473
11474 // Setup the mask for this input. The indexing is tricky as we have to
11475 // handle the unpack stride.
11476 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11477 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11478 Mask[i] % Size;
11479 }
11480
11481 // If we will have to shuffle both inputs to use the unpack, check whether
11482 // we can just unpack first and shuffle the result. If so, skip this unpack.
11483 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11484 !isNoopShuffleMask(V2Mask))
11485 return SDValue();
11486
11487 // Shuffle the inputs into place.
11488 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11489 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11490
11491 // Cast the inputs to the type we will use to unpack them.
11492 MVT UnpackVT =
11493 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11494 V1 = DAG.getBitcast(UnpackVT, V1);
11495 V2 = DAG.getBitcast(UnpackVT, V2);
11496
11497 // Unpack the inputs and cast the result back to the desired type.
11498 return DAG.getBitcast(
11499 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11500 UnpackVT, V1, V2));
11501 };
11502
11503 // We try each unpack from the largest to the smallest to try and find one
11504 // that fits this mask.
11505 int OrigScalarSize = VT.getScalarSizeInBits();
11506 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11507 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11508 return Unpack;
11509
11510 // If we're shuffling with a zero vector then we're better off not doing
11511 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11512 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11513 ISD::isBuildVectorAllZeros(V2.getNode()))
11514 return SDValue();
11515
11516 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11517 // initial unpack.
11518 if (NumLoInputs == 0 || NumHiInputs == 0) {
11519 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11520 "We have to have *some* inputs!");
11521 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11522
11523 // FIXME: We could consider the total complexity of the permute of each
11524 // possible unpacking. Or at the least we should consider how many
11525 // half-crossings are created.
11526 // FIXME: We could consider commuting the unpacks.
11527
11528 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11529 for (int i = 0; i < Size; ++i) {
11530 if (Mask[i] < 0)
11531 continue;
11532
11533 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11534
11535 PermMask[i] =
11536 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11537 }
11538 return DAG.getVectorShuffle(
11539 VT, DL,
11540 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11541 V1, V2),
11542 DAG.getUNDEF(VT), PermMask);
11543 }
11544
11545 return SDValue();
11546}
11547
11548/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11549/// permuting the elements of the result in place.
11550 static SDValue lowerShuffleAsByteRotateAndPermute(
11551 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11552 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11553 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11554 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11555 (VT.is512BitVector() && !Subtarget.hasBWI()))
11556 return SDValue();
11557
11558 // We don't currently support lane crossing permutes.
11559 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11560 return SDValue();
11561
11562 int Scale = VT.getScalarSizeInBits() / 8;
11563 int NumLanes = VT.getSizeInBits() / 128;
11564 int NumElts = VT.getVectorNumElements();
11565 int NumEltsPerLane = NumElts / NumLanes;
11566
11567 // Determine range of mask elts.
11568 bool Blend1 = true;
11569 bool Blend2 = true;
11570 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11571 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11572 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11573 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11574 int M = Mask[Lane + Elt];
11575 if (M < 0)
11576 continue;
11577 if (M < NumElts) {
11578 Blend1 &= (M == (Lane + Elt));
11579 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11580 M = M % NumEltsPerLane;
11581 Range1.first = std::min(Range1.first, M);
11582 Range1.second = std::max(Range1.second, M);
11583 } else {
11584 M -= NumElts;
11585 Blend2 &= (M == (Lane + Elt));
11586 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11587 M = M % NumEltsPerLane;
11588 Range2.first = std::min(Range2.first, M);
11589 Range2.second = std::max(Range2.second, M);
11590 }
11591 }
11592 }
11593
11594 // Bail if we don't need both elements.
11595 // TODO - it might be worth doing this for unary shuffles if the permute
11596 // can be widened.
11597 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11598 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11599 return SDValue();
11600
11601 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11602 return SDValue();
11603
11604 // Rotate the 2 ops so we can access both ranges, then permute the result.
11605 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11606 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11607 SDValue Rotate = DAG.getBitcast(
11608 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11609 DAG.getBitcast(ByteVT, Lo),
11610 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11611 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11612 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11613 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11614 int M = Mask[Lane + Elt];
11615 if (M < 0)
11616 continue;
11617 if (M < NumElts)
11618 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11619 else
11620 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11621 }
11622 }
11623 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11624 };
11625
11626 // Check if the ranges are small enough to rotate from either direction.
11627 if (Range2.second < Range1.first)
11628 return RotateAndPermute(V1, V2, Range1.first, 0);
11629 if (Range1.second < Range2.first)
11630 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11631 return SDValue();
11632}
11633
11634 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11635 return isUndefOrEqual(Mask, 0);
11636}
11637
11638 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11639 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11640}
11641
11642/// Check if the Mask consists of the same element repeated multiple times.
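/// For example, <2,2,-1,2,-1,2,2,2> is treated as a repeat of element 2, while
/// <2,-1,-1,-1,-1,-1,-1,2> is not (too many undefs for the repeat to matter).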
11643 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11644 size_t NumUndefs = 0;
11645 std::optional<int> UniqueElt;
11646 for (int Elt : Mask) {
11647 if (Elt == SM_SentinelUndef) {
11648 NumUndefs++;
11649 continue;
11650 }
11651 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11652 return false;
11653 UniqueElt = Elt;
11654 }
11655 // Make sure the element is repeated enough times by checking the number of
11656 // undefs is small.
11657 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11658}
11659
11660/// Generic routine to decompose a shuffle and blend into independent
11661/// blends and permutes.
11662///
11663/// This matches the extremely common pattern for handling combined
11664/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11665/// operations. It will try to pick the best arrangement of shuffles and
11666/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
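/// For example, a v4i32 mask <3,7,1,5> decomposes into the single-input
/// permutes <3,u,1,u> (V1) and <u,3,u,1> (V2) followed by the blend <0,5,2,7>.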
11667 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11668 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11669 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11670 int NumElts = Mask.size();
11671 int NumLanes = VT.getSizeInBits() / 128;
11672 int NumEltsPerLane = NumElts / NumLanes;
11673
11674 // Shuffle the input elements into the desired positions in V1 and V2 and
11675 // unpack/blend them together.
11676 bool IsAlternating = true;
11677 bool V1Zero = true, V2Zero = true;
11678 SmallVector<int, 32> V1Mask(NumElts, -1);
11679 SmallVector<int, 32> V2Mask(NumElts, -1);
11680 SmallVector<int, 32> FinalMask(NumElts, -1);
11681 for (int i = 0; i < NumElts; ++i) {
11682 int M = Mask[i];
11683 if (M >= 0 && M < NumElts) {
11684 V1Mask[i] = M;
11685 FinalMask[i] = i;
11686 V1Zero &= Zeroable[i];
11687 IsAlternating &= (i & 1) == 0;
11688 } else if (M >= NumElts) {
11689 V2Mask[i] = M - NumElts;
11690 FinalMask[i] = i + NumElts;
11691 V2Zero &= Zeroable[i];
11692 IsAlternating &= (i & 1) == 1;
11693 }
11694 }
11695
11696 // If we effectively demand only the 0'th element of \p Input, though not
11697 // necessarily only in the 0'th position, then broadcast said input,
11698 // and change \p InputMask to be a no-op (identity) mask.
11699 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11700 &DAG](SDValue &Input,
11701 MutableArrayRef<int> InputMask) {
11702 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11703 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11704 !X86::mayFoldLoad(Input, Subtarget)))
11705 return;
11706 if (isNoopShuffleMask(InputMask))
11707 return;
11708 assert(isBroadcastShuffleMask(InputMask) &&
11709 "Expected to demand only the 0'th element.");
11710 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11711 for (auto I : enumerate(InputMask)) {
11712 int &InputMaskElt = I.value();
11713 if (InputMaskElt >= 0)
11714 InputMaskElt = I.index();
11715 }
11716 };
11717
11718 // Currently, we may need to produce one shuffle per input, and blend results.
11719 // It is possible that the shuffle for one of the inputs is already a no-op.
11720 // See if we can simplify non-no-op shuffles into broadcasts,
11721 // which we consider to be strictly better than an arbitrary shuffle.
11722 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11723 isNoopOrBroadcastShuffleMask(V2Mask)) {
11724 canonicalizeBroadcastableInput(V1, V1Mask);
11725 canonicalizeBroadcastableInput(V2, V2Mask);
11726 }
11727
11728 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11729 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11730 // the shuffle may be able to fold with a load or other benefit. However, when
11731 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11732 // pre-shuffle first is a better strategy.
11733 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11734 // If we don't have blends, see if we can create a cheap unpack.
11735 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11736 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11737 is128BitUnpackShuffleMask(V2Mask, DAG)))
11738 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11739 DL, VT, V1, V2, Mask, Subtarget, DAG))
11740 return PermUnpack;
11741
11742 // Only prefer immediate blends to unpack/rotate.
11743 if (SDValue BlendPerm =
11744 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11745 return BlendPerm;
11746
11747 // If either input vector provides only a single element which is repeated
11748 // multiple times, unpacking from both input vectors would generate worse
11749 // code. e.g. for
11750 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11751 // it is better to process t4 first to create a vector of t4[0], then unpack
11752 // that vector with t2.
11753 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11754 !isSingleElementRepeatedMask(V2Mask))
11755 if (SDValue UnpackPerm =
11756 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11757 return UnpackPerm;
11758
11759 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11760 DL, VT, V1, V2, Mask, Subtarget, DAG))
11761 return RotatePerm;
11762
11763 // Unpack/rotate failed - try again with variable blends.
11764 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11765 DAG))
11766 return BlendPerm;
11767
11768 if (VT.getScalarSizeInBits() >= 32)
11769 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11770 DL, VT, V1, V2, Mask, Subtarget, DAG))
11771 return PermUnpack;
11772 }
11773
11774 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11775 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11776 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11777 // than half the elements coming from each source.
11778 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11779 V1Mask.assign(NumElts, -1);
11780 V2Mask.assign(NumElts, -1);
11781 FinalMask.assign(NumElts, -1);
11782 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11783 for (int j = 0; j != NumEltsPerLane; ++j) {
11784 int M = Mask[i + j];
11785 if (M >= 0 && M < NumElts) {
11786 V1Mask[i + (j / 2)] = M;
11787 FinalMask[i + j] = i + (j / 2);
11788 } else if (M >= NumElts) {
11789 V2Mask[i + (j / 2)] = M - NumElts;
11790 FinalMask[i + j] = i + (j / 2) + NumElts;
11791 }
11792 }
11793 }
11794
11795 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11796 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11797 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11798}
11799
11800static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11801 const X86Subtarget &Subtarget,
11802 ArrayRef<int> Mask) {
11803 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11804 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11805
11806 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11807 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11808 int MaxSubElts = 64 / EltSizeInBits;
11809 unsigned RotateAmt, NumSubElts;
11810 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11811 MaxSubElts, NumSubElts, RotateAmt))
11812 return -1;
11813 unsigned NumElts = Mask.size();
11814 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11815 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11816 return RotateAmt;
11817}
11818
11819/// Lower shuffle using X86ISD::VROTLI rotations.
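/// For example, the v16i8 mask <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14>
/// rotates each i32 element left by 8 bits (RotateVT = v4i32, RotateAmt = 8).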
11820 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11821 ArrayRef<int> Mask,
11822 const X86Subtarget &Subtarget,
11823 SelectionDAG &DAG) {
11824 // Only XOP + AVX512 targets have bit rotation instructions.
11825 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11826 bool IsLegal =
11827 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11828 if (!IsLegal && Subtarget.hasSSE3())
11829 return SDValue();
11830
11831 MVT RotateVT;
11832 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11833 Subtarget, Mask);
11834 if (RotateAmt < 0)
11835 return SDValue();
11836
11837 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11838 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11839 // widen to vXi16 or more then the existing lowering should be better.
11840 if (!IsLegal) {
11841 if ((RotateAmt % 16) == 0)
11842 return SDValue();
11843 // TODO: Use getTargetVShiftByConstNode.
11844 unsigned ShlAmt = RotateAmt;
11845 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11846 V1 = DAG.getBitcast(RotateVT, V1);
11847 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11848 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11849 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11850 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11851 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11852 return DAG.getBitcast(VT, Rot);
11853 }
11854
11855 SDValue Rot =
11856 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11857 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11858 return DAG.getBitcast(VT, Rot);
11859}
11860
11861/// Try to match a vector shuffle as an element rotation.
11862///
11863 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11864 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11865 ArrayRef<int> Mask) {
11866 int NumElts = Mask.size();
11867
11868 // We need to detect various ways of spelling a rotation:
11869 // [11, 12, 13, 14, 15, 0, 1, 2]
11870 // [-1, 12, 13, 14, -1, -1, 1, -1]
11871 // [-1, -1, -1, -1, -1, -1, 1, 2]
11872 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11873 // [-1, 4, 5, 6, -1, -1, 9, -1]
11874 // [-1, 4, 5, 6, -1, -1, -1, -1]
11875 int Rotation = 0;
11876 SDValue Lo, Hi;
11877 for (int i = 0; i < NumElts; ++i) {
11878 int M = Mask[i];
11879 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11880 "Unexpected mask index.");
11881 if (M < 0)
11882 continue;
11883
11884 // Determine where a rotated vector would have started.
11885 int StartIdx = i - (M % NumElts);
11886 if (StartIdx == 0)
11887 // The identity rotation isn't interesting, stop.
11888 return -1;
11889
11890 // If we found the tail of a vector the rotation must be the missing
11891 // front. If we found the head of a vector, it must be how much of the
11892 // head.
11893 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11894
11895 if (Rotation == 0)
11896 Rotation = CandidateRotation;
11897 else if (Rotation != CandidateRotation)
11898 // The rotations don't match, so we can't match this mask.
11899 return -1;
11900
11901 // Compute which value this mask is pointing at.
11902 SDValue MaskV = M < NumElts ? V1 : V2;
11903
11904 // Compute which of the two target values this index should be assigned
11905 // to. This reflects whether the high elements are remaining or the low
11906 // elements are remaining.
11907 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11908
11909 // Either set up this value if we've not encountered it before, or check
11910 // that it remains consistent.
11911 if (!TargetV)
11912 TargetV = MaskV;
11913 else if (TargetV != MaskV)
11914 // This may be a rotation, but it pulls from the inputs in some
11915 // unsupported interleaving.
11916 return -1;
11917 }
11918
11919 // Check that we successfully analyzed the mask, and normalize the results.
11920 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11921 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11922 if (!Lo)
11923 Lo = Hi;
11924 else if (!Hi)
11925 Hi = Lo;
11926
11927 V1 = Lo;
11928 V2 = Hi;
11929
11930 return Rotation;
11931}
11932
11933/// Try to lower a vector shuffle as a byte rotation.
11934///
11935/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11936/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11937/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11938 /// try to generically lower a vector shuffle through such a pattern. It
11939/// does not check for the profitability of lowering either as PALIGNR or
11940/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11941/// This matches shuffle vectors that look like:
11942///
11943/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11944///
11945/// Essentially it concatenates V1 and V2, shifts right by some number of
11946/// elements, and takes the low elements as the result. Note that while this is
11947/// specified as a *right shift* because x86 is little-endian, it is a *left
11948/// rotate* of the vector lanes.
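/// For the v8i16 example above the element rotation is 3, which scales to a
/// PALIGNR immediate of 6 bytes.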
11949 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11950 ArrayRef<int> Mask) {
11951 // Don't accept any shuffles with zero elements.
11952 if (isAnyZero(Mask))
11953 return -1;
11954
11955 // PALIGNR works on 128-bit lanes.
11956 SmallVector<int, 16> RepeatedMask;
11957 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11958 return -1;
11959
11960 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11961 if (Rotation <= 0)
11962 return -1;
11963
11964 // PALIGNR rotates bytes, so we need to scale the
11965 // rotation based on how many bytes are in the vector lane.
11966 int NumElts = RepeatedMask.size();
11967 int Scale = 16 / NumElts;
11968 return Rotation * Scale;
11969}
11970
11971 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11972 SDValue V2, ArrayRef<int> Mask,
11973 const X86Subtarget &Subtarget,
11974 SelectionDAG &DAG) {
11975 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11976
11977 SDValue Lo = V1, Hi = V2;
11978 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11979 if (ByteRotation <= 0)
11980 return SDValue();
11981
11982 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11983 // PSLLDQ/PSRLDQ.
11984 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11985 Lo = DAG.getBitcast(ByteVT, Lo);
11986 Hi = DAG.getBitcast(ByteVT, Hi);
11987
11988 // SSSE3 targets can use the palignr instruction.
11989 if (Subtarget.hasSSSE3()) {
11990 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11991 "512-bit PALIGNR requires BWI instructions");
11992 return DAG.getBitcast(
11993 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11994 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11995 }
11996
11997 assert(VT.is128BitVector() &&
11998 "Rotate-based lowering only supports 128-bit lowering!");
11999 assert(Mask.size() <= 16 &&
12000 "Can shuffle at most 16 bytes in a 128-bit vector!");
12001 assert(ByteVT == MVT::v16i8 &&
12002 "SSE2 rotate lowering only needed for v16i8!");
12003
12004 // Default SSE2 implementation
12005 int LoByteShift = 16 - ByteRotation;
12006 int HiByteShift = ByteRotation;
12007
12008 SDValue LoShift =
12009 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12010 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12011 SDValue HiShift =
12012 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12013 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12014 return DAG.getBitcast(VT,
12015 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12016}
12017
12018/// Try to lower a vector shuffle as a dword/qword rotation.
12019///
12020 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12021/// rotation of the concatenation of two vectors; This routine will
12022 /// try to generically lower a vector shuffle through such a pattern.
12023///
12024/// Essentially it concatenates V1 and V2, shifts right by some number of
12025/// elements, and takes the low elements as the result. Note that while this is
12026/// specified as a *right shift* because x86 is little-endian, it is a *left
12027/// rotate* of the vector lanes.
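/// For example, a v8i32 mask <3,4,5,6,7,8,9,10> lowers to VALIGND with an
/// immediate of 3.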
12028 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12029 SDValue V2, ArrayRef<int> Mask,
12030 const APInt &Zeroable,
12031 const X86Subtarget &Subtarget,
12032 SelectionDAG &DAG) {
12033 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12034 "Only 32-bit and 64-bit elements are supported!");
12035
12036 // 128/256-bit vectors are only supported with VLX.
12037 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12038 && "VLX required for 128/256-bit vectors");
12039
12040 SDValue Lo = V1, Hi = V2;
12041 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12042 if (0 < Rotation)
12043 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12044 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12045
12046 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12047 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12048 // TODO: We can probably make this more aggressive and use shift-pairs like
12049 // lowerShuffleAsByteShiftMask.
12050 unsigned NumElts = Mask.size();
12051 unsigned ZeroLo = Zeroable.countr_one();
12052 unsigned ZeroHi = Zeroable.countl_one();
12053 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12054 if (!ZeroLo && !ZeroHi)
12055 return SDValue();
12056
12057 if (ZeroLo) {
12058 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12059 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12060 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12061 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12062 getZeroVector(VT, Subtarget, DAG, DL),
12063 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12064 }
12065
12066 if (ZeroHi) {
12067 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12068 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12069 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12070 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12071 getZeroVector(VT, Subtarget, DAG, DL), Src,
12072 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12073 }
12074
12075 return SDValue();
12076}
12077
12078/// Try to lower a vector shuffle as a byte shift sequence.
12079 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12080 SDValue V2, ArrayRef<int> Mask,
12081 const APInt &Zeroable,
12082 const X86Subtarget &Subtarget,
12083 SelectionDAG &DAG) {
12084 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12085 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12086
12087 // We need a shuffle that has zeros at one/both ends and a sequential
12088 // shuffle from one source within.
12089 unsigned ZeroLo = Zeroable.countr_one();
12090 unsigned ZeroHi = Zeroable.countl_one();
12091 if (!ZeroLo && !ZeroHi)
12092 return SDValue();
12093
12094 unsigned NumElts = Mask.size();
12095 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12096 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12097 return SDValue();
12098
12099 unsigned Scale = VT.getScalarSizeInBits() / 8;
12100 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12101 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12102 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12103 return SDValue();
12104
12105 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12106 Res = DAG.getBitcast(MVT::v16i8, Res);
12107
12108 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12109 // inner sequential set of elements, possibly offset:
12110 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12111 // 01234567 --> 4567zzzz --> zzzzz456
12112 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12113 if (ZeroLo == 0) {
12114 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12115 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12116 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12117 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12118 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12119 } else if (ZeroHi == 0) {
12120 unsigned Shift = Mask[ZeroLo] % NumElts;
12121 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12122 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12123 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12124 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12125 } else if (!Subtarget.hasSSSE3()) {
12126 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12127 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12128 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12129 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12130 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12131 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12132 Shift += Mask[ZeroLo] % NumElts;
12133 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12134 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12135 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12136 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12137 } else
12138 return SDValue();
12139
12140 return DAG.getBitcast(VT, Res);
12141}
12142
12143/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12144///
12145/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12146/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12147/// matches elements from one of the input vectors shuffled to the left or
12148/// right with zeroable elements 'shifted in'. It handles both the strictly
12149/// bit-wise element shifts and the byte shift across an entire 128-bit double
12150/// quad word lane.
12151///
12152/// PSHL : (little-endian) left bit shift.
12153/// [ zz, 0, zz, 2 ]
12154/// [ -1, 4, zz, -1 ]
12155/// PSRL : (little-endian) right bit shift.
12156/// [ 1, zz, 3, zz]
12157/// [ -1, -1, 7, zz]
12158/// PSLLDQ : (little-endian) left byte shift
12159/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12160/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12161/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12162/// PSRLDQ : (little-endian) right byte shift
12163/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12164/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12165/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12166static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12167 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12168 int MaskOffset, const APInt &Zeroable,
12169 const X86Subtarget &Subtarget) {
12170 int Size = Mask.size();
12171 unsigned SizeInBits = Size * ScalarSizeInBits;
12172
12173 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12174 for (int i = 0; i < Size; i += Scale)
12175 for (int j = 0; j < Shift; ++j)
12176 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12177 return false;
12178
12179 return true;
12180 };
12181
12182 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12183 for (int i = 0; i != Size; i += Scale) {
12184 unsigned Pos = Left ? i + Shift : i;
12185 unsigned Low = Left ? i : i + Shift;
12186 unsigned Len = Scale - Shift;
12187 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12188 return -1;
12189 }
12190
12191 int ShiftEltBits = ScalarSizeInBits * Scale;
12192 bool ByteShift = ShiftEltBits > 64;
12193 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12194 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12195 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12196
12197 // Normalize the scale for byte shifts to still produce an i64 element
12198 // type.
12199 Scale = ByteShift ? Scale / 2 : Scale;
12200
12201 // We need to round trip through the appropriate type for the shift.
12202 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12203 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12204 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12205 return (int)ShiftAmt;
12206 };
12207
12208 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12209 // keep doubling the size of the integer elements up to that. We can
12210 // then shift the elements of the integer vector by whole multiples of
12211 // their width within the elements of the larger integer vector. Test each
12212 // multiple to see if we can find a match with the moved element indices
12213 // and that the shifted in elements are all zeroable.
12214 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12215 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12216 for (int Shift = 1; Shift != Scale; ++Shift)
12217 for (bool Left : {true, false})
12218 if (CheckZeros(Shift, Scale, Left)) {
12219 int ShiftAmt = MatchShift(Shift, Scale, Left);
12220 if (0 < ShiftAmt)
12221 return ShiftAmt;
12222 }
12223
12224 // no match
12225 return -1;
12226}
12227
12228 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12229 SDValue V2, ArrayRef<int> Mask,
12230 const APInt &Zeroable,
12231 const X86Subtarget &Subtarget,
12232 SelectionDAG &DAG, bool BitwiseOnly) {
12233 int Size = Mask.size();
12234 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12235
12236 MVT ShiftVT;
12237 SDValue V = V1;
12238 unsigned Opcode;
12239
12240 // Try to match shuffle against V1 shift.
12241 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12242 Mask, 0, Zeroable, Subtarget);
12243
12244 // If V1 failed, try to match shuffle against V2 shift.
12245 if (ShiftAmt < 0) {
12246 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12247 Mask, Size, Zeroable, Subtarget);
12248 V = V2;
12249 }
12250
12251 if (ShiftAmt < 0)
12252 return SDValue();
12253
12254 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12255 return SDValue();
12256
12257 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12258 "Illegal integer vector type");
12259 V = DAG.getBitcast(ShiftVT, V);
12260 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12261 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12262 return DAG.getBitcast(VT, V);
12263}
12264
12265// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12266// Remainder of lower half result is zero and upper half is all undef.
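// e.g. a v8i16 mask <1,2,3,zz,u,u,u,u> extracts Len = 3 elements starting at
// Idx = 1, giving BitLen = 48 and BitIdx = 16.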
12267static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12268 ArrayRef<int> Mask, uint64_t &BitLen,
12269 uint64_t &BitIdx, const APInt &Zeroable) {
12270 int Size = Mask.size();
12271 int HalfSize = Size / 2;
12272 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12273 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12274
12275 // Upper half must be undefined.
12276 if (!isUndefUpperHalf(Mask))
12277 return false;
12278
12279 // Determine the extraction length from the part of the
12280 // lower half that isn't zeroable.
12281 int Len = HalfSize;
12282 for (; Len > 0; --Len)
12283 if (!Zeroable[Len - 1])
12284 break;
12285 assert(Len > 0 && "Zeroable shuffle mask");
12286
12287 // Attempt to match first Len sequential elements from the lower half.
12288 SDValue Src;
12289 int Idx = -1;
12290 for (int i = 0; i != Len; ++i) {
12291 int M = Mask[i];
12292 if (M == SM_SentinelUndef)
12293 continue;
12294 SDValue &V = (M < Size ? V1 : V2);
12295 M = M % Size;
12296
12297 // The extracted elements must start at a valid index and all mask
12298 // elements must be in the lower half.
12299 if (i > M || M >= HalfSize)
12300 return false;
12301
12302 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12303 Src = V;
12304 Idx = M - i;
12305 continue;
12306 }
12307 return false;
12308 }
12309
12310 if (!Src || Idx < 0)
12311 return false;
12312
12313 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12314 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12315 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12316 V1 = Src;
12317 return true;
12318}
12319
12320// INSERTQ: Extract lowest Len elements from lower half of second source and
12321// insert over first source, starting at Idx.
12322// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
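// e.g. a v8i16 mask <8,9,2,3,u,u,u,u> inserts the lowest Len = 2 elements of
// V2 over V1 at Idx = 0, giving BitLen = 32 and BitIdx = 0.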
12323static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12324 ArrayRef<int> Mask, uint64_t &BitLen,
12325 uint64_t &BitIdx) {
12326 int Size = Mask.size();
12327 int HalfSize = Size / 2;
12328 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12329
12330 // Upper half must be undefined.
12331 if (!isUndefUpperHalf(Mask))
12332 return false;
12333
12334 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12335 SDValue Base;
12336
12337 // Attempt to match first source from mask before insertion point.
12338 if (isUndefInRange(Mask, 0, Idx)) {
12339 /* EMPTY */
12340 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12341 Base = V1;
12342 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12343 Base = V2;
12344 } else {
12345 continue;
12346 }
12347
12348 // Extend the extraction length looking to match both the insertion of
12349 // the second source and the remaining elements of the first.
12350 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12351 SDValue Insert;
12352 int Len = Hi - Idx;
12353
12354 // Match insertion.
12355 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12356 Insert = V1;
12357 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12358 Insert = V2;
12359 } else {
12360 continue;
12361 }
12362
12363 // Match the remaining elements of the lower half.
12364 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12365 /* EMPTY */
12366 } else if ((!Base || (Base == V1)) &&
12367 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12368 Base = V1;
12369 } else if ((!Base || (Base == V2)) &&
12370 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12371 Size + Hi)) {
12372 Base = V2;
12373 } else {
12374 continue;
12375 }
12376
12377 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12378 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12379 V1 = Base;
12380 V2 = Insert;
12381 return true;
12382 }
12383 }
12384
12385 return false;
12386}
12387
12388/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12389 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12390 SDValue V2, ArrayRef<int> Mask,
12391 const APInt &Zeroable, SelectionDAG &DAG) {
12392 uint64_t BitLen, BitIdx;
12393 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12394 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12395 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12396 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12397
12398 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12399 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12400 V2 ? V2 : DAG.getUNDEF(VT),
12401 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12402 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12403
12404 return SDValue();
12405}
12406
12407/// Lower a vector shuffle as an any/signed/zero extension.
12408///
12409/// Given a specific number of elements, element bit width, and extension
12410 /// stride, produce an extension based on the available
12411 /// features of the subtarget. The extended elements are consecutive and
12412 /// can start from an offset element index in the input; to
12413 /// avoid excess shuffling the offset must either be in the bottom lane
12414/// or at the start of a higher lane. All extended elements must be from
12415/// the same lane.
12416 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12417 int Scale, int Offset,
12418 unsigned ExtOpc, SDValue InputV,
12419 ArrayRef<int> Mask,
12420 const X86Subtarget &Subtarget,
12421 SelectionDAG &DAG) {
12422 assert(Scale > 1 && "Need a scale to extend.");
12423 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12424 int EltBits = VT.getScalarSizeInBits();
12425 int NumElements = VT.getVectorNumElements();
12426 int NumEltsPerLane = 128 / EltBits;
12427 int OffsetLane = Offset / NumEltsPerLane;
12428 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12429 "Only 8, 16, and 32 bit elements can be extended.");
12430 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12431 assert(0 <= Offset && "Extension offset must be positive.");
12432 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12433 "Extension offset must be in the first lane or start an upper lane.");
12434
12435 // Check that an index is in same lane as the base offset.
12436 auto SafeOffset = [&](int Idx) {
12437 return OffsetLane == (Idx / NumEltsPerLane);
12438 };
12439
12440 // Shift along an input so that the offset base moves to the first element.
12441 auto ShuffleOffset = [&](SDValue V) {
12442 if (!Offset)
12443 return V;
12444
12445 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12446 for (int i = 0; i * Scale < NumElements; ++i) {
12447 int SrcIdx = i + Offset;
12448 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12449 }
12450 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12451 };
12452
12453 // Found a valid a/zext mask! Try various lowering strategies based on the
12454 // input type and available ISA extensions.
12455 if (Subtarget.hasSSE41()) {
12456 // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
12457 // PUNPCK will catch this in a later shuffle match.
12458 if (Offset && Scale == 2 && VT.is128BitVector())
12459 return SDValue();
12460 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12461 NumElements / Scale);
12462 InputV = DAG.getBitcast(VT, InputV);
12463 InputV = ShuffleOffset(InputV);
12464 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12465 return DAG.getBitcast(VT, InputV);
12466 }
12467
12468 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12469 InputV = DAG.getBitcast(VT, InputV);
12470 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12471
12472 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12473 if (ExtOpc == ISD::SIGN_EXTEND)
12474 return SDValue();
12475
12476 // For any extends we can cheat for larger element sizes and use shuffle
12477 // instructions that can fold with a load and/or copy.
12478 if (AnyExt && EltBits == 32) {
12479 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12480 -1};
12481 return DAG.getBitcast(
12482 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12483 DAG.getBitcast(MVT::v4i32, InputV),
12484 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12485 }
12486 if (AnyExt && EltBits == 16 && Scale > 2) {
12487 int PSHUFDMask[4] = {Offset / 2, -1,
12488 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12489 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12490 DAG.getBitcast(MVT::v4i32, InputV),
12491 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12492 int PSHUFWMask[4] = {1, -1, -1, -1};
12493 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12494 return DAG.getBitcast(
12495 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12496 DAG.getBitcast(MVT::v8i16, InputV),
12497 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12498 }
12499
12500 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12501 // to 64-bits.
12502 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12503 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12504 assert(VT.is128BitVector() && "Unexpected vector width!");
12505
12506 int LoIdx = Offset * EltBits;
12507 SDValue Lo = DAG.getBitcast(
12508 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12509 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12510 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12511
12512 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12513 return DAG.getBitcast(VT, Lo);
12514
12515 int HiIdx = (Offset + 1) * EltBits;
12516 SDValue Hi = DAG.getBitcast(
12517 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12518 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12519 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12520 return DAG.getBitcast(VT,
12521 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12522 }
12523
12524 // If this would require more than 2 unpack instructions to expand, use
12525 // pshufb when available. We can only use more than 2 unpack instructions
12526 // when zero extending i8 elements which also makes it easier to use pshufb.
12527 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12528 assert(NumElements == 16 && "Unexpected byte vector width!");
12529 SDValue PSHUFBMask[16];
12530 for (int i = 0; i < 16; ++i) {
12531 int Idx = Offset + (i / Scale);
12532 if ((i % Scale == 0 && SafeOffset(Idx))) {
12533 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12534 continue;
12535 }
12536 PSHUFBMask[i] =
12537 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12538 }
12539 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12540 return DAG.getBitcast(
12541 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12542 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12543 }
12544
12545 // If we are extending from an offset, ensure we start on a boundary that
12546 // we can unpack from.
12547 int AlignToUnpack = Offset % (NumElements / Scale);
12548 if (AlignToUnpack) {
12549 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12550 for (int i = AlignToUnpack; i < NumElements; ++i)
12551 ShMask[i - AlignToUnpack] = i;
12552 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12553 Offset -= AlignToUnpack;
12554 }
12555
12556 // Otherwise emit a sequence of unpacks.
12557 do {
12558 unsigned UnpackLoHi = X86ISD::UNPCKL;
12559 if (Offset >= (NumElements / 2)) {
12560 UnpackLoHi = X86ISD::UNPCKH;
12561 Offset -= (NumElements / 2);
12562 }
12563
12564 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12565 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12566 : getZeroVector(InputVT, Subtarget, DAG, DL);
12567 InputV = DAG.getBitcast(InputVT, InputV);
12568 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12569 Scale /= 2;
12570 EltBits *= 2;
12571 NumElements /= 2;
12572 } while (Scale > 1);
12573 return DAG.getBitcast(VT, InputV);
12574}
12575
12576/// Try to lower a vector shuffle as a zero extension on any microarch.
12577///
12578/// This routine will try to do everything in its power to cleverly lower
12579/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12580 /// check for the profitability of this lowering; it tries to aggressively
12581/// match this pattern. It will use all of the micro-architectural details it
12582/// can to emit an efficient lowering. It handles both blends with all-zero
12583 /// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due to
12584 /// being masked out later).
12585///
12586/// The reason we have dedicated lowering for zext-style shuffles is that they
12587/// are both incredibly common and often quite performance sensitive.
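/// For example, a v16i8 mask <0,zz,1,zz,2,zz,3,zz,4,zz,5,zz,6,zz,7,zz> matches
/// a zero extension with Scale = 2, which SSE4.1 can lower with PMOVZXBW.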
12588 static SDValue lowerShuffleAsZeroOrAnyExtend(
12589 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12590 const APInt &Zeroable, const X86Subtarget &Subtarget,
12591 SelectionDAG &DAG) {
12592 int Bits = VT.getSizeInBits();
12593 int NumLanes = Bits / 128;
12594 int NumElements = VT.getVectorNumElements();
12595 int NumEltsPerLane = NumElements / NumLanes;
12596 assert(VT.getScalarSizeInBits() <= 32 &&
12597 "Exceeds 32-bit integer zero extension limit");
12598 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12599
12600 // Define a helper function to check a particular ext-scale and lower to it if
12601 // valid.
12602 auto Lower = [&](int Scale) -> SDValue {
12603 SDValue InputV;
12604 bool AnyExt = true;
12605 int Offset = 0;
12606 int Matches = 0;
12607 for (int i = 0; i < NumElements; ++i) {
12608 int M = Mask[i];
12609 if (M < 0)
12610 continue; // Valid anywhere but doesn't tell us anything.
12611 if (i % Scale != 0) {
12612 // Each of the extended elements needs to be zeroable.
12613 if (!Zeroable[i])
12614 return SDValue();
12615
12616 // We no longer are in the anyext case.
12617 AnyExt = false;
12618 continue;
12619 }
12620
12621 // The base elements need to be consecutive indices into the
12622 // same input vector.
12623 SDValue V = M < NumElements ? V1 : V2;
12624 M = M % NumElements;
12625 if (!InputV) {
12626 InputV = V;
12627 Offset = M - (i / Scale);
12628 } else if (InputV != V)
12629 return SDValue(); // Flip-flopping inputs.
12630
12631 // Offset must start in the lowest 128-bit lane or at the start of an
12632 // upper lane.
12633 // FIXME: Is it ever worth allowing a negative base offset?
12634 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12635 (Offset % NumEltsPerLane) == 0))
12636 return SDValue();
12637
12638 // If we are offsetting, all referenced entries must come from the same
12639 // lane.
12640 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12641 return SDValue();
12642
12643 if ((M % NumElements) != (Offset + (i / Scale)))
12644 return SDValue(); // Non-consecutive strided elements.
12645 Matches++;
12646 }
12647
12648 // If we fail to find an input, we have a zero-shuffle which should always
12649 // have already been handled.
12650 // FIXME: Maybe handle this here in case during blending we end up with one?
12651 if (!InputV)
12652 return SDValue();
12653
12654 // If we are offsetting, don't extend if we only match a single input, we
12655 // can always do better by using a basic PSHUF or PUNPCK.
12656 if (Offset != 0 && Matches < 2)
12657 return SDValue();
12658
12659 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12660 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12661 InputV, Mask, Subtarget, DAG);
12662 };
12663
12664 // The widest scale possible for extending is to a 64-bit integer.
12665 assert(Bits % 64 == 0 &&
12666 "The number of bits in a vector must be divisible by 64 on x86!");
12667 int NumExtElements = Bits / 64;
12668
12669 // Each iteration, try extending the elements half as much, but into twice as
12670 // many elements.
12671 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12672 assert(NumElements % NumExtElements == 0 &&
12673 "The input vector size must be divisible by the extended size.");
12674 if (SDValue V = Lower(NumElements / NumExtElements))
12675 return V;
12676 }
12677
12678 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12679 if (Bits != 128)
12680 return SDValue();
12681
12682 // Returns one of the source operands if the shuffle can be reduced to a
12683 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12684 auto CanZExtLowHalf = [&]() {
12685 for (int i = NumElements / 2; i != NumElements; ++i)
12686 if (!Zeroable[i])
12687 return SDValue();
12688 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12689 return V1;
12690 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12691 return V2;
12692 return SDValue();
12693 };
12694
12695 if (SDValue V = CanZExtLowHalf()) {
12696 V = DAG.getBitcast(MVT::v2i64, V);
12697 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12698 return DAG.getBitcast(VT, V);
12699 }
12700
12701 // No viable ext lowering found.
12702 return SDValue();
12703}
12704
12705/// Try to get a scalar value for a specific element of a vector.
12706///
12707/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12708 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12709 SelectionDAG &DAG) {
12710 MVT VT = V.getSimpleValueType();
12711 MVT EltVT = VT.getVectorElementType();
12712 V = peekThroughBitcasts(V);
12713
12714 // If the bitcasts shift the element size, we can't extract an equivalent
12715 // element from it.
12716 MVT NewVT = V.getSimpleValueType();
12717 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12718 return SDValue();
12719
12720 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12721 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12722 // Ensure the scalar operand is the same size as the destination.
12723 // FIXME: Add support for scalar truncation where possible.
12724 SDValue S = V.getOperand(Idx);
12725 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12726 return DAG.getBitcast(EltVT, S);
12727 }
12728
12729 return SDValue();
12730}
12731
12732/// Helper to test for a load that can be folded with x86 shuffles.
12733///
12734/// This is particularly important because the set of instructions varies
12735/// significantly based on whether the operand is a load or not.
12736 static bool isShuffleFoldableLoad(SDValue V) {
12737 return V.hasOneUse() &&
12738 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12739 }
12740
12741template<typename T>
12742static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12743 T EltVT = VT.getScalarType();
12744 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12745 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12746}
12747
12748/// Try to lower insertion of a single element into a zero vector.
12749///
12750 /// This is a common pattern for which we have especially efficient lowerings
12751/// across all subtarget feature sets.
12752 static SDValue lowerShuffleAsElementInsertion(
12753 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12754 const APInt &Zeroable, const X86Subtarget &Subtarget,
12755 SelectionDAG &DAG) {
12756 MVT ExtVT = VT;
12757 MVT EltVT = VT.getVectorElementType();
12758 unsigned NumElts = VT.getVectorNumElements();
12759 unsigned EltBits = VT.getScalarSizeInBits();
12760
12761 if (isSoftF16(EltVT, Subtarget))
12762 return SDValue();
12763
12764 int V2Index =
12765 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12766 Mask.begin();
12767 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12768 bool IsV1Zeroable = true;
12769 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12770 if (i != V2Index && !Zeroable[i]) {
12771 IsV1Zeroable = false;
12772 break;
12773 }
12774
12775 // Bail if a non-zero V1 isn't used in place.
12776 if (!IsV1Zeroable) {
12777 SmallVector<int, 8> V1Mask(Mask);
12778 V1Mask[V2Index] = -1;
12779 if (!isNoopShuffleMask(V1Mask))
12780 return SDValue();
12781 }
12782
12783 // Check for a single input from a SCALAR_TO_VECTOR node.
12784 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12785 // all the smarts here sunk into that routine. However, the current
12786 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12787 // vector shuffle lowering is dead.
12788 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12789 DAG);
12790 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12791 // We need to zext the scalar if it is smaller than an i32.
12792 V2S = DAG.getBitcast(EltVT, V2S);
12793 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12794 // Using zext to expand a narrow element won't work for non-zero
12795 // insertions. But we can use a masked constant vector if we're
12796 // inserting V2 into the bottom of V1.
12797 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12798 return SDValue();
12799
12800 // Zero-extend directly to i32.
12801 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12802 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12803
12804 // If we're inserting into a constant, mask off the inserted index
12805 // and OR with the zero-extended scalar.
12806 if (!IsV1Zeroable) {
12807 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12808 Bits[V2Index] = APInt::getZero(EltBits);
12809 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12810 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12811 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12812 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12813 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12814 }
12815 }
12816 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12817 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12818 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12819 // Either not inserting from the low element of the input or the input
12820 // element size is too small to use VZEXT_MOVL to clear the high bits.
12821 return SDValue();
12822 }
12823
12824 if (!IsV1Zeroable) {
12825 // If V1 can't be treated as a zero vector we have fewer options to lower
12826 // this. We can't support integer vectors or non-zero targets cheaply.
12827 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12828 if (!VT.isFloatingPoint() || V2Index != 0)
12829 return SDValue();
12830 if (!VT.is128BitVector())
12831 return SDValue();
12832
12833 // Otherwise, use MOVSD, MOVSS or MOVSH.
12834 unsigned MovOpc = 0;
12835 if (EltVT == MVT::f16)
12836 MovOpc = X86ISD::MOVSH;
12837 else if (EltVT == MVT::f32)
12838 MovOpc = X86ISD::MOVSS;
12839 else if (EltVT == MVT::f64)
12840 MovOpc = X86ISD::MOVSD;
12841 else
12842 llvm_unreachable("Unsupported floating point element type to handle!");
12843 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12844 }
12845
12846 // This lowering only works for the low element with floating point vectors.
12847 if (VT.isFloatingPoint() && V2Index != 0)
12848 return SDValue();
12849
12850 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12851 if (ExtVT != VT)
12852 V2 = DAG.getBitcast(VT, V2);
12853
12854 if (V2Index != 0) {
12855 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12856 // the desired position. Otherwise it is more efficient to do a vector
12857 // shift left. We know that we can do a vector shift left because all
12858 // the inputs are zero.
12859 if (VT.isFloatingPoint() || NumElts <= 4) {
12860 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12861 V2Shuffle[V2Index] = 0;
12862 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12863 } else {
12864 V2 = DAG.getBitcast(MVT::v16i8, V2);
12865 V2 = DAG.getNode(
12866 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12867 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12868 V2 = DAG.getBitcast(VT, V2);
12869 }
12870 }
12871 return V2;
12872}
12873
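// Illustrative sketch (not part of the original source): when the inserted
// element must land in a lane other than 0 and everything else is known zero,
// the routine above shifts the whole register left with VSHLDQ (PSLLDQ) by a
// byte count derived from the lane index. The helper name is hypothetical.
static unsigned insertionByteShift(unsigned V2Index, unsigned EltBits) {
  // e.g. inserting into lane 3 of a v8i16 (EltBits == 16) shifts by 6 bytes.
  return V2Index * EltBits / 8;
}
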
12874 /// Try to lower a broadcast of a single (truncated) integer element,
12875/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12876///
12877/// This assumes we have AVX2.
12878 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12879                                             int BroadcastIdx,
12880 const X86Subtarget &Subtarget,
12881 SelectionDAG &DAG) {
12882 assert(Subtarget.hasAVX2() &&
12883 "We can only lower integer broadcasts with AVX2!");
12884
12885 MVT EltVT = VT.getVectorElementType();
12886 MVT V0VT = V0.getSimpleValueType();
12887
12888 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12889 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12890
12891 MVT V0EltVT = V0VT.getVectorElementType();
12892 if (!V0EltVT.isInteger())
12893 return SDValue();
12894
12895 const unsigned EltSize = EltVT.getSizeInBits();
12896 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12897
12898 // This is only a truncation if the original element type is larger.
12899 if (V0EltSize <= EltSize)
12900 return SDValue();
12901
12902 assert(((V0EltSize % EltSize) == 0) &&
12903 "Scalar type sizes must all be powers of 2 on x86!");
12904
12905 const unsigned V0Opc = V0.getOpcode();
12906 const unsigned Scale = V0EltSize / EltSize;
12907 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12908
12909 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12910 V0Opc != ISD::BUILD_VECTOR)
12911 return SDValue();
12912
12913 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12914
12915 // If we're extracting non-least-significant bits, shift so we can truncate.
12916 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12917 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12918 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12919 if (const int OffsetIdx = BroadcastIdx % Scale)
12920 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12921 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12922
12923 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12924 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12925}
12926
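// Worked example (not part of the original source): broadcasting i8 element 5
// out of a v4i32 BUILD_VECTOR gives Scale = 32/8 = 4, V0BroadcastIdx = 5/4 = 1
// and OffsetIdx = 5%4 = 1, so operand 1 is shifted right by 1*8 = 8 bits
// before the i8 truncate that feeds the VBROADCAST above.
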
12927/// Test whether this can be lowered with a single SHUFPS instruction.
12928///
12929/// This is used to disable more specialized lowerings when the shufps lowering
12930/// will happen to be efficient.
12931 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12932   // This routine only handles 128-bit shufps.
12933 assert(Mask.size() == 4 && "Unsupported mask size!");
12934 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12935 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12936 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12937 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12938
12939 // To lower with a single SHUFPS we need to have the low half and high half
12940 // each requiring a single input.
12941 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12942 return false;
12943 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12944 return false;
12945
12946 return true;
12947}
12948
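// Worked examples (not part of the original source): {0, 1, 4, 5} and
// {-1, 1, 7, 6} pass the check above (each defined half reads from a single
// input), while {0, 4, 1, 5} fails because its low half mixes both inputs.
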
12949/// Test whether the specified input (0 or 1) is in-place blended by the
12950/// given mask.
12951///
12952/// This returns true if the elements from a particular input are already in the
12953/// slot required by the given mask and require no permutation.
12954 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12955   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12956 int Size = Mask.size();
12957 for (int i = 0; i < Size; ++i)
12958 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12959 return false;
12960
12961 return true;
12962}
12963
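// Worked example (not part of the original source): for Mask = {0, 5, 2, 6},
// input 0 is in place (its elements 0 and 2 already occupy lanes 0 and 2),
// but input 1 is not, because element 6 would have to move from lane 2 of the
// second input into lane 3 of the result.
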
12964/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12965/// the given mask.
12966///
12967 static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12968                                             int BroadcastableElement = 0) {
12969 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12970 int Size = Mask.size();
12971 for (int i = 0; i < Size; ++i)
12972 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12973 Mask[i] % Size != BroadcastableElement)
12974 return false;
12975 return true;
12976}
12977
12978/// If we are extracting two 128-bit halves of a vector and shuffling the
12979/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12980/// multi-shuffle lowering.
12981 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12982                                              SDValue N1, ArrayRef<int> Mask,
12983 SelectionDAG &DAG) {
12984 MVT VT = N0.getSimpleValueType();
12985 assert((VT.is128BitVector() &&
12986 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12987 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12988
12989 // Check that both sources are extracts of the same source vector.
12990 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12991       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12992       N0.getOperand(0) != N1.getOperand(0) ||
12993 !N0.hasOneUse() || !N1.hasOneUse())
12994 return SDValue();
12995
12996 SDValue WideVec = N0.getOperand(0);
12997 MVT WideVT = WideVec.getSimpleValueType();
12998 if (!WideVT.is256BitVector())
12999 return SDValue();
13000
13001 // Match extracts of each half of the wide source vector. Commute the shuffle
13002 // if the extract of the low half is N1.
13003 unsigned NumElts = VT.getVectorNumElements();
13004 SmallVector<int, 4> NewMask(Mask);
13005 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13006 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13007 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13008     ShuffleVectorSDNode::commuteMask(NewMask);
13009   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13010 return SDValue();
13011
13012 // Final bailout: if the mask is simple, we are better off using an extract
13013 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13014 // because that avoids a constant load from memory.
13015 if (NumElts == 4 &&
13016 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13017 return SDValue();
13018
13019 // Extend the shuffle mask with undef elements.
13020 NewMask.append(NumElts, -1);
13021
13022 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13023 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13024 NewMask);
13025 // This is free: ymm -> xmm.
13026 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13027 DAG.getVectorIdxConstant(0, DL));
13028}
13029
13030/// Try to lower broadcast of a single element.
13031///
13032/// For convenience, this code also bundles all of the subtarget feature set
13033/// filtering. While a little annoying to re-dispatch on type here, there isn't
13034/// a convenient way to factor it out.
13035 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13036                                        SDValue V2, ArrayRef<int> Mask,
13037 const X86Subtarget &Subtarget,
13038 SelectionDAG &DAG) {
13039 MVT EltVT = VT.getVectorElementType();
13040 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13041 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13042 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13043 return SDValue();
13044
13045 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13046 // we can only broadcast from a register with AVX2.
13047 unsigned NumEltBits = VT.getScalarSizeInBits();
13048 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13049                         ? X86ISD::MOVDDUP
13050                         : X86ISD::VBROADCAST;
13051   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13052
13053 // Check that the mask is a broadcast.
13054 int BroadcastIdx = getSplatIndex(Mask);
13055 if (BroadcastIdx < 0) {
13056 // Check for hidden broadcast.
13057 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13058 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13059 return SDValue();
13060 BroadcastIdx = 0;
13061 }
13062 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13063 "a sorted mask where the broadcast "
13064 "comes from V1.");
13065 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13066
13067 // Go up the chain of (vector) values to find a scalar load that we can
13068 // combine with the broadcast.
13069 // TODO: Combine this logic with findEltLoadSrc() used by
13070 // EltsFromConsecutiveLoads().
13071 int BitOffset = BroadcastIdx * NumEltBits;
13072 SDValue V = V1;
13073 for (;;) {
13074 switch (V.getOpcode()) {
13075 case ISD::BITCAST: {
13076 V = V.getOperand(0);
13077 continue;
13078 }
13079 case ISD::CONCAT_VECTORS: {
13080 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13081 int OpIdx = BitOffset / OpBitWidth;
13082 V = V.getOperand(OpIdx);
13083 BitOffset %= OpBitWidth;
13084 continue;
13085 }
13086     case ISD::EXTRACT_SUBVECTOR: {
13087       // The extraction index adds to the existing offset.
13088 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13089 unsigned Idx = V.getConstantOperandVal(1);
13090 unsigned BeginOffset = Idx * EltBitWidth;
13091 BitOffset += BeginOffset;
13092 V = V.getOperand(0);
13093 continue;
13094 }
13095 case ISD::INSERT_SUBVECTOR: {
13096 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13097 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13098 int Idx = (int)V.getConstantOperandVal(2);
13099 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13100 int BeginOffset = Idx * EltBitWidth;
13101 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13102 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13103 BitOffset -= BeginOffset;
13104 V = VInner;
13105 } else {
13106 V = VOuter;
13107 }
13108 continue;
13109 }
13110 }
13111 break;
13112 }
13113 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13114 BroadcastIdx = BitOffset / NumEltBits;
13115
13116 // Do we need to bitcast the source to retrieve the original broadcast index?
13117 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13118
13119 // Check if this is a broadcast of a scalar. We special case lowering
13120 // for scalars so that we can more effectively fold with loads.
13121 // If the original value has a larger element type than the shuffle, the
13122 // broadcast element is in essence truncated. Make that explicit to ease
13123 // folding.
13124 if (BitCastSrc && VT.isInteger())
13125 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13126 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13127 return TruncBroadcast;
13128
13129 // Also check the simpler case, where we can directly reuse the scalar.
13130 if (!BitCastSrc &&
13131 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13132 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13133 V = V.getOperand(BroadcastIdx);
13134
13135 // If we can't broadcast from a register, check that the input is a load.
13136 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13137 return SDValue();
13138 } else if (ISD::isNormalLoad(V.getNode()) &&
13139 cast<LoadSDNode>(V)->isSimple()) {
13140 // We do not check for one-use of the vector load because a broadcast load
13141 // is expected to be a win for code size, register pressure, and possibly
13142 // uops even if the original vector load is not eliminated.
13143
13144 // Reduce the vector load and shuffle to a broadcasted scalar load.
13145 auto *Ld = cast<LoadSDNode>(V);
13146 SDValue BaseAddr = Ld->getBasePtr();
13147 MVT SVT = VT.getScalarType();
13148 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13149 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13150 SDValue NewAddr =
13151         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13152
13153 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13154 // than MOVDDUP.
13155 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13156 if (Opcode == X86ISD::VBROADCAST) {
13157 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13158 SDValue Ops[] = {Ld->getChain(), NewAddr};
13159 V = DAG.getMemIntrinsicNode(
13160 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13161           DAG.getMachineFunction().getMachineMemOperand(
13162               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13163       DAG.makeEquivalentMemoryOrdering(Ld, V);
13164       return DAG.getBitcast(VT, V);
13165 }
13166 assert(SVT == MVT::f64 && "Unexpected VT!");
13167 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13168                     DAG.getMachineFunction().getMachineMemOperand(
13169                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13170     DAG.makeEquivalentMemoryOrdering(Ld, V);
13171   } else if (!BroadcastFromReg) {
13172 // We can't broadcast from a vector register.
13173 return SDValue();
13174 } else if (BitOffset != 0) {
13175 // We can only broadcast from the zero-element of a vector register,
13176 // but it can be advantageous to broadcast from the zero-element of a
13177 // subvector.
13178 if (!VT.is256BitVector() && !VT.is512BitVector())
13179 return SDValue();
13180
13181 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13182 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13183 return SDValue();
13184
13185 // If we are broadcasting an element from the lowest 128-bit subvector, try
13186 // to move the element in position.
13187 if (BitOffset < 128 && NumActiveElts > 1 &&
13188 V.getScalarValueSizeInBits() == NumEltBits) {
13189 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13190 "Unexpected bit-offset");
13191 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13192 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13193 V = extractSubVector(V, 0, DAG, DL, 128);
13194 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13195 } else {
13196 // Only broadcast the zero-element of a 128-bit subvector.
13197 if ((BitOffset % 128) != 0)
13198 return SDValue();
13199
13200 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13201 "Unexpected bit-offset");
13202 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13203 "Unexpected vector size");
13204 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13205 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13206 }
13207 }
13208
13209 // On AVX we can use VBROADCAST directly for scalar sources.
13210 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13211 V = DAG.getBitcast(MVT::f64, V);
13212 if (Subtarget.hasAVX()) {
13213 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13214 return DAG.getBitcast(VT, V);
13215 }
13216 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13217 }
13218
13219 // If this is a scalar, do the broadcast on this type and bitcast.
13220 if (!V.getValueType().isVector()) {
13221 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13222 "Unexpected scalar size");
13223 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13224                                        VT.getVectorNumElements());
13225     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13226 }
13227
13228 // We only support broadcasting from 128-bit vectors to minimize the
13229 // number of patterns we need to deal with in isel. So extract down to
13230 // 128-bits, removing as many bitcasts as possible.
13231 if (V.getValueSizeInBits() > 128)
13232     V = extract128BitVector(V, 0, DAG, DL);
13233
13234 // Otherwise cast V to a vector with the same element type as VT, but
13235 // possibly narrower than VT. Then perform the broadcast.
13236 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13237 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13238 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13239}
13240
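// Worked example (not part of the original source): broadcasting element 5 of
// a v8i32 that is (extract_subvector (v16i32 X), 8) starts with
// BitOffset = 5*32 = 160; the EXTRACT_SUBVECTOR case above adds 8*32 = 256,
// giving BitOffset = 416 and a final BroadcastIdx of 13 into X.
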
13241// Check for whether we can use INSERTPS to perform the shuffle. We only use
13242// INSERTPS when the V1 elements are already in the correct locations
13243// because otherwise we can just always use two SHUFPS instructions which
13244// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13245// perform INSERTPS if a single V1 element is out of place and all V2
13246// elements are zeroable.
13247 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13248                                    unsigned &InsertPSMask,
13249 const APInt &Zeroable,
13250 ArrayRef<int> Mask, SelectionDAG &DAG) {
13251 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13252 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13253 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13254
13255 // Attempt to match INSERTPS with one element from VA or VB being
13256 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13257 // are updated.
13258 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13259 ArrayRef<int> CandidateMask) {
13260 unsigned ZMask = 0;
13261 int VADstIndex = -1;
13262 int VBDstIndex = -1;
13263 bool VAUsedInPlace = false;
13264
13265 for (int i = 0; i < 4; ++i) {
13266 // Synthesize a zero mask from the zeroable elements (includes undefs).
13267 if (Zeroable[i]) {
13268 ZMask |= 1 << i;
13269 continue;
13270 }
13271
13272 // Flag if we use any VA inputs in place.
13273 if (i == CandidateMask[i]) {
13274 VAUsedInPlace = true;
13275 continue;
13276 }
13277
13278 // We can only insert a single non-zeroable element.
13279 if (VADstIndex >= 0 || VBDstIndex >= 0)
13280 return false;
13281
13282 if (CandidateMask[i] < 4) {
13283 // VA input out of place for insertion.
13284 VADstIndex = i;
13285 } else {
13286 // VB input for insertion.
13287 VBDstIndex = i;
13288 }
13289 }
13290
13291 // Don't bother if we have no (non-zeroable) element for insertion.
13292 if (VADstIndex < 0 && VBDstIndex < 0)
13293 return false;
13294
13295 // Determine element insertion src/dst indices. The src index is from the
13296 // start of the inserted vector, not the start of the concatenated vector.
13297 unsigned VBSrcIndex = 0;
13298 if (VADstIndex >= 0) {
13299 // If we have a VA input out of place, we use VA as the V2 element
13300 // insertion and don't use the original V2 at all.
13301 VBSrcIndex = CandidateMask[VADstIndex];
13302 VBDstIndex = VADstIndex;
13303 VB = VA;
13304 } else {
13305 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13306 }
13307
13308 // If no V1 inputs are used in place, then the result is created only from
13309 // the zero mask and the V2 insertion - so remove V1 dependency.
13310 if (!VAUsedInPlace)
13311 VA = DAG.getUNDEF(MVT::v4f32);
13312
13313 // Update V1, V2 and InsertPSMask accordingly.
13314 V1 = VA;
13315 V2 = VB;
13316
13317 // Insert the V2 element into the desired position.
13318 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13319 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13320 return true;
13321 };
13322
13323 if (matchAsInsertPS(V1, V2, Mask))
13324 return true;
13325
13326 // Commute and try again.
13327 SmallVector<int, 4> CommutedMask(Mask);
13328   ShuffleVectorSDNode::commuteMask(CommutedMask);
13329   if (matchAsInsertPS(V2, V1, CommutedMask))
13330 return true;
13331
13332 return false;
13333}
13334
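// Illustrative sketch (not part of the original source): how the 8-bit
// INSERTPS immediate assembled above is laid out. Bits [7:6] select the
// source element of the inserted operand, bits [5:4] select the destination
// lane, and bits [3:0] zero the corresponding result lanes. The helper name
// is hypothetical.
static unsigned makeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                unsigned ZeroLanes) {
  // e.g. SrcIdx = 2, DstIdx = 1, ZeroLanes = 0b1000 -> 0x98.
  return (SrcIdx << 6) | (DstIdx << 4) | (ZeroLanes & 0xF);
}
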
13335 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13336                                       ArrayRef<int> Mask, const APInt &Zeroable,
13337 SelectionDAG &DAG) {
13338 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13339 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13340
13341 // Attempt to match the insertps pattern.
13342 unsigned InsertPSMask = 0;
13343 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13344 return SDValue();
13345
13346 // Insert the V2 element into the desired position.
13347 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13348 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13349}
13350
13351/// Handle lowering of 2-lane 64-bit floating point shuffles.
13352///
13353/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13354/// support for floating point shuffles but not integer shuffles. These
13355/// instructions will incur a domain crossing penalty on some chips though so
13356/// it is better to avoid lowering through this for integer vectors where
13357/// possible.
13358 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13359                                  const APInt &Zeroable, SDValue V1, SDValue V2,
13360 const X86Subtarget &Subtarget,
13361 SelectionDAG &DAG) {
13362 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13363 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13364 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13365
13366 if (V2.isUndef()) {
13367 // Check for being able to broadcast a single element.
13368 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13369 Mask, Subtarget, DAG))
13370 return Broadcast;
13371
13372 // Straight shuffle of a single input vector. Simulate this by using the
13373     // single input as both of the "inputs" to this instruction.
13374 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13375
13376 if (Subtarget.hasAVX()) {
13377 // If we have AVX, we can use VPERMILPS which will allow folding a load
13378 // into the shuffle.
13379 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13380 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13381 }
13382
13383 return DAG.getNode(
13384 X86ISD::SHUFP, DL, MVT::v2f64,
13385 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13386 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13387 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13388 }
13389 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13390 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13391 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13392 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13393
13394 if (Subtarget.hasAVX2())
13395 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13396 return Extract;
13397
13398 // When loading a scalar and then shuffling it into a vector we can often do
13399 // the insertion cheaply.
13400   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13401           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13402 return Insertion;
13403 // Try inverting the insertion since for v2 masks it is easy to do and we
13404 // can't reliably sort the mask one way or the other.
13405 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13406 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13407   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13408           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13409 return Insertion;
13410
13411 // Try to use one of the special instruction patterns to handle two common
13412 // blend patterns if a zero-blend above didn't work.
13413 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13414 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13415 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13416 // We can either use a special instruction to load over the low double or
13417 // to move just the low double.
13418 return DAG.getNode(
13419 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13420 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13421
13422 if (Subtarget.hasSSE41())
13423 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13424 Zeroable, Subtarget, DAG))
13425 return Blend;
13426
13427 // Use dedicated unpack instructions for masks that match their pattern.
13428 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13429 return V;
13430
13431 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13432 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13433 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13434}
13435
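// Illustrative sketch (not part of the original source): the 2-bit SHUFPD
// immediate built above for the two-input case. Bit 0 picks V1[0] or V1[1]
// for result lane 0 and bit 1 picks V2[0] or V2[1] for result lane 1 (mask
// entries 2..3 refer to V2). The helper name is hypothetical.
static unsigned makeSHUFPDImm(int M0, int M1) {
  // e.g. Mask = {1, 3} -> V1[1] into lane 0, V2[1] into lane 1 -> imm = 0b11.
  return unsigned(M0 == 1) | (unsigned(M1 - 2 == 1) << 1);
}
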
13436/// Handle lowering of 2-lane 64-bit integer shuffles.
13437///
13438/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13439/// the integer unit to minimize domain crossing penalties. However, for blends
13440/// it falls back to the floating point shuffle operation with appropriate bit
13441/// casting.
13442 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13443                                  const APInt &Zeroable, SDValue V1, SDValue V2,
13444 const X86Subtarget &Subtarget,
13445 SelectionDAG &DAG) {
13446 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13447 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13448 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13449
13450 if (V2.isUndef()) {
13451 // Check for being able to broadcast a single element.
13452 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13453 Mask, Subtarget, DAG))
13454 return Broadcast;
13455
13456 // Straight shuffle of a single input vector. For everything from SSE2
13457 // onward this has a single fast instruction with no scary immediates.
13458 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13459 V1 = DAG.getBitcast(MVT::v4i32, V1);
13460 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13461 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13462 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13463 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13464 return DAG.getBitcast(
13465 MVT::v2i64,
13466 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13467 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13468 }
13469 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13470 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13471 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13472 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13473
13474 if (Subtarget.hasAVX2())
13475 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13476 return Extract;
13477
13478 // Try to use shift instructions.
13479 if (SDValue Shift =
13480 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13481 DAG, /*BitwiseOnly*/ false))
13482 return Shift;
13483
13484 // When loading a scalar and then shuffling it into a vector we can often do
13485 // the insertion cheaply.
13486   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13487           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13488 return Insertion;
13489 // Try inverting the insertion since for v2 masks it is easy to do and we
13490 // can't reliably sort the mask one way or the other.
13491 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13492   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13493           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13494 return Insertion;
13495
13496 // We have different paths for blend lowering, but they all must use the
13497 // *exact* same predicate.
13498 bool IsBlendSupported = Subtarget.hasSSE41();
13499 if (IsBlendSupported)
13500 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13501 Zeroable, Subtarget, DAG))
13502 return Blend;
13503
13504 // Use dedicated unpack instructions for masks that match their pattern.
13505 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13506 return V;
13507
13508 // Try to use byte rotation instructions.
13509   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13510 if (Subtarget.hasSSSE3()) {
13511 if (Subtarget.hasVLX())
13512 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13513 Zeroable, Subtarget, DAG))
13514 return Rotate;
13515
13516 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13517 Subtarget, DAG))
13518 return Rotate;
13519 }
13520
13521 // If we have direct support for blends, we should lower by decomposing into
13522 // a permute. That will be faster than the domain cross.
13523 if (IsBlendSupported)
13524 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG);
13526
13527 // We implement this with SHUFPD which is pretty lame because it will likely
13528 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13529 // However, all the alternatives are still more cycles and newer chips don't
13530 // have this problem. It would be really nice if x86 had better shuffles here.
13531 V1 = DAG.getBitcast(MVT::v2f64, V1);
13532 V2 = DAG.getBitcast(MVT::v2f64, V2);
13533 return DAG.getBitcast(MVT::v2i64,
13534 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13535}
13536
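// Illustrative sketch (not part of the original source): how the single-input
// v2i64 mask is widened to the v4i32 PSHUFD mask used above. Each 64-bit lane
// expands to a pair of 32-bit lanes, and undef (-1) stays undef. The helper
// name is hypothetical.
static void widenV2MaskToV4(const int Mask[2], int Widened[4]) {
  // e.g. {1, 0} (swap the two quadwords) becomes {2, 3, 0, 1}.
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}
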
13537/// Lower a vector shuffle using the SHUFPS instruction.
13538///
13539/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13540/// It makes no assumptions about whether this is the *best* lowering, it simply
13541/// uses it.
13542 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13543                                       ArrayRef<int> Mask, SDValue V1,
13544 SDValue V2, SelectionDAG &DAG) {
13545 SDValue LowV = V1, HighV = V2;
13546 SmallVector<int, 4> NewMask(Mask);
13547 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13548
13549 if (NumV2Elements == 1) {
13550 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13551
13552 // Compute the index adjacent to V2Index and in the same half by toggling
13553 // the low bit.
13554 int V2AdjIndex = V2Index ^ 1;
13555
13556 if (Mask[V2AdjIndex] < 0) {
13557 // Handles all the cases where we have a single V2 element and an undef.
13558 // This will only ever happen in the high lanes because we commute the
13559 // vector otherwise.
13560 if (V2Index < 2)
13561 std::swap(LowV, HighV);
13562 NewMask[V2Index] -= 4;
13563 } else {
13564 // Handle the case where the V2 element ends up adjacent to a V1 element.
13565 // To make this work, blend them together as the first step.
13566 int V1Index = V2AdjIndex;
13567 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13568 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13569 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13570
13571 // Now proceed to reconstruct the final blend as we have the necessary
13572 // high or low half formed.
13573 if (V2Index < 2) {
13574 LowV = V2;
13575 HighV = V1;
13576 } else {
13577 HighV = V2;
13578 }
13579 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13580 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13581 }
13582 } else if (NumV2Elements == 2) {
13583 if (Mask[0] < 4 && Mask[1] < 4) {
13584 // Handle the easy case where we have V1 in the low lanes and V2 in the
13585 // high lanes.
13586 NewMask[2] -= 4;
13587 NewMask[3] -= 4;
13588 } else if (Mask[2] < 4 && Mask[3] < 4) {
13589 // We also handle the reversed case because this utility may get called
13590 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13591 // arrange things in the right direction.
13592 NewMask[0] -= 4;
13593 NewMask[1] -= 4;
13594 HighV = V1;
13595 LowV = V2;
13596 } else {
13597 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13598 // trying to place elements directly, just blend them and set up the final
13599 // shuffle to place them.
13600
13601 // The first two blend mask elements are for V1, the second two are for
13602 // V2.
13603 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13604 Mask[2] < 4 ? Mask[2] : Mask[3],
13605 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13606 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13607 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13608 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13609
13610 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13611 // a blend.
13612 LowV = HighV = V1;
13613 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13614 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13615 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13616 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13617 }
13618 } else if (NumV2Elements == 3) {
13619 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13620     // we can get here due to other paths (e.g. repeated mask matching) that we
13621 // don't want to do another round of lowerVECTOR_SHUFFLE.
13622     ShuffleVectorSDNode::commuteMask(NewMask);
13623     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13624 }
13625 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13626 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13627}
13628
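// Worked example (not part of the original source): for Mask = {0, 5, 2, 7}
// both halves mix the inputs, so the routine above first blends with
// SHUFPS imm {0, 2, 1, 3} to form [V1[0], V1[2], V2[1], V2[3]], then shuffles
// that result with itself using {0, 2, 1, 3} to produce
// [V1[0], V2[1], V1[2], V2[3]], i.e. the requested {0, 5, 2, 7}.
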
13629/// Lower 4-lane 32-bit floating point shuffles.
13630///
13631/// Uses instructions exclusively from the floating point unit to minimize
13632/// domain crossing penalties, as these are sufficient to implement all v4f32
13633/// shuffles.
13634 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13635                                  const APInt &Zeroable, SDValue V1, SDValue V2,
13636 const X86Subtarget &Subtarget,
13637 SelectionDAG &DAG) {
13638 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13639 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13640 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13641
13642 if (Subtarget.hasSSE41())
13643 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13644 Zeroable, Subtarget, DAG))
13645 return Blend;
13646
13647 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13648
13649 if (NumV2Elements == 0) {
13650 // Check for being able to broadcast a single element.
13651 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13652 Mask, Subtarget, DAG))
13653 return Broadcast;
13654
13655 // Use even/odd duplicate instructions for masks that match their pattern.
13656 if (Subtarget.hasSSE3()) {
13657 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13658 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13659 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13660 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13661 }
13662
13663 if (Subtarget.hasAVX()) {
13664 // If we have AVX, we can use VPERMILPS which will allow folding a load
13665 // into the shuffle.
13666 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13667 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13668 }
13669
13670 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13671 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13672 if (!Subtarget.hasSSE2()) {
13673 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13674 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13675 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13676 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13677 }
13678
13679 // Otherwise, use a straight shuffle of a single input vector. We pass the
13680 // input vector to both operands to simulate this with a SHUFPS.
13681 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13682 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13683 }
13684
13685 if (Subtarget.hasSSE2())
13686     if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13687             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13688 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13689 return ZExt;
13690 }
13691
13692 if (Subtarget.hasAVX2())
13693 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13694 return Extract;
13695
13696 // There are special ways we can lower some single-element blends. However, we
13697 // have custom ways we can lower more complex single-element blends below that
13698 // we defer to if both this and BLENDPS fail to match, so restrict this to
13699 // when the V2 input is targeting element 0 of the mask -- that is the fast
13700 // case here.
13701 if (NumV2Elements == 1 && Mask[0] >= 4)
13702     if (SDValue V = lowerShuffleAsElementInsertion(
13703             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13704 return V;
13705
13706 if (Subtarget.hasSSE41()) {
13707 // Use INSERTPS if we can complete the shuffle efficiently.
13708 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13709 return V;
13710
13711 if (!isSingleSHUFPSMask(Mask))
13712 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13713 V2, Mask, DAG))
13714 return BlendPerm;
13715 }
13716
13717 // Use low/high mov instructions. These are only valid in SSE1 because
13718 // otherwise they are widened to v2f64 and never get here.
13719 if (!Subtarget.hasSSE2()) {
13720 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13721 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13722 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13723 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13724 }
13725
13726 // Use dedicated unpack instructions for masks that match their pattern.
13727 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13728 return V;
13729
13730 // Otherwise fall back to a SHUFPS lowering strategy.
13731 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13732}
13733
13734/// Lower 4-lane i32 vector shuffles.
13735///
13736/// We try to handle these with integer-domain shuffles where we can, but for
13737/// blends we use the floating point domain blend instructions.
13738 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13739                                  const APInt &Zeroable, SDValue V1, SDValue V2,
13740 const X86Subtarget &Subtarget,
13741 SelectionDAG &DAG) {
13742 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13743 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13744 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13745
13746 // Whenever we can lower this as a zext, that instruction is strictly faster
13747 // than any alternative. It also allows us to fold memory operands into the
13748 // shuffle in many cases.
13749 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13750 Zeroable, Subtarget, DAG))
13751 return ZExt;
13752
13753 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13754
13755 // Try to use shift instructions if fast.
13756 if (Subtarget.preferLowerShuffleAsShift()) {
13757 if (SDValue Shift =
13758 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13759 Subtarget, DAG, /*BitwiseOnly*/ true))
13760 return Shift;
13761 if (NumV2Elements == 0)
13762 if (SDValue Rotate =
13763 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13764 return Rotate;
13765 }
13766
13767 if (NumV2Elements == 0) {
13768 // Try to use broadcast unless the mask only has one non-undef element.
13769 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13770 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13771 Mask, Subtarget, DAG))
13772 return Broadcast;
13773 }
13774
13775 // Straight shuffle of a single input vector. For everything from SSE2
13776 // onward this has a single fast instruction with no scary immediates.
13777 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13778 // but we aren't actually going to use the UNPCK instruction because doing
13779 // so prevents folding a load into this instruction or making a copy.
13780 const int UnpackLoMask[] = {0, 0, 1, 1};
13781 const int UnpackHiMask[] = {2, 2, 3, 3};
13782 if (!isSingleElementRepeatedMask(Mask)) {
13783 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13784 Mask = UnpackLoMask;
13785 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13786 Mask = UnpackHiMask;
13787 }
13788
13789 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13790 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13791 }
13792
13793 if (Subtarget.hasAVX2())
13794 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13795 return Extract;
13796
13797 // Try to use shift instructions.
13798 if (SDValue Shift =
13799 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13800 DAG, /*BitwiseOnly*/ false))
13801 return Shift;
13802
13803 // There are special ways we can lower some single-element blends.
13804 if (NumV2Elements == 1)
13805     if (SDValue V = lowerShuffleAsElementInsertion(
13806             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13807 return V;
13808
13809 // We have different paths for blend lowering, but they all must use the
13810 // *exact* same predicate.
13811 bool IsBlendSupported = Subtarget.hasSSE41();
13812 if (IsBlendSupported)
13813 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13814 Zeroable, Subtarget, DAG))
13815 return Blend;
13816
13817 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13818 Zeroable, Subtarget, DAG))
13819 return Masked;
13820
13821 // Use dedicated unpack instructions for masks that match their pattern.
13822 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13823 return V;
13824
13825 // Try to use byte rotation instructions.
13826   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13827 if (Subtarget.hasSSSE3()) {
13828 if (Subtarget.hasVLX())
13829 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13830 Zeroable, Subtarget, DAG))
13831 return Rotate;
13832
13833 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13834 Subtarget, DAG))
13835 return Rotate;
13836 }
13837
13838 // Assume that a single SHUFPS is faster than an alternative sequence of
13839 // multiple instructions (even if the CPU has a domain penalty).
13840 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13841 if (!isSingleSHUFPSMask(Mask)) {
13842 // If we have direct support for blends, we should lower by decomposing into
13843 // a permute. That will be faster than the domain cross.
13844 if (IsBlendSupported)
13845 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13846 Zeroable, Subtarget, DAG);
13847
13848 // Try to lower by permuting the inputs into an unpack instruction.
13849 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13850 Mask, Subtarget, DAG))
13851 return Unpack;
13852 }
13853
13854 // We implement this with SHUFPS because it can blend from two vectors.
13855 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13856 // up the inputs, bypassing domain shift penalties that we would incur if we
13857 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13858 // relevant.
13859 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13860 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13861 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13862 return DAG.getBitcast(MVT::v4i32, ShufPS);
13863}
13864
13865/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13866/// shuffle lowering, and the most complex part.
13867///
13868/// The lowering strategy is to try to form pairs of input lanes which are
13869/// targeted at the same half of the final vector, and then use a dword shuffle
13870/// to place them onto the right half, and finally unpack the paired lanes into
13871/// their final position.
13872///
13873/// The exact breakdown of how to form these dword pairs and align them on the
13874/// correct sides is really tricky. See the comments within the function for
13875/// more of the details.
13876///
13877/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13878/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13879/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13880/// vector, form the analogous 128-bit 8-element Mask.
13881 static SDValue lowerV8I16GeneralSingleInputShuffle(
13882     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13883 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13884 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13885 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13886
13887 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13888 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13889 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13890
13891 // Attempt to directly match PSHUFLW or PSHUFHW.
13892 if (isUndefOrInRange(LoMask, 0, 4) &&
13893 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13894 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13895 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13896 }
13897 if (isUndefOrInRange(HiMask, 4, 8) &&
13898 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13899 for (int i = 0; i != 4; ++i)
13900 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13901 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13902 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13903 }
13904
13905 SmallVector<int, 4> LoInputs;
13906 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13907 array_pod_sort(LoInputs.begin(), LoInputs.end());
13908 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13909 SmallVector<int, 4> HiInputs;
13910 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13911 array_pod_sort(HiInputs.begin(), HiInputs.end());
13912 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13913 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13914 int NumHToL = LoInputs.size() - NumLToL;
13915 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13916 int NumHToH = HiInputs.size() - NumLToH;
13917 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13918 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13919 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13920 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13921
13922 // If we are shuffling values from one half - check how many different DWORD
13923 // pairs we need to create. If only 1 or 2 then we can perform this as a
13924 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13925 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13926 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13927 V = DAG.getNode(ShufWOp, DL, VT, V,
13928 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13929 V = DAG.getBitcast(PSHUFDVT, V);
13930 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13931 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13932 return DAG.getBitcast(VT, V);
13933 };
13934
13935 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13936 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13937 SmallVector<std::pair<int, int>, 4> DWordPairs;
13938 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13939
13940 // Collect the different DWORD pairs.
13941 for (int DWord = 0; DWord != 4; ++DWord) {
13942 int M0 = Mask[2 * DWord + 0];
13943 int M1 = Mask[2 * DWord + 1];
13944 M0 = (M0 >= 0 ? M0 % 4 : M0);
13945 M1 = (M1 >= 0 ? M1 % 4 : M1);
13946 if (M0 < 0 && M1 < 0)
13947 continue;
13948
13949 bool Match = false;
13950 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13951 auto &DWordPair = DWordPairs[j];
13952 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13953 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13954 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13955 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13956 PSHUFDMask[DWord] = DOffset + j;
13957 Match = true;
13958 break;
13959 }
13960 }
13961 if (!Match) {
13962 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13963 DWordPairs.push_back(std::make_pair(M0, M1));
13964 }
13965 }
13966
13967 if (DWordPairs.size() <= 2) {
13968 DWordPairs.resize(2, std::make_pair(-1, -1));
13969 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13970 DWordPairs[1].first, DWordPairs[1].second};
13971 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13972 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13973 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13974 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13975 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13976 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13977 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13978 }
13979 if ((NumHToL + NumHToH) == 0)
13980 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13981 if ((NumLToL + NumLToH) == 0)
13982 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13983 }
13984 }
13985
13986 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13987 // such inputs we can swap two of the dwords across the half mark and end up
13988 // with <=2 inputs to each half in each half. Once there, we can fall through
13989 // to the generic code below. For example:
13990 //
13991 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13992 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13993 //
13994 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13995 // and an existing 2-into-2 on the other half. In this case we may have to
13996 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13997 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13998 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13999 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14000 // half than the one we target for fixing) will be fixed when we re-enter this
14001 // path. We will also combine away any sequence of PSHUFD instructions that
14002 // result into a single instruction. Here is an example of the tricky case:
14003 //
14004 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14005 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14006 //
14007 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14008 //
14009 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14010 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14011 //
14012 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14013 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14014 //
14015 // The result is fine to be handled by the generic logic.
14016 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14017 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14018 int AOffset, int BOffset) {
14019 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14020 "Must call this with A having 3 or 1 inputs from the A half.");
14021 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14022 "Must call this with B having 1 or 3 inputs from the B half.");
14023 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14024 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14025
14026 bool ThreeAInputs = AToAInputs.size() == 3;
14027
14028 // Compute the index of dword with only one word among the three inputs in
14029 // a half by taking the sum of the half with three inputs and subtracting
14030 // the sum of the actual three inputs. The difference is the remaining
14031 // slot.
14032 int ADWord = 0, BDWord = 0;
14033 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14034 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14035 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14036 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14037 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14038 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14039 int TripleNonInputIdx =
14040 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14041 TripleDWord = TripleNonInputIdx / 2;
14042
14043 // We use xor with one to compute the adjacent DWord to whichever one the
14044 // OneInput is in.
14045 OneInputDWord = (OneInput / 2) ^ 1;
14046
14047 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14048 // and BToA inputs. If there is also such a problem with the BToB and AToB
14049 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14050 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14051 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14052 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14053       // Compute how many inputs will be flipped by swapping these DWords.
14054       // We need to balance this to ensure we don't form a 3-1 shuffle in
14055       // the other half.
14057 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14058 llvm::count(AToBInputs, 2 * ADWord + 1);
14059 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14060 llvm::count(BToBInputs, 2 * BDWord + 1);
14061 if ((NumFlippedAToBInputs == 1 &&
14062 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14063 (NumFlippedBToBInputs == 1 &&
14064 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14065 // We choose whether to fix the A half or B half based on whether that
14066 // half has zero flipped inputs. At zero, we may not be able to fix it
14067 // with that half. We also bias towards fixing the B half because that
14068 // will more commonly be the high half, and we have to bias one way.
14069 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14070 ArrayRef<int> Inputs) {
14071 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14072 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14073 // Determine whether the free index is in the flipped dword or the
14074 // unflipped dword based on where the pinned index is. We use this bit
14075 // in an xor to conditionally select the adjacent dword.
14076 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14077 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14078 if (IsFixIdxInput == IsFixFreeIdxInput)
14079 FixFreeIdx += 1;
14080 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14081 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14082 "We need to be changing the number of flipped inputs!");
14083 int PSHUFHalfMask[] = {0, 1, 2, 3};
14084 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14085 V = DAG.getNode(
14086 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14087 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14088 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14089
14090 for (int &M : Mask)
14091 if (M >= 0 && M == FixIdx)
14092 M = FixFreeIdx;
14093 else if (M >= 0 && M == FixFreeIdx)
14094 M = FixIdx;
14095 };
14096 if (NumFlippedBToBInputs != 0) {
14097 int BPinnedIdx =
14098 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14099 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14100 } else {
14101 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14102 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14103 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14104 }
14105 }
14106 }
14107
14108 int PSHUFDMask[] = {0, 1, 2, 3};
14109 PSHUFDMask[ADWord] = BDWord;
14110 PSHUFDMask[BDWord] = ADWord;
14111 V = DAG.getBitcast(
14112 VT,
14113 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14114 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14115
14116 // Adjust the mask to match the new locations of A and B.
14117 for (int &M : Mask)
14118 if (M >= 0 && M/2 == ADWord)
14119 M = 2 * BDWord + M % 2;
14120 else if (M >= 0 && M/2 == BDWord)
14121 M = 2 * ADWord + M % 2;
14122
14123 // Recurse back into this routine to re-compute state now that this isn't
14124 // a 3 and 1 problem.
14125 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14126 };
14127 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14128 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14129 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14130 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14131
14132 // At this point there are at most two inputs to the low and high halves from
14133 // each half. That means the inputs can always be grouped into dwords and
14134 // those dwords can then be moved to the correct half with a dword shuffle.
14135 // We use at most one low and one high word shuffle to collect these paired
14136 // inputs into dwords, and finally a dword shuffle to place them.
14137 int PSHUFLMask[4] = {-1, -1, -1, -1};
14138 int PSHUFHMask[4] = {-1, -1, -1, -1};
14139 int PSHUFDMask[4] = {-1, -1, -1, -1};
14140
14141 // First fix the masks for all the inputs that are staying in their
14142 // original halves. This will then dictate the targets of the cross-half
14143 // shuffles.
14144 auto fixInPlaceInputs =
14145 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14146 MutableArrayRef<int> SourceHalfMask,
14147 MutableArrayRef<int> HalfMask, int HalfOffset) {
14148 if (InPlaceInputs.empty())
14149 return;
14150 if (InPlaceInputs.size() == 1) {
14151 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14152 InPlaceInputs[0] - HalfOffset;
14153 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14154 return;
14155 }
14156 if (IncomingInputs.empty()) {
14157 // Just fix all of the in place inputs.
14158 for (int Input : InPlaceInputs) {
14159 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14160 PSHUFDMask[Input / 2] = Input / 2;
14161 }
14162 return;
14163 }
14164
14165 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14166 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14167 InPlaceInputs[0] - HalfOffset;
14168 // Put the second input next to the first so that they are packed into
14169 // a dword. We find the adjacent index by toggling the low bit.
14170 int AdjIndex = InPlaceInputs[0] ^ 1;
14171 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14172 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14173 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14174 };
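  // Illustrative example: with InPlaceInputs == {0, 3} in the low half (and
  // cross-half inputs still pending), fixInPlaceInputs keeps element 0 in
  // word 0 and moves element 3 into the adjacent word 1, so the pair occupies
  // dword 0 of the eventual PSHUFD mask.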
14175 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14176 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14177
14178 // Now gather the cross-half inputs and place them into a free dword of
14179 // their target half.
14180 // FIXME: This operation could almost certainly be simplified dramatically to
14181 // look more like the 3-1 fixing operation.
14182 auto moveInputsToRightHalf = [&PSHUFDMask](
14183 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14184 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14185 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14186 int DestOffset) {
14187 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14188 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14189 };
14190 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14191 int Word) {
14192 int LowWord = Word & ~1;
14193 int HighWord = Word | 1;
14194 return isWordClobbered(SourceHalfMask, LowWord) ||
14195 isWordClobbered(SourceHalfMask, HighWord);
14196 };
14197
14198 if (IncomingInputs.empty())
14199 return;
14200
14201 if (ExistingInputs.empty()) {
14202 // Map any dwords with inputs from them into the right half.
14203 for (int Input : IncomingInputs) {
14204 // If the source half mask maps over the inputs, turn those into
14205 // swaps and use the swapped lane.
14206 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14207 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14208 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14209 Input - SourceOffset;
14210 // We have to swap the uses in our half mask in one sweep.
14211 for (int &M : HalfMask)
14212 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14213 M = Input;
14214 else if (M == Input)
14215 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14216 } else {
14217 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14218 Input - SourceOffset &&
14219 "Previous placement doesn't match!");
14220 }
14221 // Note that this correctly re-maps both when we do a swap and when
14222 // we observe the other side of the swap above. We rely on that to
14223 // avoid swapping the members of the input list directly.
14224 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14225 }
14226
14227 // Map the input's dword into the correct half.
14228 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14229 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14230 else
14231 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14232 Input / 2 &&
14233 "Previous placement doesn't match!");
14234 }
14235
14236 // And just directly shift any other-half mask elements to be same-half
14237 // as we will have mirrored the dword containing the element into the
14238 // same position within that half.
14239 for (int &M : HalfMask)
14240 if (M >= SourceOffset && M < SourceOffset + 4) {
14241 M = M - SourceOffset + DestOffset;
14242 assert(M >= 0 && "This should never wrap below zero!");
14243 }
14244 return;
14245 }
14246
14247 // Ensure we have the input in a viable dword of its current half. This
14248 // is particularly tricky because the original position may be clobbered
14249 // by inputs being moved and *staying* in that half.
14250 if (IncomingInputs.size() == 1) {
14251 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14252 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14253 SourceOffset;
14254 SourceHalfMask[InputFixed - SourceOffset] =
14255 IncomingInputs[0] - SourceOffset;
14256 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14257 IncomingInputs[0] = InputFixed;
14258 }
14259 } else if (IncomingInputs.size() == 2) {
14260 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14261 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14262 // We have two non-adjacent or clobbered inputs we need to extract from
14263 // the source half. To do this, we need to map them into some adjacent
14264 // dword slot in the source mask.
14265 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14266 IncomingInputs[1] - SourceOffset};
14267
14268 // If there is a free slot in the source half mask adjacent to one of
14269 // the inputs, place the other input in it. We use (Index XOR 1) to
14270 // compute an adjacent index.
14271 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14272 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14273 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14274 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14275 InputsFixed[1] = InputsFixed[0] ^ 1;
14276 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14277 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14278 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14279 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14280 InputsFixed[0] = InputsFixed[1] ^ 1;
14281 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14282 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14283 // The two inputs are in the same DWord but it is clobbered and the
14284 // adjacent DWord isn't used at all. Move both inputs to the free
14285 // slot.
14286 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14287 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14288 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14289 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14290 } else {
14291 // The only way we hit this point is if there is no clobbering
14292 // (because there are no off-half inputs to this half) and there is no
14293 // free slot adjacent to one of the inputs. In this case, we have to
14294 // swap an input with a non-input.
14295 for (int i = 0; i < 4; ++i)
14296 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14297 "We can't handle any clobbers here!");
14298 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14299 "Cannot have adjacent inputs here!");
14300
14301 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14302 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14303
14304 // We also have to update the final source mask in this case because
14305 // it may need to undo the above swap.
14306 for (int &M : FinalSourceHalfMask)
14307 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14308 M = InputsFixed[1] + SourceOffset;
14309 else if (M == InputsFixed[1] + SourceOffset)
14310 M = (InputsFixed[0] ^ 1) + SourceOffset;
14311
14312 InputsFixed[1] = InputsFixed[0] ^ 1;
14313 }
14314
14315 // Point everything at the fixed inputs.
14316 for (int &M : HalfMask)
14317 if (M == IncomingInputs[0])
14318 M = InputsFixed[0] + SourceOffset;
14319 else if (M == IncomingInputs[1])
14320 M = InputsFixed[1] + SourceOffset;
14321
14322 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14323 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14324 }
14325 } else {
14326 llvm_unreachable("Unhandled input size!");
14327 }
14328
14329 // Now hoist the DWord down to the right half.
14330 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14331 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14332 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14333 for (int &M : HalfMask)
14334 for (int Input : IncomingInputs)
14335 if (M == Input)
14336 M = FreeDWord * 2 + Input % 2;
14337 };
14338 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14339 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14340 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14341 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14342
14343 // Now enact all the shuffles we've computed to move the inputs into their
14344 // target half.
14345 if (!isNoopShuffleMask(PSHUFLMask))
14346 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14347 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14348 if (!isNoopShuffleMask(PSHUFHMask))
14349 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14350 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14351 if (!isNoopShuffleMask(PSHUFDMask))
14352 V = DAG.getBitcast(
14353 VT,
14354 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14355 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14356
14357 // At this point, each half should contain all its inputs, and we can then
14358 // just shuffle them into their final position.
14359 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14360 "Failed to lift all the high half inputs to the low mask!");
14361 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14362 "Failed to lift all the low half inputs to the high mask!");
14363
14364 // Do a half shuffle for the low mask.
14365 if (!isNoopShuffleMask(LoMask))
14366 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14367 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14368
14369 // Do a half shuffle with the high mask after shifting its values down.
14370 for (int &M : HiMask)
14371 if (M >= 0)
14372 M -= 4;
14373 if (!isNoopShuffleMask(HiMask))
14374 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14375 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14376
14377 return V;
14378}
14379
14380/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14381/// blend if only one input is used.
14382static SDValue lowerShuffleAsBlendOfPSHUFBs(
14383 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14384 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14385 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14386 "Lane crossing shuffle masks not supported");
14387
14388 int NumBytes = VT.getSizeInBits() / 8;
14389 int Size = Mask.size();
14390 int Scale = NumBytes / Size;
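  // For example, for v8i16: NumBytes == 16, Size == 8 and Scale == 2, so each
  // mask element below expands into two adjacent byte selectors.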
14391
14392 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14393 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14394 V1InUse = false;
14395 V2InUse = false;
14396
14397 for (int i = 0; i < NumBytes; ++i) {
14398 int M = Mask[i / Scale];
14399 if (M < 0)
14400 continue;
14401
14402 const int ZeroMask = 0x80;
14403 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14404 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14405 if (Zeroable[i / Scale])
14406 V1Idx = V2Idx = ZeroMask;
14407
14408 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14409 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14410 V1InUse |= (ZeroMask != V1Idx);
14411 V2InUse |= (ZeroMask != V2Idx);
14412 }
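  // Note: PSHUFB zeroes any destination byte whose control byte has its high
  // bit set, which is why 0x80 serves as the "unused input" selector above.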
14413
14414 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14415 if (V1InUse)
14416 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14417 DAG.getBuildVector(ShufVT, DL, V1Mask));
14418 if (V2InUse)
14419 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14420 DAG.getBuildVector(ShufVT, DL, V2Mask));
14421
14422 // If we need shuffled inputs from both, blend the two.
14423 SDValue V;
14424 if (V1InUse && V2InUse)
14425 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14426 else
14427 V = V1InUse ? V1 : V2;
14428
14429 // Cast the result back to the correct type.
14430 return DAG.getBitcast(VT, V);
14431}
14432
14433/// Generic lowering of 8-lane i16 shuffles.
14434///
14435/// This handles both single-input shuffles and combined shuffle/blends with
14436/// two inputs. The single input shuffles are immediately delegated to
14437/// a dedicated lowering routine.
14438///
14439/// The blends are lowered in one of three fundamental ways. If there are few
14440/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14441/// of the input is significantly cheaper when lowered as an interleaving of
14442/// the two inputs, try to interleave them. Otherwise, blend the low and high
14443/// halves of the inputs separately (making them have relatively few inputs)
14444/// and then concatenate them.
14445static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14446 const APInt &Zeroable, SDValue V1, SDValue V2,
14447 const X86Subtarget &Subtarget,
14448 SelectionDAG &DAG) {
14449 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14450 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14451 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14452
14453 // Whenever we can lower this as a zext, that instruction is strictly faster
14454 // than any alternative.
14455 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14456 Zeroable, Subtarget, DAG))
14457 return ZExt;
14458
14459 // Try to lower using a truncation.
14460 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14461 Subtarget, DAG))
14462 return V;
14463
14464 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14465
14466 if (NumV2Inputs == 0) {
14467 // Try to use shift instructions.
14468 if (SDValue Shift =
14469 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14470 Subtarget, DAG, /*BitwiseOnly*/ false))
14471 return Shift;
14472
14473 // Check for being able to broadcast a single element.
14474 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14475 Mask, Subtarget, DAG))
14476 return Broadcast;
14477
14478 // Try to use bit rotation instructions.
14479 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14480 Subtarget, DAG))
14481 return Rotate;
14482
14483 // Use dedicated unpack instructions for masks that match their pattern.
14484 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14485 return V;
14486
14487 // Use dedicated pack instructions for masks that match their pattern.
14488 if (SDValue V =
14489 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14490 return V;
14491
14492 // Try to use byte rotation instructions.
14493 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14494 Subtarget, DAG))
14495 return Rotate;
14496
14497 // Make a copy of the mask so it can be modified.
14498 SmallVector<int, 8> MutableMask(Mask);
14499 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14500 Subtarget, DAG);
14501 }
14502
14503 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14504 "All single-input shuffles should be canonicalized to be V1-input "
14505 "shuffles.");
14506
14507 // Try to use shift instructions.
14508 if (SDValue Shift =
14509 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14510 DAG, /*BitwiseOnly*/ false))
14511 return Shift;
14512
14513 // See if we can use SSE4A Extraction / Insertion.
14514 if (Subtarget.hasSSE4A())
14515 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14516 Zeroable, DAG))
14517 return V;
14518
14519 // There are special ways we can lower some single-element blends.
14520 if (NumV2Inputs == 1)
14521 if (SDValue V = lowerShuffleAsElementInsertion(
14522 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14523 return V;
14524
14525 // We have different paths for blend lowering, but they all must use the
14526 // *exact* same predicate.
14527 bool IsBlendSupported = Subtarget.hasSSE41();
14528 if (IsBlendSupported)
14529 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14530 Zeroable, Subtarget, DAG))
14531 return Blend;
14532
14533 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14534 Zeroable, Subtarget, DAG))
14535 return Masked;
14536
14537 // Use dedicated unpack instructions for masks that match their pattern.
14538 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14539 return V;
14540
14541 // Use dedicated pack instructions for masks that match their pattern.
14542 if (SDValue V =
14543 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14544 return V;
14545
14546 // Try to lower using a truncation.
14547 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14548 Subtarget, DAG))
14549 return V;
14550
14551 // Try to use byte rotation instructions.
14552 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14553 Subtarget, DAG))
14554 return Rotate;
14555
14556 if (SDValue BitBlend =
14557 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14558 return BitBlend;
14559
14560 // Try to use byte shift instructions to mask.
14561 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14562 Zeroable, Subtarget, DAG))
14563 return V;
14564
14565 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14566 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
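  // Illustrative note: NumEvenDrops == 1 corresponds to a mask that keeps
  // every other element, e.g. <0,2,4,6,8,10,12,14>, which a single PACK node
  // can produce once the discarded halves are cleared or shifted out below.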
14567 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14568 !Subtarget.hasVLX()) {
14569 // Check if this is part of a 256-bit vector truncation.
14570 unsigned PackOpc = 0;
14571 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14572 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14573 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14574 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14575 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14576 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14577 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14578 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14579 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14580 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14581 PackOpc = X86ISD::PACKUS;
14582 } else if (Subtarget.hasSSE41()) {
14583 SmallVector<SDValue, 4> DWordClearOps(4,
14584 DAG.getConstant(0, DL, MVT::i32));
14585 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14586 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14587 SDValue DWordClearMask =
14588 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14589 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14590 DWordClearMask);
14591 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14592 DWordClearMask);
14593 PackOpc = X86ISD::PACKUS;
14594 } else if (!Subtarget.hasSSSE3()) {
14595 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14596 V1 = DAG.getBitcast(MVT::v4i32, V1);
14597 V2 = DAG.getBitcast(MVT::v4i32, V2);
14598 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14599 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14600 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14601 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14602 PackOpc = X86ISD::PACKSS;
14603 }
14604 if (PackOpc) {
14605 // Now pack things back together.
14606 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14607 if (NumEvenDrops == 2) {
14608 Result = DAG.getBitcast(MVT::v4i32, Result);
14609 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14610 }
14611 return Result;
14612 }
14613 }
14614
14615 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14616 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14617 if (NumOddDrops == 1) {
14618 bool HasSSE41 = Subtarget.hasSSE41();
14619 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14620 DAG.getBitcast(MVT::v4i32, V1),
14621 DAG.getTargetConstant(16, DL, MVT::i8));
14622 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14623 DAG.getBitcast(MVT::v4i32, V2),
14624 DAG.getTargetConstant(16, DL, MVT::i8));
14625 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14626 MVT::v8i16, V1, V2);
14627 }
14628
14629 // Try to lower by permuting the inputs into an unpack instruction.
14630 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14631 Mask, Subtarget, DAG))
14632 return Unpack;
14633
14634 // If we can't directly blend but can use PSHUFB, that will be better as it
14635 // can both shuffle and set up the inefficient blend.
14636 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14637 bool V1InUse, V2InUse;
14638 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14639 Zeroable, DAG, V1InUse, V2InUse);
14640 }
14641
14642 // We can always bit-blend if we have to so the fallback strategy is to
14643 // decompose into single-input permutes and blends/unpacks.
14644 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14645 Zeroable, Subtarget, DAG);
14646}
14647
14648/// Lower 8-lane 16-bit floating point shuffles.
14649static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14650 const APInt &Zeroable, SDValue V1, SDValue V2,
14651 const X86Subtarget &Subtarget,
14652 SelectionDAG &DAG) {
14653 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14654 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14655 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14656 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14657
14658 if (Subtarget.hasFP16()) {
14659 if (NumV2Elements == 0) {
14660 // Check for being able to broadcast a single element.
14661 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14662 Mask, Subtarget, DAG))
14663 return Broadcast;
14664 }
14665 if (NumV2Elements == 1 && Mask[0] >= 8)
14666 if (SDValue V = lowerShuffleAsElementInsertion(
14667 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14668 return V;
14669 }
14670
14671 V1 = DAG.getBitcast(MVT::v8i16, V1);
14672 V2 = DAG.getBitcast(MVT::v8i16, V2);
14673 return DAG.getBitcast(MVT::v8f16,
14674 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14675}
14676
14677// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14678// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14679// the active subvector is extracted.
14680static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14681 ArrayRef<int> OriginalMask, SDValue V1,
14682 SDValue V2, const X86Subtarget &Subtarget,
14683 SelectionDAG &DAG) {
14684 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14685 SmallVector<int, 32> Mask(OriginalMask);
14686 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14687 !isShuffleFoldableLoad(V2)) {
14688 ShuffleVectorSDNode::commuteMask(Mask);
14689 std::swap(V1, V2);
14690 }
14691
14692 MVT MaskVT = VT.changeTypeToInteger();
14693 SDValue MaskNode;
14694 MVT ShuffleVT = VT;
14695 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14696 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14697 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14698 ShuffleVT = V1.getSimpleValueType();
14699
14700 // Adjust mask to correct indices for the second input.
14701 int NumElts = VT.getVectorNumElements();
14702 unsigned Scale = 512 / VT.getSizeInBits();
14703 SmallVector<int, 32> AdjustedMask(Mask);
14704 for (int &M : AdjustedMask)
14705 if (NumElts <= M)
14706 M += (Scale - 1) * NumElts;
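    // For example, a v16i8 shuffle widened to v64i8 has Scale == 4, so an
    // index of 16 (the first element of V2) becomes 64, the first element of
    // the widened V2 operand expected by VPERMV3.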
14707 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14708 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14709 } else {
14710 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14711 }
14712
14713 SDValue Result;
14714 if (V2.isUndef())
14715 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14716 else
14717 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14718
14719 if (VT != ShuffleVT)
14720 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14721
14722 return Result;
14723}
14724
14725/// Generic lowering of v16i8 shuffles.
14726///
14727/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14728/// detect any complexity reducing interleaving. If that doesn't help, it uses
14729/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14730/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14731/// back together.
14732static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14733 const APInt &Zeroable, SDValue V1, SDValue V2,
14734 const X86Subtarget &Subtarget,
14735 SelectionDAG &DAG) {
14736 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14737 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14738 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14739
14740 // Try to use shift instructions.
14741 if (SDValue Shift =
14742 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14743 DAG, /*BitwiseOnly*/ false))
14744 return Shift;
14745
14746 // Try to use byte rotation instructions.
14747 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14748 Subtarget, DAG))
14749 return Rotate;
14750
14751 // Use dedicated pack instructions for masks that match their pattern.
14752 if (SDValue V =
14753 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14754 return V;
14755
14756 // Try to use a zext lowering.
14757 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14758 Zeroable, Subtarget, DAG))
14759 return ZExt;
14760
14761 // Try to lower using a truncation.
14762 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14763 Subtarget, DAG))
14764 return V;
14765
14766 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14767 Subtarget, DAG))
14768 return V;
14769
14770 // See if we can use SSE4A Extraction / Insertion.
14771 if (Subtarget.hasSSE4A())
14772 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14773 Zeroable, DAG))
14774 return V;
14775
14776 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14777
14778 // For single-input shuffles, there are some nicer lowering tricks we can use.
14779 if (NumV2Elements == 0) {
14780 // Check for being able to broadcast a single element.
14781 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14782 Mask, Subtarget, DAG))
14783 return Broadcast;
14784
14785 // Try to use bit rotation instructions.
14786 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14787 Subtarget, DAG))
14788 return Rotate;
14789
14790 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14791 return V;
14792
14793 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14794 // Notably, this handles splat and partial-splat shuffles more efficiently.
14795 // However, it only makes sense if the pre-duplication shuffle simplifies
14796 // things significantly. Currently, this means we need to be able to
14797 // express the pre-duplication shuffle as an i16 shuffle.
14798 //
14799 // FIXME: We should check for other patterns which can be widened into an
14800 // i16 shuffle as well.
14801 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14802 for (int i = 0; i < 16; i += 2)
14803 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14804 return false;
14805
14806 return true;
14807 };
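    // Illustrative example: a mask such as <0,0,1,1,2,2,3,3,...> passes the
    // check above, since every byte pair duplicates a single source byte
    // (undef entries are also tolerated).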
14808 auto tryToWidenViaDuplication = [&]() -> SDValue {
14809 if (!canWidenViaDuplication(Mask))
14810 return SDValue();
14811 SmallVector<int, 4> LoInputs;
14812 copy_if(Mask, std::back_inserter(LoInputs),
14813 [](int M) { return M >= 0 && M < 8; });
14814 array_pod_sort(LoInputs.begin(), LoInputs.end());
14815 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14816 SmallVector<int, 4> HiInputs;
14817 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14818 array_pod_sort(HiInputs.begin(), HiInputs.end());
14819 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14820
14821 bool TargetLo = LoInputs.size() >= HiInputs.size();
14822 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14823 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14824
14825 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14826 SmallDenseMap<int, int, 8> LaneMap;
14827 for (int I : InPlaceInputs) {
14828 PreDupI16Shuffle[I/2] = I/2;
14829 LaneMap[I] = I;
14830 }
14831 int j = TargetLo ? 0 : 4, je = j + 4;
14832 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14833 // Check if j is already a shuffle of this input. This happens when
14834 // there are two adjacent bytes after we move the low one.
14835 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14836 // If we haven't yet mapped the input, search for a slot into which
14837 // we can map it.
14838 while (j < je && PreDupI16Shuffle[j] >= 0)
14839 ++j;
14840
14841 if (j == je)
14842 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14843 return SDValue();
14844
14845 // Map this input with the i16 shuffle.
14846 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14847 }
14848
14849 // Update the lane map based on the mapping we ended up with.
14850 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14851 }
14852 V1 = DAG.getBitcast(
14853 MVT::v16i8,
14854 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14855 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14856
14857 // Unpack the bytes to form the i16s that will be shuffled into place.
14858 bool EvenInUse = false, OddInUse = false;
14859 for (int i = 0; i < 16; i += 2) {
14860 EvenInUse |= (Mask[i + 0] >= 0);
14861 OddInUse |= (Mask[i + 1] >= 0);
14862 if (EvenInUse && OddInUse)
14863 break;
14864 }
14865 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14866 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14867 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14868
14869 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14870 for (int i = 0; i < 16; ++i)
14871 if (Mask[i] >= 0) {
14872 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14873 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14874 if (PostDupI16Shuffle[i / 2] < 0)
14875 PostDupI16Shuffle[i / 2] = MappedMask;
14876 else
14877 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14878 "Conflicting entries in the original shuffle!");
14879 }
14880 return DAG.getBitcast(
14881 MVT::v16i8,
14882 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14883 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14884 };
14885 if (SDValue V = tryToWidenViaDuplication())
14886 return V;
14887 }
14888
14889 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14890 Zeroable, Subtarget, DAG))
14891 return Masked;
14892
14893 // Use dedicated unpack instructions for masks that match their pattern.
14894 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14895 return V;
14896
14897 // Try to use byte shift instructions to mask.
14898 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14899 Zeroable, Subtarget, DAG))
14900 return V;
14901
14902 // Check for compaction patterns.
14903 bool IsSingleInput = V2.isUndef();
14904 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14905
14906 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14907 // with PSHUFB. It is important to do this before we attempt to generate any
14908 // blends but after all of the single-input lowerings. If the single input
14909 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14910 // want to preserve that and we can DAG combine any longer sequences into
14911 // a PSHUFB in the end. But once we start blending from multiple inputs,
14912 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14913 // and there are *very* few patterns that would actually be faster than the
14914 // PSHUFB approach because of its ability to zero lanes.
14915 //
14916 // If the mask is a binary compaction, we can more efficiently perform this
14917 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14918 //
14919 // FIXME: The only exceptions to the above are blends which are exact
14920 // interleavings with direct instructions supporting them. We currently don't
14921 // handle those well here.
14922 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14923 bool V1InUse = false;
14924 bool V2InUse = false;
14925
14926 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14927 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14928
14929 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14930 // do so. This avoids using them to handle blends-with-zero which is
14931 // important as a single pshufb is significantly faster for that.
14932 if (V1InUse && V2InUse) {
14933 if (Subtarget.hasSSE41())
14934 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14935 Zeroable, Subtarget, DAG))
14936 return Blend;
14937
14938 // We can use an unpack to do the blending rather than an or in some
14939 // cases. Even though the or may be (very slightly) more efficient, we
14940 // prefer this lowering because there are common cases where part of
14941 // the complexity of the shuffles goes away when we do the final blend as
14942 // an unpack.
14943 // FIXME: It might be worth trying to detect if the unpack-feeding
14944 // shuffles will both be pshufb, in which case we shouldn't bother with
14945 // this.
14946 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14947 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14948 return Unpack;
14949
14950 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14951 if (Subtarget.hasVBMI())
14952 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14953 DAG);
14954
14955 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14956 if (Subtarget.hasXOP()) {
14957 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14958 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14959 }
14960
14961 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14962 // PALIGNR will be cheaper than the second PSHUFB+OR.
14963 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14964 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14965 return V;
14966 }
14967
14968 return PSHUFB;
14969 }
14970
14971 // There are special ways we can lower some single-element blends.
14972 if (NumV2Elements == 1)
14973 if (SDValue V = lowerShuffleAsElementInsertion(
14974 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14975 return V;
14976
14977 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14978 return Blend;
14979
14980 // Check whether a compaction lowering can be done. This handles shuffles
14981 // which take every Nth element for some even N. See the helper function for
14982 // details.
14983 //
14984 // We special case these as they can be particularly efficiently handled with
14985 // the PACKUSWB instruction on x86, and they show up in common patterns of
14986 // rearranging bytes to truncate wide elements.
14987 if (NumEvenDrops) {
14988 // NumEvenDrops is the power of two stride of the elements. Another way of
14989 // thinking about it is that we need to drop the even elements this many
14990 // times to get the original input.
14991
14992 // First we need to zero all the dropped bytes.
14993 assert(NumEvenDrops <= 3 &&
14994 "No support for dropping even elements more than 3 times.");
14995 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14996 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14997 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14998 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14999 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15000 WordClearMask);
15001 if (!IsSingleInput)
15002 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15003 WordClearMask);
15004
15005 // Now pack things back together.
15006 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15007 IsSingleInput ? V1 : V2);
15008 for (int i = 1; i < NumEvenDrops; ++i) {
15009 Result = DAG.getBitcast(MVT::v8i16, Result);
15010 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15011 }
15012 return Result;
15013 }
15014
15015 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15016 if (NumOddDrops == 1) {
15017 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15018 DAG.getBitcast(MVT::v8i16, V1),
15019 DAG.getTargetConstant(8, DL, MVT::i8));
15020 if (!IsSingleInput)
15021 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15022 DAG.getBitcast(MVT::v8i16, V2),
15023 DAG.getTargetConstant(8, DL, MVT::i8));
15024 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15025 IsSingleInput ? V1 : V2);
15026 }
15027
15028 // Handle multi-input cases by blending/unpacking single-input shuffles.
15029 if (NumV2Elements > 0)
15030 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15031 Zeroable, Subtarget, DAG);
15032
15033 // The fallback path for single-input shuffles widens this into two v8i16
15034 // vectors with unpacks, shuffles those, and then pulls them back together
15035 // with a pack.
15036 SDValue V = V1;
15037
15038 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15039 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15040 for (int i = 0; i < 16; ++i)
15041 if (Mask[i] >= 0)
15042 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15043
15044 SDValue VLoHalf, VHiHalf;
15045 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15046 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15047 // i16s.
15048 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15049 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15050 // Use a mask to drop the high bytes.
15051 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15052 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15053 DAG.getConstant(0x00FF, DL, MVT::v8i16));
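    // After the AND each i16 lane holds a single zero-extended byte, so the
    // final PACKUS below cannot saturate and simply reassembles the bytes.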
15054
15055 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15056 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15057
15058 // Squash the masks to point directly into VLoHalf.
15059 for (int &M : LoBlendMask)
15060 if (M >= 0)
15061 M /= 2;
15062 for (int &M : HiBlendMask)
15063 if (M >= 0)
15064 M /= 2;
15065 } else {
15066 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15067 // VHiHalf so that we can blend them as i16s.
15068 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15069
15070 VLoHalf = DAG.getBitcast(
15071 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15072 VHiHalf = DAG.getBitcast(
15073 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15074 }
15075
15076 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15077 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15078
15079 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15080}
15081
15082/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15083///
15084/// This routine breaks down the specific type of 128-bit shuffle and
15085/// dispatches to the lowering routines accordingly.
15086static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15087 MVT VT, SDValue V1, SDValue V2,
15088 const APInt &Zeroable,
15089 const X86Subtarget &Subtarget,
15090 SelectionDAG &DAG) {
15091 if (VT == MVT::v8bf16) {
15092 V1 = DAG.getBitcast(MVT::v8i16, V1);
15093 V2 = DAG.getBitcast(MVT::v8i16, V2);
15094 return DAG.getBitcast(VT,
15095 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15096 }
15097
15098 switch (VT.SimpleTy) {
15099 case MVT::v2i64:
15100 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15101 case MVT::v2f64:
15102 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15103 case MVT::v4i32:
15104 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15105 case MVT::v4f32:
15106 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15107 case MVT::v8i16:
15108 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15109 case MVT::v8f16:
15110 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v16i8:
15112 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113
15114 default:
15115 llvm_unreachable("Unimplemented!");
15116 }
15117}
15118
15119/// Generic routine to split vector shuffle into half-sized shuffles.
15120///
15121/// This routine just extracts two subvectors, shuffles them independently, and
15122/// then concatenates them back together. This should work effectively with all
15123/// AVX vector shuffle types.
15124static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15125 SDValue V2, ArrayRef<int> Mask,
15126 SelectionDAG &DAG, bool SimpleOnly) {
15127 assert(VT.getSizeInBits() >= 256 &&
15128 "Only for 256-bit or wider vector shuffles!");
15129 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15130 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15131
15132 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15133 if (VT == MVT::v8f32) {
15134 SDValue BC1 = peekThroughBitcasts(V1);
15135 SDValue BC2 = peekThroughBitcasts(V2);
15136 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15137 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15138 DAG, SimpleOnly))
15139 return DAG.getBitcast(VT, Split);
15140 }
15141 }
15142
15143 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15144 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15145
15146 int NumElements = VT.getVectorNumElements();
15147 int SplitNumElements = NumElements / 2;
15148 MVT ScalarVT = VT.getVectorElementType();
15149 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
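  // For example, when splitting v8f32: SplitNumElements == 4, SplitVT ==
  // v4f32, and LoMask/HiMask above are simply the first and second four mask
  // entries.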
15150
15151 // Use splitVector/extractSubVector so that split build-vectors just build two
15152 // narrower build vectors. This helps shuffling with splats and zeros.
15153 auto SplitVector = [&](SDValue V) {
15154 SDValue LoV, HiV;
15155 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15156 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15157 DAG.getBitcast(SplitVT, HiV));
15158 };
15159
15160 SDValue LoV1, HiV1, LoV2, HiV2;
15161 std::tie(LoV1, HiV1) = SplitVector(V1);
15162 std::tie(LoV2, HiV2) = SplitVector(V2);
15163
15164 // Now create two 4-way blends of these half-width vectors.
15165 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15166 bool &UseHiV1, bool &UseLoV2,
15167 bool &UseHiV2) {
15168 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15169 for (int i = 0; i < SplitNumElements; ++i) {
15170 int M = HalfMask[i];
15171 if (M >= NumElements) {
15172 if (M >= NumElements + SplitNumElements)
15173 UseHiV2 = true;
15174 else
15175 UseLoV2 = true;
15176 } else if (M >= 0) {
15177 if (M >= SplitNumElements)
15178 UseHiV1 = true;
15179 else
15180 UseLoV1 = true;
15181 }
15182 }
15183 };
15184
15185 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15186 if (!SimpleOnly)
15187 return true;
15188
15189 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15190 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15191
15192 return !(UseHiV1 || UseHiV2);
15193 };
15194
15195 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15196 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15197 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15198 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15199 for (int i = 0; i < SplitNumElements; ++i) {
15200 int M = HalfMask[i];
15201 if (M >= NumElements) {
15202 V2BlendMask[i] = M - NumElements;
15203 BlendMask[i] = SplitNumElements + i;
15204 } else if (M >= 0) {
15205 V1BlendMask[i] = M;
15206 BlendMask[i] = i;
15207 }
15208 }
15209
15210 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15211 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15212
15213 // Because the lowering happens after all combining takes place, we need to
15214 // manually combine these blend masks as much as possible so that we create
15215 // a minimal number of high-level vector shuffle nodes.
15216 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15217
15218 // First try just blending the halves of V1 or V2.
15219 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15220 return DAG.getUNDEF(SplitVT);
15221 if (!UseLoV2 && !UseHiV2)
15222 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15223 if (!UseLoV1 && !UseHiV1)
15224 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15225
15226 SDValue V1Blend, V2Blend;
15227 if (UseLoV1 && UseHiV1) {
15228 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15229 } else {
15230 // We only use half of V1 so map the usage down into the final blend mask.
15231 V1Blend = UseLoV1 ? LoV1 : HiV1;
15232 for (int i = 0; i < SplitNumElements; ++i)
15233 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15234 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15235 }
15236 if (UseLoV2 && UseHiV2) {
15237 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15238 } else {
15239 // We only use half of V2 so map the usage down into the final blend mask.
15240 V2Blend = UseLoV2 ? LoV2 : HiV2;
15241 for (int i = 0; i < SplitNumElements; ++i)
15242 if (BlendMask[i] >= SplitNumElements)
15243 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15244 }
15245 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15246 };
15247
15248 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15249 return SDValue();
15250
15251 SDValue Lo = HalfBlend(LoMask);
15252 SDValue Hi = HalfBlend(HiMask);
15253 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15254}
15255
15256/// Either split a vector in halves or decompose the shuffles and the
15257/// blend/unpack.
15258///
15259/// This is provided as a good fallback for many lowerings of non-single-input
15260/// shuffles with more than one 128-bit lane. In those cases, we want to select
15261/// between splitting the shuffle into 128-bit components and stitching those
15262/// back together vs. extracting the single-input shuffles and blending those
15263/// results.
15264static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15265 SDValue V2, ArrayRef<int> Mask,
15266 const APInt &Zeroable,
15267 const X86Subtarget &Subtarget,
15268 SelectionDAG &DAG) {
15269 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15270 "shuffles as it could then recurse on itself.");
15271 int Size = Mask.size();
15272
15273 // If this can be modeled as a broadcast of two elements followed by a blend,
15274 // prefer that lowering. This is especially important because broadcasts can
15275 // often fold with memory operands.
15276 auto DoBothBroadcast = [&] {
15277 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15278 for (int M : Mask)
15279 if (M >= Size) {
15280 if (V2BroadcastIdx < 0)
15281 V2BroadcastIdx = M - Size;
15282 else if ((M - Size) != V2BroadcastIdx &&
15283 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15284 return false;
15285 } else if (M >= 0) {
15286 if (V1BroadcastIdx < 0)
15287 V1BroadcastIdx = M;
15288 else if (M != V1BroadcastIdx &&
15289 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15290 return false;
15291 }
15292 return true;
15293 };
15294 if (DoBothBroadcast())
15295 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15296 Subtarget, DAG);
15297
15298 // If the inputs all stem from a single 128-bit lane of each input, then we
15299 // split them rather than blending because the split will decompose to
15300 // unusually few instructions.
15301 int LaneCount = VT.getSizeInBits() / 128;
15302 int LaneSize = Size / LaneCount;
15303 SmallBitVector LaneInputs[2];
15304 LaneInputs[0].resize(LaneCount, false);
15305 LaneInputs[1].resize(LaneCount, false);
15306 for (int i = 0; i < Size; ++i)
15307 if (Mask[i] >= 0)
15308 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
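  // Illustrative example for v8f32 (Size == 8, LaneSize == 4): a mask entry
  // of 13 refers to element 5 of V2, so it sets bit 1 of LaneInputs[1] (V2's
  // upper 128-bit lane).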
15309 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15310 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15311 /*SimpleOnly*/ false);
15312
15313 // Without AVX2, if we can freely split the subvectors then we're better off
15314 // performing half width shuffles.
15315 if (!Subtarget.hasAVX2()) {
15316 SDValue BC1 = peekThroughBitcasts(V1);
15317 SDValue BC2 = peekThroughBitcasts(V2);
15318 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15319 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15320 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15321 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15322 if (SplatOrSplitV1 && SplatOrSplitV2)
15323 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15324 /*SimpleOnly*/ false);
15325 }
15326
15327 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15328 // requires that the decomposed single-input shuffles don't end up here.
15329 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15330 Subtarget, DAG);
15331}
15332
15333// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15334// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15335static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15336 SDValue V1, SDValue V2,
15337 ArrayRef<int> Mask,
15338 SelectionDAG &DAG) {
15339 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15340
15341 int LHSMask[4] = {-1, -1, -1, -1};
15342 int RHSMask[4] = {-1, -1, -1, -1};
15343 int SHUFPDMask[4] = {-1, -1, -1, -1};
15344
15345 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15346 // perform the shuffle once the lanes have been shuffled in place.
15347 for (int i = 0; i != 4; ++i) {
15348 int M = Mask[i];
15349 if (M < 0)
15350 continue;
15351 int LaneBase = i & ~1;
15352 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15353 LaneMask[LaneBase + (M & 1)] = M;
15354 SHUFPDMask[i] = M & 1;
15355 }
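  // Worked example: for Mask == <2,4,1,7>, the loop above produces
  // LHSMask == <2,-1,-1,1>, RHSMask == <4,-1,-1,7> and SHUFPDMask ==
  // <0,0,1,1>, so each lane is assembled first and the final SHUFPD selects
  // within it.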
15356
15357 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15358 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15359 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15360 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15361}
15362
15363/// Lower a vector shuffle crossing multiple 128-bit lanes as
15364/// a lane permutation followed by a per-lane permutation.
15365///
15366/// This is mainly for cases where we can have non-repeating permutes
15367/// in each lane.
15368///
15369/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15370/// we should investigate merging them.
15371static SDValue lowerShuffleAsLanePermuteAndPermute(
15372 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15373 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15374 int NumElts = VT.getVectorNumElements();
15375 int NumLanes = VT.getSizeInBits() / 128;
15376 int NumEltsPerLane = NumElts / NumLanes;
15377 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15378
15379 /// Attempts to find a sublane permute with the given size
15380 /// that gets all elements into their target lanes.
15381 ///
15382 /// If successful, builds CrossLaneMask and InLaneMask and returns the
15383 /// lowered shuffle; if unsuccessful, returns an empty SDValue (InLaneMask
15384 /// may still have been overwritten).
15384 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15385 int NumSublanesPerLane = NumSublanes / NumLanes;
15386 int NumEltsPerSublane = NumElts / NumSublanes;
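    // For example, a 256-bit v16i16 shuffle with NumSublanes == 4 gives
    // NumSublanesPerLane == 2 and NumEltsPerSublane == 4, i.e. 64-bit
    // sublanes.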
15387
15388 SmallVector<int, 16> CrossLaneMask;
15389 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15390 // CrossLaneMask but one entry == one sublane.
15391 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15392 APInt DemandedCrossLane = APInt::getZero(NumElts);
15393
15394 for (int i = 0; i != NumElts; ++i) {
15395 int M = Mask[i];
15396 if (M < 0)
15397 continue;
15398
15399 int SrcSublane = M / NumEltsPerSublane;
15400 int DstLane = i / NumEltsPerLane;
15401
15402 // We only need to get the elements into the right lane, not sublane.
15403 // So search all sublanes that make up the destination lane.
15404 bool Found = false;
15405 int DstSubStart = DstLane * NumSublanesPerLane;
15406 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15407 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15408 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15409 continue;
15410
15411 Found = true;
15412 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15413 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15414 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15415 DemandedCrossLane.setBit(InLaneMask[i]);
15416 break;
15417 }
15418 if (!Found)
15419 return SDValue();
15420 }
15421
15422 // Fill CrossLaneMask using CrossLaneMaskLarge.
15423 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15424
15425 if (!CanUseSublanes) {
15426 // If we're only shuffling a single lowest lane and the rest are identity
15427 // then don't bother.
15428 // TODO - isShuffleMaskInputInPlace could be extended to something like
15429 // this.
15430 int NumIdentityLanes = 0;
15431 bool OnlyShuffleLowestLane = true;
15432 for (int i = 0; i != NumLanes; ++i) {
15433 int LaneOffset = i * NumEltsPerLane;
15434 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15435 i * NumEltsPerLane))
15436 NumIdentityLanes++;
15437 else if (CrossLaneMask[LaneOffset] != 0)
15438 OnlyShuffleLowestLane = false;
15439 }
15440 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15441 return SDValue();
15442 }
15443
15444 // Simplify CrossLaneMask based on the actual demanded elements.
15445 if (V1.hasOneUse())
15446 for (int i = 0; i != NumElts; ++i)
15447 if (!DemandedCrossLane[i])
15448 CrossLaneMask[i] = SM_SentinelUndef;
15449
15450 // Avoid returning the same shuffle operation. For example,
15451 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15452 // undef:v16i16
15453 if (CrossLaneMask == Mask || InLaneMask == Mask)
15454 return SDValue();
15455
15456 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15457 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15458 InLaneMask);
15459 };
15460
15461 // First attempt a solution with full lanes.
15462 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15463 return V;
15464
15465 // The rest of the solutions use sublanes.
15466 if (!CanUseSublanes)
15467 return SDValue();
15468
15469 // Then attempt a solution with 64-bit sublanes (vpermq).
15470 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15471 return V;
15472
15473 // If that doesn't work and we have fast variable cross-lane shuffle,
15474 // attempt 32-bit sublanes (vpermd).
15475 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15476 return SDValue();
15477
15478 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15479}
15480
15481/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15482static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15483 SmallVector<int> &InLaneMask) {
15484 int Size = Mask.size();
15485 InLaneMask.assign(Mask.begin(), Mask.end());
15486 for (int i = 0; i < Size; ++i) {
15487 int &M = InLaneMask[i];
15488 if (M < 0)
15489 continue;
15490 if (((M % Size) / LaneSize) != (i / LaneSize))
15491 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15492 }
15493}
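// Illustrative example for v8f32 (LaneSize == 4): the mask <4,5,6,7,0,1,2,3>
// becomes <8,9,10,11,12,13,14,15>, i.e. every cross-lane element is redirected
// to the same in-lane position of the second (lane-flipped) operand.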
15494
15495/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15496/// source with a lane permutation.
15497///
15498/// This lowering strategy results in four instructions in the worst case for a
15499/// single-input cross lane shuffle which is lower than any other fully general
15500/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15501/// shuffle pattern should be handled prior to trying this lowering.
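/// Illustrative sketch (an assumption, not from the original comment): for a
/// v8f32 single-input mask such as <4,1,6,3,0,5,2,7>, the code below swaps
/// the two 128-bit lanes of V1 and then shuffles V1 against the flipped copy
/// with the in-lane mask from computeInLaneShuffleMask, here
/// <8,1,10,3,12,5,14,7>.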
15502static SDValue lowerShuffleAsLanePermuteAndShuffle(
15503    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15504 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15505 // FIXME: This should probably be generalized for 512-bit vectors as well.
15506 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15507 int Size = Mask.size();
15508 int LaneSize = Size / 2;
15509
15510 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15511 // Only do this if the elements aren't all from the lower lane,
15512 // otherwise we're (probably) better off doing a split.
15513 if (VT == MVT::v4f64 &&
15514 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15515 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15516
15517 // If there are only inputs from one 128-bit lane, splitting will in fact be
15518 // less expensive. The flags track whether the given lane contains an element
15519  // that crosses to another lane (without AVX2) or, with AVX2, whether the lane is used at all.
15520 bool AllLanes;
15521 if (!Subtarget.hasAVX2()) {
15522 bool LaneCrossing[2] = {false, false};
15523 for (int i = 0; i < Size; ++i)
15524 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15525 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15526 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15527 } else {
15528 bool LaneUsed[2] = {false, false};
15529 for (int i = 0; i < Size; ++i)
15530 if (Mask[i] >= 0)
15531 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15532 AllLanes = LaneUsed[0] && LaneUsed[1];
15533 }
15534
15535 // TODO - we could support shuffling V2 in the Flipped input.
15536 assert(V2.isUndef() &&
15537 "This last part of this routine only works on single input shuffles");
15538
15539 SmallVector<int> InLaneMask;
15540 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15541
15542 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15543 "In-lane shuffle mask expected");
15544
15545 // If we're not using both lanes in each lane and the inlane mask is not
15546 // repeating, then we're better off splitting.
15547 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15548 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15549 /*SimpleOnly*/ false);
15550
15551 // Flip the lanes, and shuffle the results which should now be in-lane.
15552 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15553 SDValue Flipped = DAG.getBitcast(PVT, V1);
15554 Flipped =
15555 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15556 Flipped = DAG.getBitcast(VT, Flipped);
15557 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15558}
15559
15560/// Handle lowering 2-lane 128-bit shuffles.
15561static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15562                                  SDValue V2, ArrayRef<int> Mask,
15563 const APInt &Zeroable,
15564 const X86Subtarget &Subtarget,
15565 SelectionDAG &DAG) {
15566 if (V2.isUndef()) {
15567 // Attempt to match VBROADCAST*128 subvector broadcast load.
15568 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15569 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15570 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15571        X86::mayFoldLoad(V1, Subtarget)) {
15572      MVT MemVT = VT.getHalfNumVectorElementsVT();
15573 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15574      auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15575      if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15576                                             VT, MemVT, Ld, Ofs, DAG))
15577 return BcstLd;
15578 }
15579
15580 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15581 if (Subtarget.hasAVX2())
15582 return SDValue();
15583 }
15584
15585 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15586
15587 SmallVector<int, 4> WidenedMask;
15588 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15589 return SDValue();
15590
15591 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15592 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15593
15594 // Try to use an insert into a zero vector.
15595 if (WidenedMask[0] == 0 && IsHighZero) {
15596 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15597 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15598 DAG.getVectorIdxConstant(0, DL));
15599 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15600 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15601 DAG.getVectorIdxConstant(0, DL));
15602 }
15603
15604 // TODO: If minimizing size and one of the inputs is a zero vector and the
15605  // zero vector has only one use, we could use a VPERM2X128 to save the
15606 // instruction bytes needed to explicitly generate the zero vector.
15607
15608 // Blends are faster and handle all the non-lane-crossing cases.
15609 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15610 Subtarget, DAG))
15611 return Blend;
15612
15613 // If either input operand is a zero vector, use VPERM2X128 because its mask
15614 // allows us to replace the zero input with an implicit zero.
15615 if (!IsLowZero && !IsHighZero) {
15616 // Check for patterns which can be matched with a single insert of a 128-bit
15617 // subvector.
15618 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15619 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15620
15621 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15622 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15623      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15624        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15625 SDValue SubVec =
15626 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15627 DAG.getVectorIdxConstant(0, DL));
15628 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15629 DAG.getVectorIdxConstant(2, DL));
15630 }
15631 }
15632
15633 // Try to use SHUF128 if possible.
15634 if (Subtarget.hasVLX()) {
15635 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15636 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15637 ((WidenedMask[1] % 2) << 1);
15638 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15639 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15640 }
15641 }
15642 }
15643
15644 // Otherwise form a 128-bit permutation. After accounting for undefs,
15645 // convert the 64-bit shuffle mask selection values into 128-bit
15646 // selection bits by dividing the indexes by 2 and shifting into positions
15647 // defined by a vperm2*128 instruction's immediate control byte.
15648
15649 // The immediate permute control byte looks like this:
15650 // [1:0] - select 128 bits from sources for low half of destination
15651 // [2] - ignore
15652 // [3] - zero low half of destination
15653 // [5:4] - select 128 bits from sources for high half of destination
15654 // [6] - ignore
15655 // [7] - zero high half of destination
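  // Illustrative example (not part of the original comment): the v4f64 mask
  // <2,3,4,5> widens to WidenedMask = <1,2>, giving PermMask = 1 | (2 << 4) =
  // 0x21, i.e. the high half of V1 and the low half of V2. A zeroable half
  // sets bit 3 (0x08) or bit 7 (0x80) instead.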
15656
15657 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15658 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15659
15660 unsigned PermMask = 0;
15661 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15662 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15663
15664 // Check the immediate mask and replace unused sources with undef.
15665 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15666 V1 = DAG.getUNDEF(VT);
15667 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15668 V2 = DAG.getUNDEF(VT);
15669
15670 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15671 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15672}
15673
15674/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15675/// shuffling each lane.
15676///
15677/// This attempts to create a repeated lane shuffle where each lane uses one
15678/// or two of the lanes of the inputs. The lanes of the input vectors are
15679/// shuffled in one or two independent shuffles to get the lanes into the
15680/// position needed by the final shuffle.
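/// Illustrative example (an assumption, not from the original comment): the
/// v8f32 mask <4,12,5,13,0,8,1,9> repeats the in-lane pattern <0,8,1,9> in
/// both lanes once each lane's sources are permuted into place, so it lowers
/// to a lane permute of V1, a lane permute of V2, and a single lane-repeated
/// interleave shuffle of the two results.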
15681static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15682    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15683 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15684 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15685
15686 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15687 return SDValue();
15688
15689 int NumElts = Mask.size();
15690 int NumLanes = VT.getSizeInBits() / 128;
15691 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15692 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15693 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15694
15695 // First pass will try to fill in the RepeatMask from lanes that need two
15696 // sources.
15697 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15698 int Srcs[2] = {-1, -1};
15699 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15700 for (int i = 0; i != NumLaneElts; ++i) {
15701 int M = Mask[(Lane * NumLaneElts) + i];
15702 if (M < 0)
15703 continue;
15704 // Determine which of the possible input lanes (NumLanes from each source)
15705 // this element comes from. Assign that as one of the sources for this
15706      // lane. We can assign up to 2 sources for this lane. If we run out of
15707      // sources we can't do anything.
15708 int LaneSrc = M / NumLaneElts;
15709 int Src;
15710 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15711 Src = 0;
15712 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15713 Src = 1;
15714 else
15715 return SDValue();
15716
15717 Srcs[Src] = LaneSrc;
15718 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15719 }
15720
15721 // If this lane has two sources, see if it fits with the repeat mask so far.
15722 if (Srcs[1] < 0)
15723 continue;
15724
15725 LaneSrcs[Lane][0] = Srcs[0];
15726 LaneSrcs[Lane][1] = Srcs[1];
15727
15728 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15729 assert(M1.size() == M2.size() && "Unexpected mask size");
15730 for (int i = 0, e = M1.size(); i != e; ++i)
15731 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15732 return false;
15733 return true;
15734 };
15735
15736 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15737 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15738 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15739 int M = Mask[i];
15740 if (M < 0)
15741 continue;
15742 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15743 "Unexpected mask element");
15744 MergedMask[i] = M;
15745 }
15746 };
15747
15748 if (MatchMasks(InLaneMask, RepeatMask)) {
15749 // Merge this lane mask into the final repeat mask.
15750 MergeMasks(InLaneMask, RepeatMask);
15751 continue;
15752 }
15753
15754 // Didn't find a match. Swap the operands and try again.
15755 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15756    ShuffleVectorSDNode::commuteMask(InLaneMask);
15757
15758 if (MatchMasks(InLaneMask, RepeatMask)) {
15759 // Merge this lane mask into the final repeat mask.
15760 MergeMasks(InLaneMask, RepeatMask);
15761 continue;
15762 }
15763
15764 // Couldn't find a match with the operands in either order.
15765 return SDValue();
15766 }
15767
15768 // Now handle any lanes with only one source.
15769 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15770 // If this lane has already been processed, skip it.
15771 if (LaneSrcs[Lane][0] >= 0)
15772 continue;
15773
15774 for (int i = 0; i != NumLaneElts; ++i) {
15775 int M = Mask[(Lane * NumLaneElts) + i];
15776 if (M < 0)
15777 continue;
15778
15779      // If RepeatMask isn't defined yet, we can define it ourselves.
15780 if (RepeatMask[i] < 0)
15781 RepeatMask[i] = M % NumLaneElts;
15782
15783 if (RepeatMask[i] < NumElts) {
15784 if (RepeatMask[i] != M % NumLaneElts)
15785 return SDValue();
15786 LaneSrcs[Lane][0] = M / NumLaneElts;
15787 } else {
15788 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15789 return SDValue();
15790 LaneSrcs[Lane][1] = M / NumLaneElts;
15791 }
15792 }
15793
15794 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15795 return SDValue();
15796 }
15797
15798 SmallVector<int, 16> NewMask(NumElts, -1);
15799 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15800 int Src = LaneSrcs[Lane][0];
15801 for (int i = 0; i != NumLaneElts; ++i) {
15802 int M = -1;
15803 if (Src >= 0)
15804 M = Src * NumLaneElts + i;
15805 NewMask[Lane * NumLaneElts + i] = M;
15806 }
15807 }
15808 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15809 // Ensure we didn't get back the shuffle we started with.
15810 // FIXME: This is a hack to make up for some splat handling code in
15811 // getVectorShuffle.
15812 if (isa<ShuffleVectorSDNode>(NewV1) &&
15813 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15814 return SDValue();
15815
15816 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15817 int Src = LaneSrcs[Lane][1];
15818 for (int i = 0; i != NumLaneElts; ++i) {
15819 int M = -1;
15820 if (Src >= 0)
15821 M = Src * NumLaneElts + i;
15822 NewMask[Lane * NumLaneElts + i] = M;
15823 }
15824 }
15825 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15826 // Ensure we didn't get back the shuffle we started with.
15827 // FIXME: This is a hack to make up for some splat handling code in
15828 // getVectorShuffle.
15829 if (isa<ShuffleVectorSDNode>(NewV2) &&
15830 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15831 return SDValue();
15832
15833 for (int i = 0; i != NumElts; ++i) {
15834 if (Mask[i] < 0) {
15835 NewMask[i] = -1;
15836 continue;
15837 }
15838 NewMask[i] = RepeatMask[i % NumLaneElts];
15839 if (NewMask[i] < 0)
15840 continue;
15841
15842 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15843 }
15844 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15845}
15846
15847/// If the input shuffle mask results in a vector that is undefined in all upper
15848/// or lower half elements and that mask accesses only 2 halves of the
15849/// shuffle's operands, return true. A mask of half the width with mask indexes
15850/// adjusted to access the extracted halves of the original shuffle operands is
15851/// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
15852/// upper) of which input operand is accessed.
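/// Illustrative example (not part of the original comment): for the v8f32
/// mask <6,7,12,13,u,u,u,u> the upper half of the result is undef, the
/// defined elements come from the upper half of V1 (HalfIdx1 = 1) and the
/// upper half of V2 (HalfIdx2 = 3), and HalfMask = <2,3,4,5>.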
15853static bool
15854getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15855                   int &HalfIdx1, int &HalfIdx2) {
15856 assert((Mask.size() == HalfMask.size() * 2) &&
15857 "Expected input mask to be twice as long as output");
15858
15859 // Exactly one half of the result must be undef to allow narrowing.
15860 bool UndefLower = isUndefLowerHalf(Mask);
15861 bool UndefUpper = isUndefUpperHalf(Mask);
15862 if (UndefLower == UndefUpper)
15863 return false;
15864
15865 unsigned HalfNumElts = HalfMask.size();
15866 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15867 HalfIdx1 = -1;
15868 HalfIdx2 = -1;
15869 for (unsigned i = 0; i != HalfNumElts; ++i) {
15870 int M = Mask[i + MaskIndexOffset];
15871 if (M < 0) {
15872 HalfMask[i] = M;
15873 continue;
15874 }
15875
15876 // Determine which of the 4 half vectors this element is from.
15877 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15878 int HalfIdx = M / HalfNumElts;
15879
15880 // Determine the element index into its half vector source.
15881 int HalfElt = M % HalfNumElts;
15882
15883 // We can shuffle with up to 2 half vectors, set the new 'half'
15884 // shuffle mask accordingly.
15885 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15886 HalfMask[i] = HalfElt;
15887 HalfIdx1 = HalfIdx;
15888 continue;
15889 }
15890 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15891 HalfMask[i] = HalfElt + HalfNumElts;
15892 HalfIdx2 = HalfIdx;
15893 continue;
15894 }
15895
15896 // Too many half vectors referenced.
15897 return false;
15898 }
15899
15900 return true;
15901}
15902
15903/// Given the output values from getHalfShuffleMask(), create a half width
15904/// shuffle of extracted vectors followed by an insert back to full width.
15905static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15906                                     ArrayRef<int> HalfMask, int HalfIdx1,
15907 int HalfIdx2, bool UndefLower,
15908 SelectionDAG &DAG, bool UseConcat = false) {
15909 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15910 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15911
15912 MVT VT = V1.getSimpleValueType();
15913 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15914 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15915
15916 auto getHalfVector = [&](int HalfIdx) {
15917 if (HalfIdx < 0)
15918 return DAG.getUNDEF(HalfVT);
15919 SDValue V = (HalfIdx < 2 ? V1 : V2);
15920 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15921 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15922 DAG.getVectorIdxConstant(HalfIdx, DL));
15923 };
15924
15925 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15926 SDValue Half1 = getHalfVector(HalfIdx1);
15927 SDValue Half2 = getHalfVector(HalfIdx2);
15928 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15929 if (UseConcat) {
15930 SDValue Op0 = V;
15931 SDValue Op1 = DAG.getUNDEF(HalfVT);
15932 if (UndefLower)
15933 std::swap(Op0, Op1);
15934 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15935 }
15936
15937 unsigned Offset = UndefLower ? HalfNumElts : 0;
15938 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15939                     DAG.getVectorIdxConstant(Offset, DL));
15940}
15941
15942/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15943/// This allows for fast cases such as subvector extraction/insertion
15944/// or shuffling smaller vector types which can lower more efficiently.
15945static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15946                                         SDValue V2, ArrayRef<int> Mask,
15947 const X86Subtarget &Subtarget,
15948 SelectionDAG &DAG) {
15949 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15950 "Expected 256-bit or 512-bit vector");
15951
15952 bool UndefLower = isUndefLowerHalf(Mask);
15953 if (!UndefLower && !isUndefUpperHalf(Mask))
15954 return SDValue();
15955
15956 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15957 "Completely undef shuffle mask should have been simplified already");
15958
15959 // Upper half is undef and lower half is whole upper subvector.
15960 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15961 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15962 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15963 if (!UndefLower &&
15964 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15965 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15966 DAG.getVectorIdxConstant(HalfNumElts, DL));
15967 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15968 DAG.getVectorIdxConstant(0, DL));
15969 }
15970
15971 // Lower half is undef and upper half is whole lower subvector.
15972 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15973 if (UndefLower &&
15974 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15975 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15976 DAG.getVectorIdxConstant(0, DL));
15977 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15978 DAG.getVectorIdxConstant(HalfNumElts, DL));
15979 }
15980
15981 int HalfIdx1, HalfIdx2;
15982 SmallVector<int, 8> HalfMask(HalfNumElts);
15983 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15984 return SDValue();
15985
15986 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15987
15988 // Only shuffle the halves of the inputs when useful.
15989 unsigned NumLowerHalves =
15990 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15991 unsigned NumUpperHalves =
15992 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15993 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15994
15995 // Determine the larger pattern of undef/halves, then decide if it's worth
15996 // splitting the shuffle based on subtarget capabilities and types.
15997 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15998 if (!UndefLower) {
15999 // XXXXuuuu: no insert is needed.
16000 // Always extract lowers when setting lower - these are all free subreg ops.
16001 if (NumUpperHalves == 0)
16002 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16003 UndefLower, DAG);
16004
16005 if (NumUpperHalves == 1) {
16006 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16007 if (Subtarget.hasAVX2()) {
16008        // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16009 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16010 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16011 (!isSingleSHUFPSMask(HalfMask) ||
16012 Subtarget.hasFastVariableCrossLaneShuffle()))
16013 return SDValue();
16014        // If this is a unary shuffle (assume that the 2nd operand is
16015 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16016 // are better off extracting the upper half of 1 operand and using a
16017 // narrow shuffle.
16018 if (EltWidth == 64 && V2.isUndef())
16019 return SDValue();
16020        // If this is a unary vXi8 shuffle with in-place halves, then perform a
16021        // full width pshufb, and then merge.
16022 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16023 return SDValue();
16024 }
16025 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16026 if (Subtarget.hasAVX512() && VT.is512BitVector())
16027 return SDValue();
16028 // Extract + narrow shuffle is better than the wide alternative.
16029 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16030 UndefLower, DAG);
16031 }
16032
16033 // Don't extract both uppers, instead shuffle and then extract.
16034 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16035 return SDValue();
16036 }
16037
16038 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16039 if (NumUpperHalves == 0) {
16040 // AVX2 has efficient 64-bit element cross-lane shuffles.
16041 // TODO: Refine to account for unary shuffle, splat, and other masks?
16042 if (Subtarget.hasAVX2() && EltWidth == 64)
16043 return SDValue();
16044 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16045 if (Subtarget.hasAVX512() && VT.is512BitVector())
16046 return SDValue();
16047 // Narrow shuffle + insert is better than the wide alternative.
16048 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16049 UndefLower, DAG);
16050 }
16051
16052 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16053 return SDValue();
16054}
16055
16056/// Handle case where shuffle sources are coming from the same 128-bit lane and
16057/// every lane can be represented as the same repeating mask - allowing us to
16058/// shuffle the sources with the repeating shuffle and then permute the result
16059/// to the destination lanes.
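/// Illustrative example (an assumption, not from the original comment): the
/// v8f32 mask <2,3,0,1,2,3,0,1> sources every element from the low 128-bit
/// lane, so it can be lowered as the in-lane shuffle <2,3,0,1> followed by a
/// lane permute that broadcasts the low lane to both halves.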
16060static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16061    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16062 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16063 int NumElts = VT.getVectorNumElements();
16064 int NumLanes = VT.getSizeInBits() / 128;
16065 int NumLaneElts = NumElts / NumLanes;
16066
16067 // On AVX2 we may be able to just shuffle the lowest elements and then
16068 // broadcast the result.
16069 if (Subtarget.hasAVX2()) {
16070 for (unsigned BroadcastSize : {16, 32, 64}) {
16071 if (BroadcastSize <= VT.getScalarSizeInBits())
16072 continue;
16073 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16074
16075 // Attempt to match a repeating pattern every NumBroadcastElts,
16076      // accounting for UNDEFs, but only referencing the lowest 128-bit
16077      // lane of the inputs.
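      // Illustrative example (not part of the original comment): the v8i32
      // mask <1,0,1,0,1,0,1,0> matches a 64-bit repeating pattern, so shuffle
      // <1,0> into the lowest elements and broadcast with <0,1,0,1,0,1,0,1>.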
16078 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16079 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16080 for (int j = 0; j != NumBroadcastElts; ++j) {
16081 int M = Mask[i + j];
16082 if (M < 0)
16083 continue;
16084 int &R = RepeatMask[j];
16085 if (0 != ((M % NumElts) / NumLaneElts))
16086 return false;
16087 if (0 <= R && R != M)
16088 return false;
16089 R = M;
16090 }
16091 return true;
16092 };
16093
16094 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16095 if (!FindRepeatingBroadcastMask(RepeatMask))
16096 continue;
16097
16098 // Shuffle the (lowest) repeated elements in place for broadcast.
16099 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16100
16101 // Shuffle the actual broadcast.
16102 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16103 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16104 for (int j = 0; j != NumBroadcastElts; ++j)
16105 BroadcastMask[i + j] = j;
16106
16107 // Avoid returning the same shuffle operation. For example,
16108 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16109 if (BroadcastMask == Mask)
16110 return SDValue();
16111
16112 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16113 BroadcastMask);
16114 }
16115 }
16116
16117 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16118 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16119 return SDValue();
16120
16121 // Bail if we already have a repeated lane shuffle mask.
16122 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16123 return SDValue();
16124
16125  // Helper to look for a repeated mask in each split sublane, and check that
16126  // those sublanes can then be permuted into place.
16127 auto ShuffleSubLanes = [&](int SubLaneScale) {
16128 int NumSubLanes = NumLanes * SubLaneScale;
16129 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16130
16131 // Check that all the sources are coming from the same lane and see if we
16132 // can form a repeating shuffle mask (local to each sub-lane). At the same
16133 // time, determine the source sub-lane for each destination sub-lane.
16134 int TopSrcSubLane = -1;
16135 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16136 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16137 SubLaneScale,
16138 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16139
16140 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16141 // Extract the sub-lane mask, check that it all comes from the same lane
16142 // and normalize the mask entries to come from the first lane.
16143 int SrcLane = -1;
16144 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16145 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16146 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16147 if (M < 0)
16148 continue;
16149 int Lane = (M % NumElts) / NumLaneElts;
16150 if ((0 <= SrcLane) && (SrcLane != Lane))
16151 return SDValue();
16152 SrcLane = Lane;
16153 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16154 SubLaneMask[Elt] = LocalM;
16155 }
16156
16157 // Whole sub-lane is UNDEF.
16158 if (SrcLane < 0)
16159 continue;
16160
16161 // Attempt to match against the candidate repeated sub-lane masks.
16162 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16163 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16164 for (int i = 0; i != NumSubLaneElts; ++i) {
16165 if (M1[i] < 0 || M2[i] < 0)
16166 continue;
16167 if (M1[i] != M2[i])
16168 return false;
16169 }
16170 return true;
16171 };
16172
16173 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16174 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16175 continue;
16176
16177 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16178 for (int i = 0; i != NumSubLaneElts; ++i) {
16179 int M = SubLaneMask[i];
16180 if (M < 0)
16181 continue;
16182 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16183 "Unexpected mask element");
16184 RepeatedSubLaneMask[i] = M;
16185 }
16186
16187 // Track the top most source sub-lane - by setting the remaining to
16188 // UNDEF we can greatly simplify shuffle matching.
16189 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16190 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16191 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16192 break;
16193 }
16194
16195 // Bail if we failed to find a matching repeated sub-lane mask.
16196 if (Dst2SrcSubLanes[DstSubLane] < 0)
16197 return SDValue();
16198 }
16199 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16200 "Unexpected source lane");
16201
16202 // Create a repeating shuffle mask for the entire vector.
16203 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16204 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16205 int Lane = SubLane / SubLaneScale;
16206 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16207 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16208 int M = RepeatedSubLaneMask[Elt];
16209 if (M < 0)
16210 continue;
16211 int Idx = (SubLane * NumSubLaneElts) + Elt;
16212 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16213 }
16214 }
16215
16216 // Shuffle each source sub-lane to its destination.
16217 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16218 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16219 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16220 if (SrcSubLane < 0)
16221 continue;
16222 for (int j = 0; j != NumSubLaneElts; ++j)
16223 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16224 }
16225
16226 // Avoid returning the same shuffle operation.
16227 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16228 if (RepeatedMask == Mask || SubLaneMask == Mask)
16229 return SDValue();
16230
16231 SDValue RepeatedShuffle =
16232 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16233
16234 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16235 SubLaneMask);
16236 };
16237
16238 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16239 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16240 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16241 // Otherwise we can only permute whole 128-bit lanes.
16242 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16243 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16244 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16245 MinSubLaneScale = 2;
16246 MaxSubLaneScale =
16247 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16248 }
16249 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16250 MinSubLaneScale = MaxSubLaneScale = 4;
16251
16252 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16253 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16254 return Shuffle;
16255
16256 return SDValue();
16257}
16258
16259static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16260                                   bool &ForceV1Zero, bool &ForceV2Zero,
16261 unsigned &ShuffleImm, ArrayRef<int> Mask,
16262 const APInt &Zeroable) {
16263 int NumElts = VT.getVectorNumElements();
16264 assert(VT.getScalarSizeInBits() == 64 &&
16265 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16266 "Unexpected data type for VSHUFPD");
16267 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16268 "Illegal shuffle mask");
16269
16270 bool ZeroLane[2] = { true, true };
16271 for (int i = 0; i < NumElts; ++i)
16272 ZeroLane[i & 1] &= Zeroable[i];
16273
16274  // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
16275  // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
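  // Illustrative example (not part of the original comment): the v4f64 mask
  // <1,5,2,7> selects {V1[1],V2[1]} in the low 128-bit lane and {V1[2],V2[3]}
  // in the high lane, giving SHUFPDMask = <1,1,0,1> and an immediate of 0xB.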
16276 bool IsSHUFPD = true;
16277 bool IsCommutable = true;
16278 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16279 for (int i = 0; i < NumElts; ++i) {
16280 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16281 continue;
16282 if (Mask[i] < 0)
16283 return false;
16284 int Val = (i & 6) + NumElts * (i & 1);
16285 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16286 if (Mask[i] < Val || Mask[i] > Val + 1)
16287 IsSHUFPD = false;
16288 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16289 IsCommutable = false;
16290 SHUFPDMask[i] = Mask[i] % 2;
16291 }
16292
16293 if (!IsSHUFPD && !IsCommutable)
16294 return false;
16295
16296 if (!IsSHUFPD && IsCommutable)
16297 std::swap(V1, V2);
16298
16299 ForceV1Zero = ZeroLane[0];
16300 ForceV2Zero = ZeroLane[1];
16301 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16302 return true;
16303}
16304
16305static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16306                                      SDValue V2, ArrayRef<int> Mask,
16307 const APInt &Zeroable,
16308 const X86Subtarget &Subtarget,
16309 SelectionDAG &DAG) {
16310 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16311 "Unexpected data type for VSHUFPD");
16312
16313 unsigned Immediate = 0;
16314 bool ForceV1Zero = false, ForceV2Zero = false;
16315 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16316 Mask, Zeroable))
16317 return SDValue();
16318
16319 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16320 if (ForceV1Zero)
16321 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16322 if (ForceV2Zero)
16323 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16324
16325 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16326 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16327}
16328
16329// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16330// by zeroable elements in the remaining 24 elements. Turn this into two
16331// vmovqb instructions shuffled together.
16332static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16333                                             SDValue V1, SDValue V2,
16334 ArrayRef<int> Mask,
16335 const APInt &Zeroable,
16336 SelectionDAG &DAG) {
16337 assert(VT == MVT::v32i8 && "Unexpected type!");
16338
16339 // The first 8 indices should be every 8th element.
16340 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16341 return SDValue();
16342
16343 // Remaining elements need to be zeroable.
16344 if (Zeroable.countl_one() < (Mask.size() - 8))
16345 return SDValue();
16346
16347 V1 = DAG.getBitcast(MVT::v4i64, V1);
16348 V2 = DAG.getBitcast(MVT::v4i64, V2);
16349
16350 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16351 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16352
16353 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16354 // the upper bits of the result using an unpckldq.
16355 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16356 { 0, 1, 2, 3, 16, 17, 18, 19,
16357 4, 5, 6, 7, 20, 21, 22, 23 });
16358 // Insert the unpckldq into a zero vector to widen to v32i8.
16359 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16360 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16361 DAG.getVectorIdxConstant(0, DL));
16362}
16363
16364// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16365// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16366// =>
16367// ul = unpckl v1, v2
16368// uh = unpckh v1, v2
16369// a = vperm ul, uh
16370// b = vperm ul, uh
16371//
16372// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16373// and permute. We cannot directly match v3 because it is split into two
16374// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16375// pair of 256-bit shuffles and makes sure the masks are consecutive.
16376//
16377// Once unpck and permute nodes are created, the permute corresponding to this
16378// shuffle is returned, while the other permute replaces the other half of the
16379// shuffle in the selection dag.
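// Illustrative example (assumed masks, not from the original comment), for
// v8i32: mask1 = <0,8,1,9,2,10,3,11> matches IsInterleavingPattern(Mask, 0, 8)
// and mask2 = <4,12,5,13,6,14,7,15> matches IsInterleavingPattern(Mask, 4, 12).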
16380static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16381                                                 SDValue V1, SDValue V2,
16382 ArrayRef<int> Mask,
16383 SelectionDAG &DAG) {
16384 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16385 VT != MVT::v32i8)
16386 return SDValue();
16387 // <B0, B1, B0+1, B1+1, ..., >
16388 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16389 unsigned Begin1) {
16390 size_t Size = Mask.size();
16391 assert(Size % 2 == 0 && "Expected even mask size");
16392 for (unsigned I = 0; I < Size; I += 2) {
16393 if (Mask[I] != (int)(Begin0 + I / 2) ||
16394 Mask[I + 1] != (int)(Begin1 + I / 2))
16395 return false;
16396 }
16397 return true;
16398 };
16399  // Check which half of the interleave this shuffle node is.
16400 int NumElts = VT.getVectorNumElements();
16401 size_t FirstQtr = NumElts / 2;
16402 size_t ThirdQtr = NumElts + NumElts / 2;
16403 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16404 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16405 if (!IsFirstHalf && !IsSecondHalf)
16406 return SDValue();
16407
16408 // Find the intersection between shuffle users of V1 and V2.
16409 SmallVector<SDNode *, 2> Shuffles;
16410 for (SDNode *User : V1->users())
16411 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16412 User->getOperand(1) == V2)
16413 Shuffles.push_back(User);
16414  // Limit the number of shuffle users to two for now.
16415 if (Shuffles.size() != 2)
16416 return SDValue();
16417  // Find out which half of the 512-bit interleave each of the smaller shuffles is.
16418 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16419 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16420 SDNode *FirstHalf;
16421 SDNode *SecondHalf;
16422 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16423 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16424 FirstHalf = Shuffles[0];
16425 SecondHalf = Shuffles[1];
16426 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16427 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16428 FirstHalf = Shuffles[1];
16429 SecondHalf = Shuffles[0];
16430 } else {
16431 return SDValue();
16432 }
16433 // Lower into unpck and perm. Return the perm of this shuffle and replace
16434 // the other.
16435 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16436 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16437 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16438 DAG.getTargetConstant(0x20, DL, MVT::i8));
16439 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16440 DAG.getTargetConstant(0x31, DL, MVT::i8));
16441 if (IsFirstHalf) {
16442 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16443 return Perm1;
16444 }
16445 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16446 return Perm2;
16447}
16448
16449/// Handle lowering of 4-lane 64-bit floating point shuffles.
16450///
16451/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16452/// isn't available.
16453static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16454                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16455 const X86Subtarget &Subtarget,
16456 SelectionDAG &DAG) {
16457 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16458 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16459 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16460
16461 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16462 Subtarget, DAG))
16463 return V;
16464
16465 if (V2.isUndef()) {
16466 // Check for being able to broadcast a single element.
16467 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16468 Mask, Subtarget, DAG))
16469 return Broadcast;
16470
16471 // Use low duplicate instructions for masks that match their pattern.
16472 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16473 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16474
16475 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16476 // Non-half-crossing single input shuffles can be lowered with an
16477 // interleaved permutation.
16478 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16479 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16480 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16481 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16482 }
16483
16484 // With AVX2 we have direct support for this permutation.
16485 if (Subtarget.hasAVX2())
16486 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16487 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16488
16489 // Try to create an in-lane repeating shuffle mask and then shuffle the
16490 // results into the target lanes.
16491    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16492            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16493 return V;
16494
16495 // Try to permute the lanes and then use a per-lane permute.
16496 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16497 Mask, DAG, Subtarget))
16498 return V;
16499
16500 // Otherwise, fall back.
16501 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16502 DAG, Subtarget);
16503 }
16504
16505 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16506 Zeroable, Subtarget, DAG))
16507 return Blend;
16508
16509 // Use dedicated unpack instructions for masks that match their pattern.
16510 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16511 return V;
16512
16513 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16514 Zeroable, Subtarget, DAG))
16515 return Op;
16516
16517 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16518 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16519 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16520 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16521
16522 // If we have lane crossing shuffles AND they don't all come from the lower
16523 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16524 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16525 // canonicalize to a blend of splat which isn't necessary for this combine.
16526 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16527 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16528 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16529 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16530 (!Subtarget.hasAVX2() ||
16531 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16532 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16533
16534 // If we have one input in place, then we can permute the other input and
16535 // blend the result.
16536 if (V1IsInPlace || V2IsInPlace)
16537 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16538 Zeroable, Subtarget, DAG);
16539
16540 // Try to create an in-lane repeating shuffle mask and then shuffle the
16541 // results into the target lanes.
16542  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16543          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16544 return V;
16545
16546 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16547  // shuffle. However, if we have AVX2 and either input is already in place,
16548  // we will be able to shuffle the other input even across lanes in a single
16549  // instruction, so skip this pattern.
16550 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16551    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16552            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16553 return V;
16554
16555 // If we have VLX support, we can use VEXPAND.
16556 if (Subtarget.hasVLX())
16557 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16558 Zeroable, Subtarget, DAG))
16559 return V;
16560
16561  // If we have AVX2 then we always want to lower with a blend because at v4 we
16562 // can fully permute the elements.
16563 if (Subtarget.hasAVX2())
16564 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16565 Zeroable, Subtarget, DAG);
16566
16567 // Otherwise fall back on generic lowering.
16568 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16569 Subtarget, DAG);
16570}
16571
16572/// Handle lowering of 4-lane 64-bit integer shuffles.
16573///
16574/// This routine is only called when we have AVX2 and thus a reasonable
16575/// instruction set for v4i64 shuffling.
16576static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16577                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16578 const X86Subtarget &Subtarget,
16579 SelectionDAG &DAG) {
16580 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16581 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16582 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16583 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16584
16585 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16586 Subtarget, DAG))
16587 return V;
16588
16589 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16590 Zeroable, Subtarget, DAG))
16591 return Blend;
16592
16593 // Check for being able to broadcast a single element.
16594 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16595 Subtarget, DAG))
16596 return Broadcast;
16597
16598 // Try to use shift instructions if fast.
16599 if (Subtarget.preferLowerShuffleAsShift())
16600 if (SDValue Shift =
16601 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16602 Subtarget, DAG, /*BitwiseOnly*/ true))
16603 return Shift;
16604
16605 if (V2.isUndef()) {
16606    // When the shuffle is repeated identically in both 128-bit lanes, we can
16607    // use lower latency instructions that will operate on both lanes.
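    // Illustrative example (not part of the original comment): the v4i64 mask
    // <1,0,3,2> has the repeated per-lane pattern <1,0>, which expands to the
    // v8i32 PSHUFD mask <2,3,0,1>.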
16608 SmallVector<int, 2> RepeatedMask;
16609 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16610 SmallVector<int, 4> PSHUFDMask;
16611 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16612 return DAG.getBitcast(
16613 MVT::v4i64,
16614 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16615 DAG.getBitcast(MVT::v8i32, V1),
16616 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16617 }
16618
16619 // AVX2 provides a direct instruction for permuting a single input across
16620 // lanes.
16621 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16622 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16623 }
16624
16625 // Try to use shift instructions.
16626 if (SDValue Shift =
16627 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16628 DAG, /*BitwiseOnly*/ false))
16629 return Shift;
16630
16631 // If we have VLX support, we can use VALIGN or VEXPAND.
16632 if (Subtarget.hasVLX()) {
16633 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16634 Zeroable, Subtarget, DAG))
16635 return Rotate;
16636
16637 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16638 Zeroable, Subtarget, DAG))
16639 return V;
16640 }
16641
16642 // Try to use PALIGNR.
16643 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16644 Subtarget, DAG))
16645 return Rotate;
16646
16647 // Use dedicated unpack instructions for masks that match their pattern.
16648 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16649 return V;
16650
16651 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16652 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16653
16654 // If we have one input in place, then we can permute the other input and
16655 // blend the result.
16656 if (V1IsInPlace || V2IsInPlace)
16657 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16658 Zeroable, Subtarget, DAG);
16659
16660 // Try to create an in-lane repeating shuffle mask and then shuffle the
16661 // results into the target lanes.
16662  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16663          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16664 return V;
16665
16666 // Try to lower to PERMQ(BLENDD(V1,V2)).
16667 if (SDValue V =
16668 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16669 return V;
16670
16671 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16672  // shuffle. However, if we have AVX2 and either input is already in place,
16673  // we will be able to shuffle the other input even across lanes in a single
16674  // instruction, so skip this pattern.
16675 if (!V1IsInPlace && !V2IsInPlace)
16676    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16677            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16678 return Result;
16679
16680 // Otherwise fall back on generic blend lowering.
16681 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16682 Zeroable, Subtarget, DAG);
16683}
16684
16685/// Handle lowering of 8-lane 32-bit floating point shuffles.
16686///
16687/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16688/// isn't available.
16689static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16690                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16691 const X86Subtarget &Subtarget,
16692 SelectionDAG &DAG) {
16693 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16694 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16695 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16696
16697 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16698 Zeroable, Subtarget, DAG))
16699 return Blend;
16700
16701 // Check for being able to broadcast a single element.
16702 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16703 Subtarget, DAG))
16704 return Broadcast;
16705
16706 if (!Subtarget.hasAVX2()) {
16707 SmallVector<int> InLaneMask;
16708 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16709
16710 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16711 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16712 /*SimpleOnly*/ true))
16713 return R;
16714 }
16715 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16716 Zeroable, Subtarget, DAG))
16717 return DAG.getBitcast(MVT::v8f32, ZExt);
16718
16719 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16720 // options to efficiently lower the shuffle.
16721 SmallVector<int, 4> RepeatedMask;
16722 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16723 assert(RepeatedMask.size() == 4 &&
16724 "Repeated masks must be half the mask width!");
16725
16726 // Use even/odd duplicate instructions for masks that match their pattern.
16727 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16728 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16729 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16730 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16731
16732 if (V2.isUndef())
16733 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16734 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16735
16736 // Use dedicated unpack instructions for masks that match their pattern.
16737 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16738 return V;
16739
16740 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16741 // have already handled any direct blends.
16742 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16743 }
16744
16745 // Try to create an in-lane repeating shuffle mask and then shuffle the
16746 // results into the target lanes.
16747  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16748          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16749 return V;
16750
16751 // If we have a single input shuffle with different shuffle patterns in the
16752 // two 128-bit lanes use the variable mask to VPERMILPS.
16753 if (V2.isUndef()) {
16754 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16755 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16756 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16757 }
16758 if (Subtarget.hasAVX2()) {
16759 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16760 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16761 }
16762 // Otherwise, fall back.
16763 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16764 DAG, Subtarget);
16765 }
16766
16767 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16768 // shuffle.
16769  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16770          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16771 return Result;
16772
16773 // If we have VLX support, we can use VEXPAND.
16774 if (Subtarget.hasVLX())
16775 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16776 Zeroable, Subtarget, DAG))
16777 return V;
16778
16779 // Try to match an interleave of two v8f32s and lower them as unpck and
16780 // permutes using ymms. This needs to go before we try to split the vectors.
16781 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16782 if ((Subtarget.hasAVX2() ||
16785 !Subtarget.hasAVX512())
16786 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16787 Mask, DAG))
16788 return V;
16789
16790  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern, try
16791  // to split, since after splitting we get more efficient code using
16792  // vpunpcklwd and vpunpckhwd instructions than with vblend.
16793 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16794 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16795 Subtarget, DAG);
16796
16797 // If we have AVX2 then we always want to lower with a blend because at v8 we
16798 // can fully permute the elements.
16799 if (Subtarget.hasAVX2())
16800 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16801 Zeroable, Subtarget, DAG);
16802
16803 // Otherwise fall back on generic lowering.
16804 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16805 Subtarget, DAG);
16806}
16807
16808/// Handle lowering of 8-lane 32-bit integer shuffles.
16809///
16810/// This routine is only called when we have AVX2 and thus a reasonable
16811/// instruction set for v8i32 shuffling.
16812static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16813                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16814 const X86Subtarget &Subtarget,
16815 SelectionDAG &DAG) {
16816 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16817 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16818 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16819 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16820
16821 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16822
16823 // Whenever we can lower this as a zext, that instruction is strictly faster
16824 // than any alternative. It also allows us to fold memory operands into the
16825 // shuffle in many cases.
16826 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16827 Zeroable, Subtarget, DAG))
16828 return ZExt;
16829
16830 // Try to match an interleave of two v8i32s and lower them as unpck and
16831 // permutes using ymms. This needs to go before we try to split the vectors.
16832 if (!Subtarget.hasAVX512())
16833 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16834 Mask, DAG))
16835 return V;
16836
16837  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern, try
16838  // to split, since after splitting we get more efficient code than vblend by
16839  // using vpunpcklwd and vpunpckhwd instructions.
16840 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16841 !Subtarget.hasAVX512())
16842 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16843 Subtarget, DAG);
16844
16845 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16846 Zeroable, Subtarget, DAG))
16847 return Blend;
16848
16849 // Check for being able to broadcast a single element.
16850 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16851 Subtarget, DAG))
16852 return Broadcast;
16853
16854 // Try to use shift instructions if fast.
16855 if (Subtarget.preferLowerShuffleAsShift()) {
16856 if (SDValue Shift =
16857 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16858 Subtarget, DAG, /*BitwiseOnly*/ true))
16859 return Shift;
16860 if (NumV2Elements == 0)
16861 if (SDValue Rotate =
16862 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16863 return Rotate;
16864 }
16865
16866 // If the shuffle mask is repeated in each 128-bit lane we can use more
16867 // efficient instructions that mirror the shuffles across the two 128-bit
16868 // lanes.
16869 SmallVector<int, 4> RepeatedMask;
16870 bool Is128BitLaneRepeatedShuffle =
16871 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16872 if (Is128BitLaneRepeatedShuffle) {
16873 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16874 if (V2.isUndef())
16875 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16876 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16877
16878 // Use dedicated unpack instructions for masks that match their pattern.
16879 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16880 return V;
16881 }
16882
16883 // Try to use shift instructions.
16884 if (SDValue Shift =
16885 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16886 DAG, /*BitwiseOnly*/ false))
16887 return Shift;
16888
16889 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16890 if (SDValue Rotate =
16891 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16892 return Rotate;
16893
16894 // If we have VLX support, we can use VALIGN or EXPAND.
16895 if (Subtarget.hasVLX()) {
16896 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16897 Zeroable, Subtarget, DAG))
16898 return Rotate;
16899
16900 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16901 Zeroable, Subtarget, DAG))
16902 return V;
16903 }
16904
16905 // Try to use byte rotation instructions.
16906 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16907 Subtarget, DAG))
16908 return Rotate;
16909
16910 // Try to create an in-lane repeating shuffle mask and then shuffle the
16911 // results into the target lanes.
16912  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16913          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16914 return V;
16915
16916 if (V2.isUndef()) {
16917 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16918 // because that should be faster than the variable permute alternatives.
16919 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16920 return V;
16921
16922 // If the shuffle patterns aren't repeated but it's a single input, directly
16923 // generate a cross-lane VPERMD instruction.
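 // VPERMD takes its indices from a vector operand, so even a cross-lane mask
 // such as <7,6,5,4,3,2,1,0> needs only one instruction (plus, typically, a
 // constant-pool load for the index vector).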
16924 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16925 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16926 }
16927
16928 // Assume that a single SHUFPS is faster than an alternative sequence of
16929 // multiple instructions (even if the CPU has a domain penalty).
16930 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16931 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16932 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16933 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16934 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16935 CastV1, CastV2, DAG);
16936 return DAG.getBitcast(MVT::v8i32, ShufPS);
16937 }
16938
16939 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16940 // shuffle.
16941 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16942 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16943 return Result;
16944
16945 // Otherwise fall back on generic blend lowering.
16946 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16947 Zeroable, Subtarget, DAG);
16948}
16949
16950/// Handle lowering of 16-lane 16-bit integer shuffles.
16951///
16952/// This routine is only called when we have AVX2 and thus a reasonable
16953 /// instruction set for v16i16 shuffling.
16954 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16955 const APInt &Zeroable, SDValue V1, SDValue V2,
16956 const X86Subtarget &Subtarget,
16957 SelectionDAG &DAG) {
16958 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16959 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16960 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16961 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16962
16963 // Whenever we can lower this as a zext, that instruction is strictly faster
16964 // than any alternative. It also allows us to fold memory operands into the
16965 // shuffle in many cases.
16966 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16967 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16968 return ZExt;
16969
16970 // Check for being able to broadcast a single element.
16971 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16972 Subtarget, DAG))
16973 return Broadcast;
16974
16975 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16976 Zeroable, Subtarget, DAG))
16977 return Blend;
16978
16979 // Use dedicated unpack instructions for masks that match their pattern.
16980 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16981 return V;
16982
16983 // Use dedicated pack instructions for masks that match their pattern.
16984 if (SDValue V =
16985 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16986 return V;
16987
16988 // Try to lower using a truncation.
16989 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16990 Subtarget, DAG))
16991 return V;
16992
16993 // Try to use shift instructions.
16994 if (SDValue Shift =
16995 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16996 Subtarget, DAG, /*BitwiseOnly*/ false))
16997 return Shift;
16998
16999 // Try to use byte rotation instructions.
17000 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17001 Subtarget, DAG))
17002 return Rotate;
17003
17004 // Try to create an in-lane repeating shuffle mask and then shuffle the
17005 // results into the target lanes.
17006 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17007 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17008 return V;
17009
17010 if (V2.isUndef()) {
17011 // Try to use bit rotation instructions.
17012 if (SDValue Rotate =
17013 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17014 return Rotate;
17015
17016 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17017 // because that should be faster than the variable permute alternatives.
17018 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17019 return V;
17020
17021 // There are no generalized cross-lane shuffle operations available on i16
17022 // element types.
17023 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17024 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17025 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17026 return V;
17027
17028 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17029 DAG, Subtarget);
17030 }
17031
17032 SmallVector<int, 8> RepeatedMask;
17033 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17034 // As this is a single-input shuffle, the repeated mask should be
17035 // a strictly valid v8i16 mask that we can pass through to the v8i16
17036 // lowering to handle even the v16 case.
17037 return lowerV8I16GeneralSingleInputShuffle(
17038 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17039 }
17040 }
17041
17042 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17043 Zeroable, Subtarget, DAG))
17044 return PSHUFB;
17045
17046 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17047 if (Subtarget.hasBWI())
17048 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17049
17050 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17051 // shuffle.
17052 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17053 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17054 return Result;
17055
17056 // Try to permute the lanes and then use a per-lane permute.
17057 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17058 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17059 return V;
17060
17061 // Try to match an interleave of two v16i16s and lower them as unpck and
17062 // permutes using ymms.
17063 if (!Subtarget.hasAVX512())
17064 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17065 Mask, DAG))
17066 return V;
17067
17068 // Otherwise fall back on generic lowering.
17069 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17070 Subtarget, DAG);
17071}
17072
17073/// Handle lowering of 32-lane 8-bit integer shuffles.
17074///
17075/// This routine is only called when we have AVX2 and thus a reasonable
17076 /// instruction set for v32i8 shuffling.
17077 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17078 const APInt &Zeroable, SDValue V1, SDValue V2,
17079 const X86Subtarget &Subtarget,
17080 SelectionDAG &DAG) {
17081 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17082 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17083 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17084 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17085
17086 // Whenever we can lower this as a zext, that instruction is strictly faster
17087 // than any alternative. It also allows us to fold memory operands into the
17088 // shuffle in many cases.
17089 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17090 Zeroable, Subtarget, DAG))
17091 return ZExt;
17092
17093 // Check for being able to broadcast a single element.
17094 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17095 Subtarget, DAG))
17096 return Broadcast;
17097
17098 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17099 Zeroable, Subtarget, DAG))
17100 return Blend;
17101
17102 // Use dedicated unpack instructions for masks that match their pattern.
17103 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17104 return V;
17105
17106 // Use dedicated pack instructions for masks that match their pattern.
17107 if (SDValue V =
17108 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17109 return V;
17110
17111 // Try to lower using a truncation.
17112 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17113 Subtarget, DAG))
17114 return V;
17115
17116 // Try to use shift instructions.
17117 if (SDValue Shift =
17118 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17119 DAG, /*BitwiseOnly*/ false))
17120 return Shift;
17121
17122 // Try to use byte rotation instructions.
17123 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17124 Subtarget, DAG))
17125 return Rotate;
17126
17127 // Try to use bit rotation instructions.
17128 if (V2.isUndef())
17129 if (SDValue Rotate =
17130 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17131 return Rotate;
17132
17133 // Try to create an in-lane repeating shuffle mask and then shuffle the
17134 // results into the target lanes.
17135 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17136 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17137 return V;
17138
17139 // There are no generalized cross-lane shuffle operations available on i8
17140 // element types.
17141 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17142 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17143 // because that should be faster than the variable permute alternatives.
17144 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17145 return V;
17146
17147 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17148 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17149 return V;
17150
17151 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17152 DAG, Subtarget);
17153 }
17154
17155 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17156 Zeroable, Subtarget, DAG))
17157 return PSHUFB;
17158
17159 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17160 if (Subtarget.hasVBMI())
17161 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17162
17163 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17164 // shuffle.
17165 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17166 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17167 return Result;
17168
17169 // Try to permute the lanes and then use a per-lane permute.
17170 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17171 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17172 return V;
17173
17174 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17175 // by zeroable elements in the remaining 24 elements. Turn this into two
17176 // vmovqb instructions shuffled together.
17177 if (Subtarget.hasVLX())
17178 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17179 Mask, Zeroable, DAG))
17180 return V;
17181
17182 // Try to match an interleave of two v32i8s and lower them as unpck and
17183 // permutes using ymms.
17184 if (!Subtarget.hasAVX512())
17185 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17186 Mask, DAG))
17187 return V;
17188
17189 // Otherwise fall back on generic lowering.
17190 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17191 Subtarget, DAG);
17192}
17193
17194/// High-level routine to lower various 256-bit x86 vector shuffles.
17195///
17196/// This routine either breaks down the specific type of a 256-bit x86 vector
17197/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17198/// together based on the available instructions.
17199 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17200 SDValue V1, SDValue V2, const APInt &Zeroable,
17201 const X86Subtarget &Subtarget,
17202 SelectionDAG &DAG) {
17203 // If we have a single input to the zero element, insert that into V1 if we
17204 // can do so cheaply.
17205 int NumElts = VT.getVectorNumElements();
17206 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17207
17208 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17209 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17210 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17211 return Insertion;
17212
17213 // Handle special cases where the lower or upper half is UNDEF.
17214 if (SDValue V =
17215 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17216 return V;
17217
17218 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17219 // can check for those subtargets here and avoid much of the subtarget
17220 // querying in the per-vector-type lowering routines. With AVX1 we have
17221 // essentially *zero* ability to manipulate a 256-bit vector with integer
17222 // types. Since we'll use floating point types there eventually, just
17223 // immediately cast everything to a float and operate entirely in that domain.
17224 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17225 int ElementBits = VT.getScalarSizeInBits();
17226 if (ElementBits < 32) {
17227 // No floating point type available, if we can't use the bit operations
17228 // for masking/blending then decompose into 128-bit vectors.
17229 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17230 Subtarget, DAG))
17231 return V;
17232 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17233 return V;
17234 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17235 }
17236
17237 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17238 VT.getVectorNumElements());
17239 V1 = DAG.getBitcast(FpVT, V1);
17240 V2 = DAG.getBitcast(FpVT, V2);
17241 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17242 }
17243
17244 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17245 V1 = DAG.getBitcast(MVT::v16i16, V1);
17246 V2 = DAG.getBitcast(MVT::v16i16, V2);
17247 return DAG.getBitcast(VT,
17248 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17249 }
17250
17251 switch (VT.SimpleTy) {
17252 case MVT::v4f64:
17253 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17254 case MVT::v4i64:
17255 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v8f32:
17257 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v8i32:
17259 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v16i16:
17261 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262 case MVT::v32i8:
17263 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264
17265 default:
17266 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17267 }
17268}
17269
17270/// Try to lower a vector shuffle as a 128-bit shuffles.
17271 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17272 const APInt &Zeroable, SDValue V1, SDValue V2,
17273 const X86Subtarget &Subtarget,
17274 SelectionDAG &DAG) {
17275 assert(VT.getScalarSizeInBits() == 64 &&
17276 "Unexpected element type size for 128bit shuffle.");
17277
17278 // Handling a 256-bit vector requires VLX; for that case the function
17279 // lowerV2X128VectorShuffle() is most probably the better solution.
17280 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17281
17282 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17283 SmallVector<int, 4> Widened128Mask;
17284 if (!canWidenShuffleElements(Mask, Widened128Mask))
17285 return SDValue();
17286 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17287
17288 // Try to use an insert into a zero vector.
17289 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17290 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17291 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17292 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17293 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17294 DAG.getVectorIdxConstant(0, DL));
17295 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17296 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17297 DAG.getVectorIdxConstant(0, DL));
17298 }
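 // E.g. if only the low 128 or 256 bits of V1 survive and every element above
 // them is zeroable, the shuffle reduces to inserting that low subvector into
 // an all-zero 512-bit vector.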
17299
17300 // Check for patterns which can be matched with a single insert of a 256-bit
17301 // subvector.
17302 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17303 if (OnlyUsesV1 ||
17304 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17305 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17306 SDValue SubVec =
17307 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17308 DAG.getVectorIdxConstant(0, DL));
17309 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17310 DAG.getVectorIdxConstant(4, DL));
17311 }
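 // E.g. the mask <0,1,2,3,8,9,10,11> keeps V1's low 256 bits and places V2's
 // low 256 bits in the upper half, which is a single 256-bit subvector insert
 // (vinserti64x4/vinsertf64x4).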
17312
17313 // See if this is an insertion of the lower 128-bits of V2 into V1.
17314 bool IsInsert = true;
17315 int V2Index = -1;
17316 for (int i = 0; i < 4; ++i) {
17317 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17318 if (Widened128Mask[i] < 0)
17319 continue;
17320
17321 // Make sure all V1 subvectors are in place.
17322 if (Widened128Mask[i] < 4) {
17323 if (Widened128Mask[i] != i) {
17324 IsInsert = false;
17325 break;
17326 }
17327 } else {
17328 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17329 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17330 IsInsert = false;
17331 break;
17332 }
17333 V2Index = i;
17334 }
17335 }
17336 if (IsInsert && V2Index >= 0) {
17337 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17338 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17339 DAG.getVectorIdxConstant(0, DL));
17340 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17341 }
17342
17343 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17344 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17345 // possible we at least ensure the lanes stay sequential to help later
17346 // combines.
17347 SmallVector<int, 2> Widened256Mask;
17348 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17349 Widened128Mask.clear();
17350 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17351 }
17352
17353 // Try to lower to vshuf64x2/vshuf32x4.
17354 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17355 int PermMask[4] = {-1, -1, -1, -1};
17356 // Ensure elements came from the same Op.
17357 for (int i = 0; i < 4; ++i) {
17358 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17359 if (Widened128Mask[i] < 0)
17360 continue;
17361
17362 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17363 unsigned OpIndex = i / 2;
17364 if (Ops[OpIndex].isUndef())
17365 Ops[OpIndex] = Op;
17366 else if (Ops[OpIndex] != Op)
17367 return SDValue();
17368
17369 PermMask[i] = Widened128Mask[i] % 4;
17370 }
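 // vshuf64x2/vshuf32x4 build result lanes 0-1 from the first source and lanes
 // 2-3 from the second, with each 2-bit immediate field selecting one of the
 // four 128-bit lanes of that source; PermMask holds those selectors.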
17371
17372 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17373 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17374}
17375
17376/// Handle lowering of 8-lane 64-bit floating point shuffles.
17377 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17378 const APInt &Zeroable, SDValue V1, SDValue V2,
17379 const X86Subtarget &Subtarget,
17380 SelectionDAG &DAG) {
17381 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17382 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17383 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17384
17385 if (V2.isUndef()) {
17386 // Use low duplicate instructions for masks that match their pattern.
17387 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17388 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17389
17390 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17391 // Non-half-crossing single input shuffles can be lowered with an
17392 // interleaved permutation.
17393 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17394 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17395 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17396 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
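 // Each immediate bit i selects the even (0) or odd (1) element of the pair
 // that result element i belongs to; e.g. the mask <1,0,3,2,5,4,7,6> yields
 // the immediate 0x55.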
17397 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17398 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17399 }
17400
17401 SmallVector<int, 4> RepeatedMask;
17402 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17403 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17404 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17405 }
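 // E.g. the mask <3,2,1,0,7,6,5,4> repeats <3,2,1,0> in both 256-bit lanes and
 // lowers to a single VPERMPD with immediate 0x1B.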
17406
17407 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17408 V2, Subtarget, DAG))
17409 return Shuf128;
17410
17411 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17412 return Unpck;
17413
17414 // Check if the blend happens to exactly fit that of SHUFPD.
17415 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17416 Zeroable, Subtarget, DAG))
17417 return Op;
17418
17419 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17420 Subtarget, DAG))
17421 return V;
17422
17423 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17424 Zeroable, Subtarget, DAG))
17425 return Blend;
17426
17427 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17428}
17429
17430/// Handle lowering of 16-lane 32-bit floating point shuffles.
17431 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17432 const APInt &Zeroable, SDValue V1, SDValue V2,
17433 const X86Subtarget &Subtarget,
17434 SelectionDAG &DAG) {
17435 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17436 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17437 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17438
17439 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17440 // options to efficiently lower the shuffle.
17441 SmallVector<int, 4> RepeatedMask;
17442 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17443 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17444
17445 // Use even/odd duplicate instructions for masks that match their pattern.
17446 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17447 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17448 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17449 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17450
17451 if (V2.isUndef())
17452 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17453 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17454
17455 // Use dedicated unpack instructions for masks that match their pattern.
17456 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17457 return V;
17458
17459 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17460 Zeroable, Subtarget, DAG))
17461 return Blend;
17462
17463 // Otherwise, fall back to a SHUFPS sequence.
17464 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17465 }
17466
17467 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17468 Zeroable, Subtarget, DAG))
17469 return Blend;
17470
17471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17472 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17473 return DAG.getBitcast(MVT::v16f32, ZExt);
17474
17475 // Try to create an in-lane repeating shuffle mask and then shuffle the
17476 // results into the target lanes.
17477 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17478 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17479 return V;
17480
17481 // If we have a single input shuffle with different shuffle patterns in the
17482 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17483 if (V2.isUndef() &&
17484 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17485 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17486 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17487 }
17488
17489 // If we have AVX512F support, we can use VEXPAND.
17490 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17491 Zeroable, Subtarget, DAG))
17492 return V;
17493
17494 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17495}
17496
17497/// Handle lowering of 8-lane 64-bit integer shuffles.
17498 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17499 const APInt &Zeroable, SDValue V1, SDValue V2,
17500 const X86Subtarget &Subtarget,
17501 SelectionDAG &DAG) {
17502 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17503 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17504 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17505
17506 // Try to use shift instructions if fast.
17507 if (Subtarget.preferLowerShuffleAsShift())
17508 if (SDValue Shift =
17509 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17510 Subtarget, DAG, /*BitwiseOnly*/ true))
17511 return Shift;
17512
17513 if (V2.isUndef()) {
17514 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17515 // can use lower latency instructions that will operate on all four
17516 // 128-bit lanes.
17517 SmallVector<int, 2> Repeated128Mask;
17518 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17519 SmallVector<int, 4> PSHUFDMask;
17520 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17521 return DAG.getBitcast(
17522 MVT::v8i64,
17523 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17524 DAG.getBitcast(MVT::v16i32, V1),
17525 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17526 }
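 // E.g. the v8i64 mask <1,0,3,2,5,4,7,6> repeats <1,0> per 128-bit lane and
 // becomes a v16i32 PSHUFD with the per-lane dword mask <2,3,0,1>.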
17527
17528 SmallVector<int, 4> Repeated256Mask;
17529 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17530 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17531 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17532 }
17533
17534 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17535 V2, Subtarget, DAG))
17536 return Shuf128;
17537
17538 // Try to use shift instructions.
17539 if (SDValue Shift =
17540 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17541 DAG, /*BitwiseOnly*/ false))
17542 return Shift;
17543
17544 // Try to use VALIGN.
17545 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17546 Zeroable, Subtarget, DAG))
17547 return Rotate;
17548
17549 // Try to use PALIGNR.
17550 if (Subtarget.hasBWI())
17551 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17552 Subtarget, DAG))
17553 return Rotate;
17554
17555 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17556 return Unpck;
17557
17558 // If we have AVX512F support, we can use VEXPAND.
17559 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17560 Subtarget, DAG))
17561 return V;
17562
17563 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17564 Zeroable, Subtarget, DAG))
17565 return Blend;
17566
17567 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17568}
17569
17570/// Handle lowering of 16-lane 32-bit integer shuffles.
17571 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17572 const APInt &Zeroable, SDValue V1, SDValue V2,
17573 const X86Subtarget &Subtarget,
17574 SelectionDAG &DAG) {
17575 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17576 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17577 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17578
17579 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17580
17581 // Whenever we can lower this as a zext, that instruction is strictly faster
17582 // than any alternative. It also allows us to fold memory operands into the
17583 // shuffle in many cases.
17584 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17585 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17586 return ZExt;
17587
17588 // Try to use shift instructions if fast.
17589 if (Subtarget.preferLowerShuffleAsShift()) {
17590 if (SDValue Shift =
17591 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17592 Subtarget, DAG, /*BitwiseOnly*/ true))
17593 return Shift;
17594 if (NumV2Elements == 0)
17595 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17596 Subtarget, DAG))
17597 return Rotate;
17598 }
17599
17600 // If the shuffle mask is repeated in each 128-bit lane we can use more
17601 // efficient instructions that mirror the shuffles across the four 128-bit
17602 // lanes.
17603 SmallVector<int, 4> RepeatedMask;
17604 bool Is128BitLaneRepeatedShuffle =
17605 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17606 if (Is128BitLaneRepeatedShuffle) {
17607 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17608 if (V2.isUndef())
17609 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17610 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17611
17612 // Use dedicated unpack instructions for masks that match their pattern.
17613 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17614 return V;
17615 }
17616
17617 // Try to use shift instructions.
17618 if (SDValue Shift =
17619 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17620 Subtarget, DAG, /*BitwiseOnly*/ false))
17621 return Shift;
17622
17623 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17624 if (SDValue Rotate =
17625 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17626 return Rotate;
17627
17628 // Try to use VALIGN.
17629 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17630 Zeroable, Subtarget, DAG))
17631 return Rotate;
17632
17633 // Try to use byte rotation instructions.
17634 if (Subtarget.hasBWI())
17635 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17636 Subtarget, DAG))
17637 return Rotate;
17638
17639 // Assume that a single SHUFPS is faster than using a permv shuffle.
17640 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17641 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17642 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17643 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17644 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17645 CastV1, CastV2, DAG);
17646 return DAG.getBitcast(MVT::v16i32, ShufPS);
17647 }
17648
17649 // Try to create an in-lane repeating shuffle mask and then shuffle the
17650 // results into the target lanes.
17651 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17652 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17653 return V;
17654
17655 // If we have AVX512F support, we can use VEXPAND.
17656 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17657 Zeroable, Subtarget, DAG))
17658 return V;
17659
17660 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17661 Zeroable, Subtarget, DAG))
17662 return Blend;
17663
17664 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17665}
17666
17667/// Handle lowering of 32-lane 16-bit integer shuffles.
17668 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17669 const APInt &Zeroable, SDValue V1, SDValue V2,
17670 const X86Subtarget &Subtarget,
17671 SelectionDAG &DAG) {
17672 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17673 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17674 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17675 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17676
17677 // Whenever we can lower this as a zext, that instruction is strictly faster
17678 // than any alternative. It also allows us to fold memory operands into the
17679 // shuffle in many cases.
17680 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17681 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17682 return ZExt;
17683
17684 // Use dedicated unpack instructions for masks that match their pattern.
17685 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17686 return V;
17687
17688 // Use dedicated pack instructions for masks that match their pattern.
17689 if (SDValue V =
17690 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17691 return V;
17692
17693 // Try to use shift instructions.
17694 if (SDValue Shift =
17695 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17696 Subtarget, DAG, /*BitwiseOnly*/ false))
17697 return Shift;
17698
17699 // Try to use byte rotation instructions.
17700 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17701 Subtarget, DAG))
17702 return Rotate;
17703
17704 if (V2.isUndef()) {
17705 // Try to use bit rotation instructions.
17706 if (SDValue Rotate =
17707 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17708 return Rotate;
17709
17710 SmallVector<int, 8> RepeatedMask;
17711 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17712 // As this is a single-input shuffle, the repeated mask should be
17713 // a strictly valid v8i16 mask that we can pass through to the v8i16
17714 // lowering to handle even the v32 case.
17715 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17716 RepeatedMask, Subtarget, DAG);
17717 }
17718 }
17719
17720 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17721 Zeroable, Subtarget, DAG))
17722 return Blend;
17723
17724 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17725 Zeroable, Subtarget, DAG))
17726 return PSHUFB;
17727
17728 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17729 // shuffle.
17730 if (!V2.isUndef())
17731 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17732 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17733 return Result;
17734
17735 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17736}
17737
17738/// Handle lowering of 64-lane 8-bit integer shuffles.
17739 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17740 const APInt &Zeroable, SDValue V1, SDValue V2,
17741 const X86Subtarget &Subtarget,
17742 SelectionDAG &DAG) {
17743 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17744 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17745 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17746 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17747
17748 // Whenever we can lower this as a zext, that instruction is strictly faster
17749 // than any alternative. It also allows us to fold memory operands into the
17750 // shuffle in many cases.
17751 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17752 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17753 return ZExt;
17754
17755 // Use dedicated unpack instructions for masks that match their pattern.
17756 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17757 return V;
17758
17759 // Use dedicated pack instructions for masks that match their pattern.
17760 if (SDValue V =
17761 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17762 return V;
17763
17764 // Try to use shift instructions.
17765 if (SDValue Shift =
17766 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17767 DAG, /*BitwiseOnly*/ false))
17768 return Shift;
17769
17770 // Try to use byte rotation instructions.
17771 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17772 Subtarget, DAG))
17773 return Rotate;
17774
17775 // Try to use bit rotation instructions.
17776 if (V2.isUndef())
17777 if (SDValue Rotate =
17778 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17779 return Rotate;
17780
17781 // Lower as AND if possible.
17782 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17783 Zeroable, Subtarget, DAG))
17784 return Masked;
17785
17786 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17787 Zeroable, Subtarget, DAG))
17788 return PSHUFB;
17789
17790 // Try to create an in-lane repeating shuffle mask and then shuffle the
17791 // results into the target lanes.
17792 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17793 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17794 return V;
17795
17796 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17797 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17798 return Result;
17799
17800 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17801 Zeroable, Subtarget, DAG))
17802 return Blend;
17803
17804 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17805 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17806 // PALIGNR will be cheaper than the second PSHUFB+OR.
17807 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17808 Mask, Subtarget, DAG))
17809 return V;
17810
17811 // If we can't directly blend but can use PSHUFB, that will be better as it
17812 // can both shuffle and set up the inefficient blend.
17813 bool V1InUse, V2InUse;
17814 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17815 DAG, V1InUse, V2InUse);
17816 }
17817
17818 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17819 // shuffle.
17820 if (!V2.isUndef())
17821 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17822 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17823 return Result;
17824
17825 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17826 if (Subtarget.hasVBMI())
17827 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17828
17829 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17830}
17831
17832/// High-level routine to lower various 512-bit x86 vector shuffles.
17833///
17834/// This routine either breaks down the specific type of a 512-bit x86 vector
17835/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17836/// together based on the available instructions.
17837 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17838 MVT VT, SDValue V1, SDValue V2,
17839 const APInt &Zeroable,
17840 const X86Subtarget &Subtarget,
17841 SelectionDAG &DAG) {
17842 assert(Subtarget.hasAVX512() &&
17843 "Cannot lower 512-bit vectors w/ basic ISA!");
17844
17845 // If we have a single input to the zero element, insert that into V1 if we
17846 // can do so cheaply.
17847 int NumElts = Mask.size();
17848 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17849
17850 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17851 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17852 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17853 return Insertion;
17854
17855 // Handle special cases where the lower or upper half is UNDEF.
17856 if (SDValue V =
17857 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17858 return V;
17859
17860 // Check for being able to broadcast a single element.
17861 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17862 Subtarget, DAG))
17863 return Broadcast;
17864
17865 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17866 // Try using bit ops for masking and blending before falling back to
17867 // splitting.
17868 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17869 Subtarget, DAG))
17870 return V;
17871 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17872 return V;
17873
17874 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17875 }
17876
17877 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17878 if (!Subtarget.hasBWI())
17879 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17880 /*SimpleOnly*/ false);
17881
17882 V1 = DAG.getBitcast(MVT::v32i16, V1);
17883 V2 = DAG.getBitcast(MVT::v32i16, V2);
17884 return DAG.getBitcast(VT,
17885 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17886 }
17887
17888 // Dispatch to each element type for lowering. If we don't have support for
17889 // specific element type shuffles at 512 bits, immediately split them and
17890 // lower them. Each lowering routine of a given type is allowed to assume that
17891 // the requisite ISA extensions for that element type are available.
17892 switch (VT.SimpleTy) {
17893 case MVT::v8f64:
17894 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17895 case MVT::v16f32:
17896 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17897 case MVT::v8i64:
17898 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17899 case MVT::v16i32:
17900 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17901 case MVT::v32i16:
17902 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17903 case MVT::v64i8:
17904 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905
17906 default:
17907 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17908 }
17909}
17910
17911 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17912 MVT VT, SDValue V1, SDValue V2,
17913 const X86Subtarget &Subtarget,
17914 SelectionDAG &DAG) {
17915 // Shuffle should be unary.
17916 if (!V2.isUndef())
17917 return SDValue();
17918
17919 int ShiftAmt = -1;
17920 int NumElts = Mask.size();
17921 for (int i = 0; i != NumElts; ++i) {
17922 int M = Mask[i];
17923 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17924 "Unexpected mask index.");
17925 if (M < 0)
17926 continue;
17927
17928 // The first non-undef element determines our shift amount.
17929 if (ShiftAmt < 0) {
17930 ShiftAmt = M - i;
17931 // Need to be shifting right.
17932 if (ShiftAmt <= 0)
17933 return SDValue();
17934 }
17935 // All non-undef elements must shift by the same amount.
17936 if (ShiftAmt != M - i)
17937 return SDValue();
17938 }
17939 assert(ShiftAmt >= 0 && "All undef?");
17940
17941 // Great we found a shift right.
17942 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17943 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17944 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17945 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17946 DAG.getVectorIdxConstant(0, DL));
17947}
17948
17949// Determine if this shuffle can be implemented with a KSHIFT instruction.
17950// Returns the shift amount if possible or -1 if not. This is a simplified
17951// version of matchShuffleAsShift.
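 // E.g. with MaskOffset 0, the v8i1 mask <2,3,4,5,6,7,Z,Z> (Z = zeroable)
 // matches KSHIFTR by 2, and <Z,Z,0,1,2,3,4,5> matches KSHIFTL by 2.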
17952static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17953 int MaskOffset, const APInt &Zeroable) {
17954 int Size = Mask.size();
17955
17956 auto CheckZeros = [&](int Shift, bool Left) {
17957 for (int j = 0; j < Shift; ++j)
17958 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17959 return false;
17960
17961 return true;
17962 };
17963
17964 auto MatchShift = [&](int Shift, bool Left) {
17965 unsigned Pos = Left ? Shift : 0;
17966 unsigned Low = Left ? 0 : Shift;
17967 unsigned Len = Size - Shift;
17968 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17969 };
17970
17971 for (int Shift = 1; Shift != Size; ++Shift)
17972 for (bool Left : {true, false})
17973 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17974 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17975 return Shift;
17976 }
17977
17978 return -1;
17979}
17980
17981
17982// Lower vXi1 vector shuffles.
17983 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17984 // The only way to shuffle the bits is to sign-extend the mask vector to a SIMD
17985 // vector, shuffle that, and then truncate it back.
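 // E.g. a v8i1 shuffle is sign-extended to v8i64 (or v8i32 with VLX), shuffled
 // in that wider type, and then converted back to a mask.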
17986 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17987 MVT VT, SDValue V1, SDValue V2,
17988 const APInt &Zeroable,
17989 const X86Subtarget &Subtarget,
17990 SelectionDAG &DAG) {
17991 assert(Subtarget.hasAVX512() &&
17992 "Cannot lower 512-bit vectors w/o basic ISA!");
17993
17994 int NumElts = Mask.size();
17995 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17996
17997 // Try to recognize shuffles that are just padding a subvector with zeros.
17998 int SubvecElts = 0;
17999 int Src = -1;
18000 for (int i = 0; i != NumElts; ++i) {
18001 if (Mask[i] >= 0) {
18002 // Grab the source from the first valid mask element. All subsequent
18003 // elements need to use this same source.
18004 if (Src < 0)
18005 Src = Mask[i] / NumElts;
18006 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18007 break;
18008 }
18009
18010 ++SubvecElts;
18011 }
18012 assert(SubvecElts != NumElts && "Identity shuffle?");
18013
18014 // Clip to a power of 2.
18015 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18016
18017 // Make sure the number of zeroable bits in the top at least covers the bits
18018 // not covered by the subvector.
18019 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18020 assert(Src >= 0 && "Expected a source!");
18021 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18022 SDValue Extract =
18023 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18024 DAG.getVectorIdxConstant(0, DL));
18025 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18026 DAG.getConstant(0, DL, VT), Extract,
18027 DAG.getVectorIdxConstant(0, DL));
18028 }
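 // E.g. the v16i1 mask <0,1,2,3,4,5,6,7,Z,Z,Z,Z,Z,Z,Z,Z> extracts V1's low
 // v8i1 subvector and inserts it into an all-zero v16i1.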
18029
18030 // Try a simple shift right with undef elements. Later we'll try with zeros.
18031 if (SDValue Shift =
18032 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18033 return Shift;
18034
18035 // Try to match KSHIFTs.
18036 unsigned Offset = 0;
18037 for (SDValue V : {V1, V2}) {
18038 unsigned Opcode;
18039 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18040 if (ShiftAmt >= 0) {
18041 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18042 MVT WideVT = Res.getSimpleValueType();
18043 // Widened right shifts need two shifts to ensure we shift in zeroes.
18044 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18045 int WideElts = WideVT.getVectorNumElements();
18046 // Shift left to put the original vector in the MSBs of the new size.
18047 Res =
18048 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18049 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18050 // Increase the shift amount to account for the left shift.
18051 ShiftAmt += WideElts - NumElts;
18052 }
18053
18054 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18055 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18057 DAG.getVectorIdxConstant(0, DL));
18058 }
18059 Offset += NumElts; // Increment for next iteration.
18060 }
18061
18062 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18063 // ops instead.
18064 // TODO: What other unary shuffles would benefit from this?
18065 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18066 SDValue Op0 = V1.getOperand(0);
18067 SDValue Op1 = V1.getOperand(1);
18068 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18069 EVT OpVT = Op0.getValueType();
18070 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18071 return DAG.getSetCC(
18072 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18073 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18074 }
18075
18076 MVT ExtVT;
18077 switch (VT.SimpleTy) {
18078 default:
18079 llvm_unreachable("Expected a vector of i1 elements");
18080 case MVT::v2i1:
18081 ExtVT = MVT::v2i64;
18082 break;
18083 case MVT::v4i1:
18084 ExtVT = MVT::v4i32;
18085 break;
18086 case MVT::v8i1:
18087 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18088 // shuffle.
18089 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18090 break;
18091 case MVT::v16i1:
18092 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18093 // 256-bit operation available.
18094 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18095 break;
18096 case MVT::v32i1:
18097 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18098 // 256-bit operation available.
18099 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18100 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18101 break;
18102 case MVT::v64i1:
18103 // Fall back to scalarization. FIXME: We can do better if the shuffle
18104 // can be partitioned cleanly.
18105 if (!Subtarget.useBWIRegs())
18106 return SDValue();
18107 ExtVT = MVT::v64i8;
18108 break;
18109 }
18110
18111 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18112 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18113
18114 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18115 // i1 was sign-extended, so we can convert back to a mask by comparing against zero.
18116 int NumElems = VT.getVectorNumElements();
18117 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18118 (Subtarget.hasDQI() && (NumElems < 32)))
18119 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18120 Shuffle, ISD::SETGT);
18121
18122 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18123}
18124
18125/// Helper function that returns true if the shuffle mask should be
18126/// commuted to improve canonicalization.
18127 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18128 int NumElements = Mask.size();
18129
18130 int NumV1Elements = 0, NumV2Elements = 0;
18131 for (int M : Mask)
18132 if (M < 0)
18133 continue;
18134 else if (M < NumElements)
18135 ++NumV1Elements;
18136 else
18137 ++NumV2Elements;
18138
18139 // Commute the shuffle as needed such that more elements come from V1 than
18140 // V2. This allows us to match the shuffle pattern strictly on how many
18141 // elements come from V1 without handling the symmetric cases.
18142 if (NumV2Elements > NumV1Elements)
18143 return true;
18144
18145 assert(NumV1Elements > 0 && "No V1 indices");
18146
18147 if (NumV2Elements == 0)
18148 return false;
18149
18150 // When the number of V1 and V2 elements is the same, try to minimize the
18151 // number of uses of V2 in the low half of the vector. When that is tied,
18152 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18153 // indices for V2. When those are equal, try to ensure that the number of odd
18154 // indices for V1 is lower than the number of odd indices for V2.
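 // E.g. the four-element mask <4,5,0,1> uses two elements from each source,
 // but both low-half elements come from V2, so the shuffle is commuted to
 // <0,1,4,5> with the operands swapped.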
18155 if (NumV1Elements == NumV2Elements) {
18156 int LowV1Elements = 0, LowV2Elements = 0;
18157 for (int M : Mask.slice(0, NumElements / 2))
18158 if (M >= NumElements)
18159 ++LowV2Elements;
18160 else if (M >= 0)
18161 ++LowV1Elements;
18162 if (LowV2Elements > LowV1Elements)
18163 return true;
18164 if (LowV2Elements == LowV1Elements) {
18165 int SumV1Indices = 0, SumV2Indices = 0;
18166 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18167 if (Mask[i] >= NumElements)
18168 SumV2Indices += i;
18169 else if (Mask[i] >= 0)
18170 SumV1Indices += i;
18171 if (SumV2Indices < SumV1Indices)
18172 return true;
18173 if (SumV2Indices == SumV1Indices) {
18174 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18175 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18176 if (Mask[i] >= NumElements)
18177 NumV2OddIndices += i % 2;
18178 else if (Mask[i] >= 0)
18179 NumV1OddIndices += i % 2;
18180 if (NumV2OddIndices < NumV1OddIndices)
18181 return true;
18182 }
18183 }
18184 }
18185
18186 return false;
18187}
18188
18189 static bool canCombineAsMaskOperation(SDValue V,
18190 const X86Subtarget &Subtarget) {
18191 if (!Subtarget.hasAVX512())
18192 return false;
18193
18194 if (!V.getValueType().isSimple())
18195 return false;
18196
18197 MVT VT = V.getSimpleValueType().getScalarType();
18198 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18199 return false;
18200
18201 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18202 // are preferable to blendw/blendvb/masked-mov.
18203 if ((VT == MVT::i16 || VT == MVT::i8) &&
18204 V.getSimpleValueType().getSizeInBits() < 512)
18205 return false;
18206
18207 auto HasMaskOperation = [&](SDValue V) {
18208 // TODO: Currently we only check a limited set of opcodes. We could probably
18209 // extend this to all binary operations by checking TLI.isBinOp().
18210 switch (V->getOpcode()) {
18211 default:
18212 return false;
18213 case ISD::ADD:
18214 case ISD::SUB:
18215 case ISD::AND:
18216 case ISD::XOR:
18217 case ISD::OR:
18218 case ISD::SMAX:
18219 case ISD::SMIN:
18220 case ISD::UMAX:
18221 case ISD::UMIN:
18222 case ISD::ABS:
18223 case ISD::SHL:
18224 case ISD::SRL:
18225 case ISD::SRA:
18226 case ISD::MUL:
18227 break;
18228 }
18229 if (!V->hasOneUse())
18230 return false;
18231
18232 return true;
18233 };
18234
18235 if (HasMaskOperation(V))
18236 return true;
18237
18238 return false;
18239}
18240
18241// Forward declaration.
18242 static SDValue canonicalizeShuffleMaskWithHorizOp(
18243 MutableArrayRef<SDValue> Inputs, MutableArrayRef<int> Mask,
18244 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18245 const X86Subtarget &Subtarget);
18246
18247 /// Top-level lowering for x86 vector shuffles.
18248///
18249/// This handles decomposition, canonicalization, and lowering of all x86
18250/// vector shuffles. Most of the specific lowering strategies are encapsulated
18251/// above in helper routines. The canonicalization attempts to widen shuffles
18252/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18253/// s.t. only one of the two inputs needs to be tested, etc.
18254 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18255 SelectionDAG &DAG) {
18256 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18257 ArrayRef<int> OrigMask = SVOp->getMask();
18258 SDValue V1 = Op.getOperand(0);
18259 SDValue V2 = Op.getOperand(1);
18260 MVT VT = Op.getSimpleValueType();
18261 int NumElements = VT.getVectorNumElements();
18262 SDLoc DL(Op);
18263 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18264
18265 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18266 "Can't lower MMX shuffles");
18267
18268 bool V1IsUndef = V1.isUndef();
18269 bool V2IsUndef = V2.isUndef();
18270 if (V1IsUndef && V2IsUndef)
18271 return DAG.getUNDEF(VT);
18272
18273 // When we create a shuffle node we put the UNDEF node to second operand,
18274 // but in some cases the first operand may be transformed to UNDEF.
18275 // In this case we should just commute the node.
18276 if (V1IsUndef)
18277 return DAG.getCommutedVectorShuffle(*SVOp);
18278
18279 // Check for non-undef masks pointing at an undef vector and make the masks
18280 // undef as well. This makes it easier to match the shuffle based solely on
18281 // the mask.
18282 if (V2IsUndef &&
18283 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18284 SmallVector<int, 8> NewMask(OrigMask);
18285 for (int &M : NewMask)
18286 if (M >= NumElements)
18287 M = -1;
18288 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18289 }
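 // E.g. with an undef V2, the v4i32 mask <0,5,2,7> becomes <0,-1,2,-1>.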
18290
18291 // Check for illegal shuffle mask element index values.
18292 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18293 (void)MaskUpperLimit;
18294 assert(llvm::all_of(OrigMask,
18295 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18296 "Out of bounds shuffle index");
18297
18298 // We actually see shuffles that are entirely re-arrangements of a set of
18299 // zero inputs. This mostly happens while decomposing complex shuffles into
18300 // simple ones. Directly lower these as a buildvector of zeros.
18301 APInt KnownUndef, KnownZero;
18302 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18303
18304 APInt Zeroable = KnownUndef | KnownZero;
18305 if (Zeroable.isAllOnes())
18306 return getZeroVector(VT, Subtarget, DAG, DL);
18307
18308 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18309
18310 // Try to collapse shuffles into using a vector type with fewer elements but
18311 // wider element types. We cap this to not form integers or floating point
18312 // elements wider than 64 bits. It does not seem beneficial to form i128
18313 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18314 SmallVector<int, 16> WidenedMask;
18315 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18316 !canCombineAsMaskOperation(V1, Subtarget) &&
18317 !canCombineAsMaskOperation(V2, Subtarget) &&
18318 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18319 // Shuffle mask widening should not interfere with a broadcast opportunity
18320 // by obfuscating the operands with bitcasts.
18321 // TODO: Avoid lowering directly from this top-level function: make this
18322 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18323 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18324 Subtarget, DAG))
18325 return Broadcast;
18326
18327 MVT NewEltVT = VT.isFloatingPoint()
18328 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18329 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18330 int NewNumElts = NumElements / 2;
18331 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18332 // Make sure that the new vector type is legal. For example, v2f64 isn't
18333 // legal on SSE1.
18334 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18335 if (V2IsZero) {
18336 // Modify the new Mask to take all zeros from the all-zero vector.
18337 // Choose indices that are blend-friendly.
18338 bool UsedZeroVector = false;
18339 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18340 "V2's non-undef elements are used?!");
18341 for (int i = 0; i != NewNumElts; ++i)
18342 if (WidenedMask[i] == SM_SentinelZero) {
18343 WidenedMask[i] = i + NewNumElts;
18344 UsedZeroVector = true;
18345 }
18346 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18347 // some elements to be undef.
18348 if (UsedZeroVector)
18349 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18350 }
18351 V1 = DAG.getBitcast(NewVT, V1);
18352 V2 = DAG.getBitcast(NewVT, V2);
18353 return DAG.getBitcast(
18354 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18355 }
18356 }
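 // E.g. a v4i32 shuffle with mask <0,1,4,5> widens to a v2i64 shuffle with
 // mask <0,2>.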
18357
18358 SmallVector<SDValue> Ops = {V1, V2};
18359 SmallVector<int> Mask(OrigMask);
18360
18361 // Canonicalize the shuffle with any horizontal ops inputs.
18362 // NOTE: This may update Ops and Mask.
18363 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18364 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18365 return DAG.getBitcast(VT, HOp);
18366
18367 V1 = DAG.getBitcast(VT, Ops[0]);
18368 V2 = DAG.getBitcast(VT, Ops[1]);
18369 assert(NumElements == (int)Mask.size() &&
18370 "canonicalizeShuffleMaskWithHorizOp "
18371 "shouldn't alter the shuffle mask size");
18372
18373 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18374 // These will be materialized uniformly anyway, so make splat matching easier.
18375 // TODO: Allow all int constants?
18376 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18377 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18378 BitVector Undefs;
18379 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18380 if (Undefs.any() &&
18381 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18382 isa<ConstantFPSDNode>(Splat))) {
18383 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18384 }
18385 }
18386 }
18387 return V;
18388 };
18389 V1 = CanonicalizeConstant(V1);
18390 V2 = CanonicalizeConstant(V2);
18391
18392 // Commute the shuffle if it will improve canonicalization.
18393 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18394 ShuffleVectorSDNode::commuteMask(Mask);
18395 std::swap(V1, V2);
18396 }
18397
18398 // For each vector width, delegate to a specialized lowering routine.
18399 if (VT.is128BitVector())
18400 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18401
18402 if (VT.is256BitVector())
18403 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18404
18405 if (VT.is512BitVector())
18406 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18407
18408 if (Is1BitVector)
18409 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18410
18411 llvm_unreachable("Unimplemented!");
18412}
18413
18414// As legal vpcompress instructions depend on various AVX512 extensions, try to
18415// convert illegal vector sizes to legal ones to avoid expansion.
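// For example, a v8i32 VECTOR_COMPRESS is widened to v16i32 and its mask to
// v16i1 (new mask bits zeroed), compressed as a 512-bit vector, and the low
// v8i32 subvector is extracted again; see the 32/64-bit element path below.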
18416static SDValue LowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18417 SelectionDAG &DAG) {
18418 assert(Subtarget.hasAVX512() &&
18419 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18420
18421 SDLoc DL(Op);
18422 SDValue Vec = Op.getOperand(0);
18423 SDValue Mask = Op.getOperand(1);
18424 SDValue Passthru = Op.getOperand(2);
18425
18426 EVT VecVT = Vec.getValueType();
18427 EVT ElementVT = VecVT.getVectorElementType();
18428 unsigned NumElements = VecVT.getVectorNumElements();
18429 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18430 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18431
18432 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18433 // compressed as 512-bit vectors in AVX512F.
18434 if (NumVecBits != 128 && NumVecBits != 256)
18435 return SDValue();
18436
18437 if (NumElementBits == 32 || NumElementBits == 64) {
18438 unsigned NumLargeElements = 512 / NumElementBits;
18439 MVT LargeVecVT =
18440 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18441 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18442
18443 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18444 DAG, DL);
18445 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18446 Subtarget, DAG, DL);
18447 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18448 : widenSubVector(LargeVecVT, Passthru,
18449 /*ZeroNewElements=*/false,
18450 Subtarget, DAG, DL);
18451
18452 SDValue Compressed =
18453 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18454 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18455 DAG.getConstant(0, DL, MVT::i64));
18456 }
18457
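 // For the byte/word cases below there is no 128/256-bit VPCOMPRESSB/W without
 // AVX512VBMI2, so the elements are any-extended to a 512-bit vector with
 // legal 32/64-bit elements (e.g. v16i8 -> v16i32), compressed, and truncated
 // back to the original type.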
18458 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18459 VecVT == MVT::v16i16) {
18460 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18461 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18462
18463 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18464 Passthru = Passthru.isUndef()
18465 ? DAG.getUNDEF(LargeVecVT)
18466 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18467
18468 SDValue Compressed =
18469 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18470 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18471 }
18472
18473 return SDValue();
18474}
18475
18476/// Try to lower a VSELECT instruction to a vector shuffle.
18477static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18478 const X86Subtarget &Subtarget,
18479 SelectionDAG &DAG) {
18480 SDValue Cond = Op.getOperand(0);
18481 SDValue LHS = Op.getOperand(1);
18482 SDValue RHS = Op.getOperand(2);
18483 MVT VT = Op.getSimpleValueType();
18484
18485 // Only non-legal VSELECTs reach this lowering, convert those into generic
18486 // shuffles and re-use the shuffle lowering path for blends.
18487 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18488 SmallVector<int, 32> Mask;
18489 if (createShuffleMaskFromVSELECT(Mask, Cond))
18490 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18491 }
18492
18493 return SDValue();
18494}
18495
18496SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18497 SDValue Cond = Op.getOperand(0);
18498 SDValue LHS = Op.getOperand(1);
18499 SDValue RHS = Op.getOperand(2);
18500
18501 SDLoc dl(Op);
18502 MVT VT = Op.getSimpleValueType();
18503 if (isSoftF16(VT, Subtarget)) {
18504 MVT NVT = VT.changeVectorElementTypeToInteger();
18505 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18506 DAG.getBitcast(NVT, LHS),
18507 DAG.getBitcast(NVT, RHS)));
18508 }
18509
18510 // A vselect where all conditions and data are constants can be optimized into
18511 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18512 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18513 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18514 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18515 return SDValue();
18516
18517 // Try to lower this to a blend-style vector shuffle. This can handle all
18518 // constant condition cases.
18519 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18520 return BlendOp;
18521
18522 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18523 // with patterns on the mask registers on AVX-512.
18524 MVT CondVT = Cond.getSimpleValueType();
18525 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18526 if (CondEltSize == 1)
18527 return Op;
18528
18529 // Variable blends are only legal from SSE4.1 onward.
18530 if (!Subtarget.hasSSE41())
18531 return SDValue();
18532
18533 unsigned EltSize = VT.getScalarSizeInBits();
18534 unsigned NumElts = VT.getVectorNumElements();
18535
18536 // Expand v32i16/v64i8 without BWI.
18537 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18538 return SDValue();
18539
18540 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18541 // into an i1 condition so that we can use the mask-based 512-bit blend
18542 // instructions.
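 // For example, a v16i32 condition selecting between v16f32 operands is turned
 // into a v16i1 mask via SETNE against zero, which then matches the AVX512
 // masked blend patterns (e.g. VBLENDMPS).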
18543 if (VT.getSizeInBits() == 512) {
18544 // Build a mask by testing the condition against zero.
18545 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18546 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18547 DAG.getConstant(0, dl, CondVT),
18548 ISD::SETNE);
18549 // Now return a new VSELECT using the mask.
18550 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18551 }
18552
18553 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18554 if (CondEltSize != EltSize) {
18555 // If we don't have a sign splat, rely on the expansion.
18556 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18557 return SDValue();
18558
18559 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18560 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18561 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18562 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18563 }
18564
18565 // For v16i16/v32i8 selects without AVX2, if the condition and another
18566 // operand are free to split, then it is better to split before expanding the
18567 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18568 // TODO: This is very similar to narrowVectorSelect.
18569 // TODO: Add Load splitting to isFreeToSplitVector ?
18570 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18571 !Subtarget.hasXOP()) {
18572 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18573 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18574 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18575 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18576 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18577 if (FreeCond && (FreeLHS || FreeRHS))
18578 return splitVectorOp(Op, DAG, dl);
18579 }
18580
18581 // Only some types will be legal on some subtargets. If we can emit a legal
18582 // VSELECT-matching blend, return Op; if we need to expand, return a null
18583 // value.
18584 switch (VT.SimpleTy) {
18585 default:
18586 // Most of the vector types have blends past SSE4.1.
18587 return Op;
18588
18589 case MVT::v32i8:
18590 // The byte blends for AVX vectors were introduced only in AVX2.
18591 if (Subtarget.hasAVX2())
18592 return Op;
18593
18594 return SDValue();
18595
18596 case MVT::v8i16:
18597 case MVT::v16i16:
18598 case MVT::v8f16:
18599 case MVT::v16f16: {
18600 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18601 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18602 Cond = DAG.getBitcast(CastVT, Cond);
18603 LHS = DAG.getBitcast(CastVT, LHS);
18604 RHS = DAG.getBitcast(CastVT, RHS);
18605 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18606 return DAG.getBitcast(VT, Select);
18607 }
18608 }
18609}
18610
18611static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18612 MVT VT = Op.getSimpleValueType();
18613 SDValue Vec = Op.getOperand(0);
18614 SDValue Idx = Op.getOperand(1);
18615 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18616 SDLoc dl(Op);
18617
18618 if (!Vec.getSimpleValueType().is128BitVector())
18619 return SDValue();
18620
18621 if (VT.getSizeInBits() == 8) {
18622 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18623 // we're going to zero extend the register or fold the store.
18624 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18625 !X86::mayFoldIntoStore(Op))
18626 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18627 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18628 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18629
18630 unsigned IdxVal = Idx->getAsZExtVal();
18631 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18632 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18633 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18634 }
18635
18636 if (VT == MVT::f32) {
18637 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18638 // the result back to FR32 register. It's only worth matching if the
18639 // result has a single use which is a store or a bitcast to i32. And in
18640 // the case of a store, it's not worth it if the index is a constant 0,
18641 // because a MOVSSmr can be used instead, which is smaller and faster.
18642 if (!Op.hasOneUse())
18643 return SDValue();
18644 SDNode *User = *Op.getNode()->user_begin();
18645 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18646 (User->getOpcode() != ISD::BITCAST ||
18647 User->getValueType(0) != MVT::i32))
18648 return SDValue();
18649 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18650 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18651 return DAG.getBitcast(MVT::f32, Extract);
18652 }
18653
18654 if (VT == MVT::i32 || VT == MVT::i64)
18655 return Op;
18656
18657 return SDValue();
18658}
18659
18660/// Extract one bit from mask vector, like v16i1 or v8i1.
18661/// AVX-512 feature.
18662static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18663 const X86Subtarget &Subtarget) {
18664 SDValue Vec = Op.getOperand(0);
18665 SDLoc dl(Vec);
18666 MVT VecVT = Vec.getSimpleValueType();
18667 SDValue Idx = Op.getOperand(1);
18668 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18669 MVT EltVT = Op.getSimpleValueType();
18670
18671 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18672 "Unexpected vector type in ExtractBitFromMaskVector");
18673
18674 // A variable index can't be handled in mask registers;
18675 // extend the vector to VR512/VR128.
18676 if (!IdxC) {
18677 unsigned NumElts = VecVT.getVectorNumElements();
18678 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18679 // than extending to 128/256-bit.
18680 if (NumElts == 1) {
18681 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18682 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18683 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18684 }
18685 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18686 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18687 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18688 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18689 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18690 }
18691
18692 unsigned IdxVal = IdxC->getZExtValue();
18693 if (IdxVal == 0) // the operation is legal
18694 return Op;
18695
18696 // Extend to natively supported kshift.
18697 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18698
18699 // Use kshiftr instruction to move to the lower element.
18700 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18701 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18702
18703 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18704 DAG.getVectorIdxConstant(0, dl));
18705}
18706
18707// Helper to find all the extracted elements from a vector.
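// Conservatively returns an all-ones mask (i.e. every element may be needed)
// as soon as any user extracts a non-constant index or is not an
// extraction/bitcast the switch below understands.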
18708static APInt getExtractedDemandedElts(SDNode *N) {
18709 MVT VT = N->getSimpleValueType(0);
18710 unsigned NumElts = VT.getVectorNumElements();
18711 APInt DemandedElts = APInt::getZero(NumElts);
18712 for (SDNode *User : N->users()) {
18713 switch (User->getOpcode()) {
18714 case X86ISD::PEXTRB:
18715 case X86ISD::PEXTRW:
18716 case ISD::EXTRACT_VECTOR_ELT:
18717 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18718 DemandedElts.setAllBits();
18719 return DemandedElts;
18720 }
18721 DemandedElts.setBit(User->getConstantOperandVal(1));
18722 break;
18723 case ISD::BITCAST: {
18724 if (!User->getValueType(0).isSimple() ||
18725 !User->getValueType(0).isVector()) {
18726 DemandedElts.setAllBits();
18727 return DemandedElts;
18728 }
18729 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18730 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18731 break;
18732 }
18733 default:
18734 DemandedElts.setAllBits();
18735 return DemandedElts;
18736 }
18737 }
18738 return DemandedElts;
18739}
18740
18741SDValue
18742X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18743 SelectionDAG &DAG) const {
18744 SDLoc dl(Op);
18745 SDValue Vec = Op.getOperand(0);
18746 MVT VecVT = Vec.getSimpleValueType();
18747 SDValue Idx = Op.getOperand(1);
18748 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18749
18750 if (VecVT.getVectorElementType() == MVT::i1)
18751 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18752
18753 if (!IdxC) {
18754 // It's more profitable to go through memory (1 cycle throughput)
18755 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18756 // The IACA tool was used to get the performance estimation
18757 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18758 //
18759 // example : extractelement <16 x i8> %a, i32 %i
18760 //
18761 // Block Throughput: 3.00 Cycles
18762 // Throughput Bottleneck: Port5
18763 //
18764 // | Num Of | Ports pressure in cycles | |
18765 // | Uops | 0 - DV | 5 | 6 | 7 | |
18766 // ---------------------------------------------
18767 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18768 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18769 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18770 // Total Num Of Uops: 4
18771 //
18772 //
18773 // Block Throughput: 1.00 Cycles
18774 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18775 //
18776 // | | Ports pressure in cycles | |
18777 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18778 // ---------------------------------------------------------
18779 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18780 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18781 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18782 // Total Num Of Uops: 4
18783
18784 return SDValue();
18785 }
18786
18787 unsigned IdxVal = IdxC->getZExtValue();
18788
18789 // If this is a 256-bit vector result, first extract the 128-bit vector and
18790 // then extract the element from the 128-bit vector.
18791 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18792 // Get the 128-bit vector.
18793 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18794 MVT EltVT = VecVT.getVectorElementType();
18795
18796 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18797 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18798
18799 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18800 // this can be done with a mask.
18801 IdxVal &= ElemsPerChunk - 1;
18802 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18803 DAG.getVectorIdxConstant(IdxVal, dl));
18804 }
18805
18806 assert(VecVT.is128BitVector() && "Unexpected vector length");
18807
18808 MVT VT = Op.getSimpleValueType();
18809
18810 if (VT == MVT::i16) {
18811 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18812 // we're going to zero extend the register or fold the store (SSE41 only).
18813 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18814 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18815 if (Subtarget.hasFP16())
18816 return Op;
18817
18818 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18819 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18820 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18821 }
18822
18823 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18824 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18825 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18826 }
18827
18828 if (Subtarget.hasSSE41())
18829 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18830 return Res;
18831
18832 // Only extract a single element from a v16i8 source - determine the common
18833 // DWORD/WORD that all extractions share, and extract the sub-byte.
18834 // TODO: Add QWORD MOVQ extraction?
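 // For example, if only bytes 4 and 5 of a v16i8 source are ever extracted,
 // byte 5 can be read by extracting word 2 of the v8i16 bitcast and shifting
 // right by 8, so both extractions can share a single PEXTRW.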
18835 if (VT == MVT::i8) {
18836 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18837 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18838
18839 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18840 int DWordIdx = IdxVal / 4;
18841 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18842 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18843 DAG.getBitcast(MVT::v4i32, Vec),
18844 DAG.getVectorIdxConstant(DWordIdx, dl));
18845 int ShiftVal = (IdxVal % 4) * 8;
18846 if (ShiftVal != 0)
18847 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18848 DAG.getConstant(ShiftVal, dl, MVT::i8));
18849 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18850 }
18851
18852 int WordIdx = IdxVal / 2;
18853 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18854 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18855 DAG.getBitcast(MVT::v8i16, Vec),
18856 DAG.getVectorIdxConstant(WordIdx, dl));
18857 int ShiftVal = (IdxVal % 2) * 8;
18858 if (ShiftVal != 0)
18859 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18860 DAG.getConstant(ShiftVal, dl, MVT::i8));
18861 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18862 }
18863 }
18864
18865 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18866 if (IdxVal == 0)
18867 return Op;
18868
18869 // Shuffle the element to the lowest element, then movss or movsh.
18870 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18871 Mask[0] = static_cast<int>(IdxVal);
18872 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18873 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18874 DAG.getVectorIdxConstant(0, dl));
18875 }
18876
18877 if (VT.getSizeInBits() == 64) {
18878 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18879 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18880 // to match extract_elt for f64.
18881 if (IdxVal == 0)
18882 return Op;
18883
18884 // UNPCKHPD the element to the lowest double word, then movsd.
18885 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18886 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18887 int Mask[2] = { 1, -1 };
18888 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18889 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18890 DAG.getVectorIdxConstant(0, dl));
18891 }
18892
18893 return SDValue();
18894}
18895
18896/// Insert one bit to mask vector, like v16i1 or v8i1.
18897/// AVX-512 feature.
18898static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18899 const X86Subtarget &Subtarget) {
18900 SDLoc dl(Op);
18901 SDValue Vec = Op.getOperand(0);
18902 SDValue Elt = Op.getOperand(1);
18903 SDValue Idx = Op.getOperand(2);
18904 MVT VecVT = Vec.getSimpleValueType();
18905
18906 if (!isa<ConstantSDNode>(Idx)) {
18907 // Non-constant index. Extend source and destination,
18908 // insert element and then truncate the result.
18909 unsigned NumElts = VecVT.getVectorNumElements();
18910 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18911 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18912 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18913 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18914 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18915 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18916 }
18917
18918 // Copy into a k-register, extract to v1i1 and insert_subvector.
18919 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18920 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18921}
18922
18923SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18924 SelectionDAG &DAG) const {
18925 MVT VT = Op.getSimpleValueType();
18926 MVT EltVT = VT.getVectorElementType();
18927 unsigned NumElts = VT.getVectorNumElements();
18928 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18929
18930 if (EltVT == MVT::i1)
18931 return InsertBitToMaskVector(Op, DAG, Subtarget);
18932
18933 SDLoc dl(Op);
18934 SDValue N0 = Op.getOperand(0);
18935 SDValue N1 = Op.getOperand(1);
18936 SDValue N2 = Op.getOperand(2);
18937 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18938
18939 if (EltVT == MVT::bf16) {
18940 MVT IVT = VT.changeVectorElementTypeToInteger();
18941 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18942 DAG.getBitcast(IVT, N0),
18943 DAG.getBitcast(MVT::i16, N1), N2);
18944 return DAG.getBitcast(VT, Res);
18945 }
18946
18947 if (!N2C) {
18948 // For variable insertion indices, we're usually better off spilling to stack,
18949 // but AVX512 can use a variable compare+select by comparing against all
18950 // possible vector indices, and FP insertion has less gpr->simd traffic.
18951 if (!(Subtarget.hasBWI() ||
18952 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18953 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18954 return SDValue();
18955
18956 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18957 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18958 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18959 return SDValue();
18960
18961 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18962 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18963 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18964
18965 SmallVector<SDValue, 16> RawIndices;
18966 for (unsigned I = 0; I != NumElts; ++I)
18967 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18968 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18969
18970 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18971 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18972 ISD::CondCode::SETEQ);
18973 }
18974
18975 if (N2C->getAPIntValue().uge(NumElts))
18976 return SDValue();
18977 uint64_t IdxVal = N2C->getZExtValue();
18978
18979 bool IsZeroElt = X86::isZeroNode(N1);
18980 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18981
18982 if (IsZeroElt || IsAllOnesElt) {
18983 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18984 // We don't deal with i8 0 since it appears to be handled elsewhere.
18985 if (IsAllOnesElt &&
18986 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18987 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18988 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18989 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18990 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18991 CstVectorElts[IdxVal] = OnesCst;
18992 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18993 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18994 }
18995 // See if we can do this more efficiently with a blend shuffle with a
18996 // rematerializable vector.
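 // For example, with SSE4.1, inserting -1 into element 3 of a v8i16 becomes a
 // blend shuffle with an all-ones vector (cheap to rematerialize via PCMPEQ)
 // rather than a GPR-to-vector insertion.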
18997 if (Subtarget.hasSSE41() &&
18998 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18999 SmallVector<int, 8> BlendMask;
19000 for (unsigned i = 0; i != NumElts; ++i)
19001 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19002 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19003 : getOnesVector(VT, DAG, dl);
19004 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19005 }
19006 }
19007
19008 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19009 // into that, and then insert the subvector back into the result.
19010 if (VT.is256BitVector() || VT.is512BitVector()) {
19011 // With a 256-bit vector, we can insert into the zero element efficiently
19012 // using a blend if we have AVX or AVX2 and the right data type.
19013 if (VT.is256BitVector() && IdxVal == 0) {
19014 // TODO: It is worthwhile to cast integer to floating point and back
19015 // and incur a domain crossing penalty if that's what we'll end up
19016 // doing anyway after extracting to a 128-bit vector.
19017 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19018 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19019 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19020 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19021 DAG.getTargetConstant(1, dl, MVT::i8));
19022 }
19023 }
19024
19025 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19026 assert(isPowerOf2_32(NumEltsIn128) &&
19027 "Vectors will always have power-of-two number of elements.");
19028
19029 // If we are not inserting into the low 128-bit vector chunk,
19030 // then prefer the broadcast+blend sequence.
19031 // FIXME: relax the profitability check iff all N1 uses are insertions.
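 // For example, inserting a scalar into element 9 of a v16i16 on AVX2 can be
 // done as a VPBROADCASTW of the scalar followed by a blend, which avoids
 // extracting and reinserting the upper 128-bit half.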
19032 if (IdxVal >= NumEltsIn128 &&
19033 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19034 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19035 X86::mayFoldLoad(N1, Subtarget)))) {
19036 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19037 SmallVector<int, 8> BlendMask;
19038 for (unsigned i = 0; i != NumElts; ++i)
19039 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19040 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19041 }
19042
19043 // Get the desired 128-bit vector chunk.
19044 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19045
19046 // Insert the element into the desired chunk.
19047 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19048 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19049
19050 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19051 DAG.getVectorIdxConstant(IdxIn128, dl));
19052
19053 // Insert the changed part back into the bigger vector
19054 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19055 }
19056 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19057
19058 // This will be just movw/movd/movq/movsh/movss/movsd.
19059 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19060 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19061 EltVT == MVT::f16 || EltVT == MVT::i64) {
19062 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19063 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19064 }
19065
19066 // We can't directly insert an i8 or i16 into a vector, so zero extend
19067 // it to i32 first.
19068 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19069 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19070 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19071 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19072 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19073 return DAG.getBitcast(VT, N1);
19074 }
19075 }
19076
19077 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
19078 // argument. SSE41 required for pinsrb.
19079 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19080 unsigned Opc;
19081 if (VT == MVT::v8i16) {
19082 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19083 Opc = X86ISD::PINSRW;
19084 } else {
19085 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19086 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19087 Opc = X86ISD::PINSRB;
19088 }
19089
19090 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19091 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19092 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19093 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19094 }
19095
19096 if (Subtarget.hasSSE41()) {
19097 if (EltVT == MVT::f32) {
19098 // Bits [7:6] of the constant are the source select. This will always be
19099 // zero here. The DAG Combiner may combine an extract_elt index into
19100 // these bits. For example (insert (extract, 3), 2) could be matched by
19101 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19102 // Bits [5:4] of the constant are the destination select. This is the
19103 // value of the incoming immediate.
19104 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19105 // combine either bitwise AND or insert of float 0.0 to set these bits.
19106
19107 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19108 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19109 // If this is an insertion of 32-bits into the low 32-bits of
19110 // a vector, we prefer to generate a blend with immediate rather
19111 // than an insertps. Blends are simpler operations in hardware and so
19112 // will always have equal or better performance than insertps.
19113 // But if optimizing for size and there's a load folding opportunity,
19114 // generate insertps because blendps does not have a 32-bit memory
19115 // operand form.
19116 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19117 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19118 DAG.getTargetConstant(1, dl, MVT::i8));
19119 }
19120 // Create this as a scalar to vector.
19121 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19122 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19123 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19124 }
19125
19126 // PINSR* works with constant index.
19127 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19128 return Op;
19129 }
19130
19131 return SDValue();
19132}
19133
19134static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19135 SelectionDAG &DAG) {
19136 SDLoc dl(Op);
19137 MVT OpVT = Op.getSimpleValueType();
19138
19139 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19140 // further combines.
19141 if (X86::isZeroNode(Op.getOperand(0)))
19142 return getZeroVector(OpVT, Subtarget, DAG, dl);
19143
19144 // If this is a 256-bit vector result, first insert into a 128-bit
19145 // vector and then insert into the 256-bit vector.
19146 if (!OpVT.is128BitVector()) {
19147 // Insert into a 128-bit vector.
19148 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19149 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19150 OpVT.getVectorNumElements() / SizeFactor);
19151
19152 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19153
19154 // Insert the 128-bit vector.
19155 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19156 }
19157 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19158 "Expected an SSE type!");
19159
19160 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19161 // tblgen.
19162 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19163 return Op;
19164
19165 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19166 return DAG.getBitcast(
19167 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19168}
19169
19170// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19171// simple superregister reference or explicit instructions to insert
19172// the upper bits of a vector.
19173static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19174 SelectionDAG &DAG) {
19175 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19176
19177 return insert1BitVector(Op, DAG, Subtarget);
19178}
19179
19180static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19181 SelectionDAG &DAG) {
19182 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19183 "Only vXi1 extract_subvectors need custom lowering");
19184
19185 SDLoc dl(Op);
19186 SDValue Vec = Op.getOperand(0);
19187 uint64_t IdxVal = Op.getConstantOperandVal(1);
19188
19189 if (IdxVal == 0) // the operation is legal
19190 return Op;
19191
19192 // Extend to natively supported kshift.
19193 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19194
19195 // Shift to the LSB.
19196 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19197 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19198
19199 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19200 DAG.getVectorIdxConstant(0, dl));
19201}
19202
19203// Returns the appropriate wrapper opcode for a global reference.
19204unsigned X86TargetLowering::getGlobalWrapperKind(
19205 const GlobalValue *GV, const unsigned char OpFlags) const {
19206 // References to absolute symbols are never PC-relative.
19207 if (GV && GV->isAbsoluteSymbolRef())
19208 return X86ISD::Wrapper;
19209
19210 // The following OpFlags under RIP-rel PIC use RIP.
19211 if (Subtarget.isPICStyleRIPRel() &&
19212 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19213 OpFlags == X86II::MO_DLLIMPORT))
19214 return X86ISD::WrapperRIP;
19215
19216 // GOTPCREL references must always use RIP.
19217 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19218 return X86ISD::WrapperRIP;
19219
19220 return X86ISD::Wrapper;
19221}
19222
19223// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19224// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19225 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19226 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19227 // be used to form an addressing mode. These wrapped nodes will be selected
19228// into MOV32ri.
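// For example, in static (non-PIC) 32-bit code a constant-pool reference
// becomes (Wrapper (TargetConstantPool ...)), which selects to a MOV32ri of
// the pool entry's address; in PIC mode an extra ADD of the GlobalBaseReg is
// emitted below.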
19229SDValue
19230X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19231 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19232
19233 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19234 // global base reg.
19235 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19236
19237 auto PtrVT = getPointerTy(DAG.getDataLayout());
19238 SDValue Result = DAG.getTargetConstantPool(
19239 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19240 SDLoc DL(CP);
19241 Result =
19242 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19243 // With PIC, the address is actually $g + Offset.
19244 if (OpFlag) {
19245 Result =
19246 DAG.getNode(ISD::ADD, DL, PtrVT,
19247 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19248 }
19249
19250 return Result;
19251}
19252
19253SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19254 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19255
19256 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19257 // global base reg.
19258 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19259
19260 EVT PtrVT = Op.getValueType();
19261 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19262 SDLoc DL(JT);
19263 Result =
19264 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19265
19266 // With PIC, the address is actually $g + Offset.
19267 if (OpFlag)
19268 Result =
19269 DAG.getNode(ISD::ADD, DL, PtrVT,
19270 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19271
19272 return Result;
19273}
19274
19275SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19276 SelectionDAG &DAG) const {
19277 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19278}
19279
19280SDValue
19281X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19282 // Create the TargetBlockAddressAddress node.
19283 unsigned char OpFlags =
19284 Subtarget.classifyBlockAddressReference();
19285 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19286 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19287 SDLoc dl(Op);
19288 EVT PtrVT = Op.getValueType();
19289 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19290 Result =
19291 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19292
19293 // With PIC, the address is actually $g + Offset.
19294 if (isGlobalRelativeToPICBase(OpFlags)) {
19295 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19296 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19297 }
19298
19299 return Result;
19300}
19301
19302/// Creates target global address or external symbol nodes for calls or
19303/// other uses.
19304SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19305 bool ForCall,
19306 bool *IsImpCall) const {
19307 // Unpack the global address or external symbol.
19308 SDLoc dl(Op);
19309 const GlobalValue *GV = nullptr;
19310 int64_t Offset = 0;
19311 const char *ExternalSym = nullptr;
19312 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19313 GV = G->getGlobal();
19314 Offset = G->getOffset();
19315 } else {
19316 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19317 ExternalSym = ES->getSymbol();
19318 }
19319
19320 // Calculate some flags for address lowering.
19321 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19322 unsigned char OpFlags;
19323 if (ForCall)
19324 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19325 else
19326 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19327 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19328 bool NeedsLoad = isGlobalStubReference(OpFlags);
19329
19330 CodeModel::Model M = DAG.getTarget().getCodeModel();
19331 EVT PtrVT = Op.getValueType();
19332 SDValue Result;
19333
19334 if (GV) {
19335 // Create a target global address if this is a global. If possible, fold the
19336 // offset into the global address reference. Otherwise, ADD it on later.
19337 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19338 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19339 // relocation will compute to a negative value, which is invalid.
19340 int64_t GlobalOffset = 0;
19341 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19342 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19343 std::swap(GlobalOffset, Offset);
19344 }
19345 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19346 } else {
19347 // If this is not a global address, this must be an external symbol.
19348 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19349 }
19350
19351 // If this is a direct call, avoid the wrapper if we don't need to do any
19352 // loads or adds. This allows SDAG ISel to match direct calls.
19353 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19354 return Result;
19355
19356 // If Import Call Optimization is enabled and this is an imported function
19357 // then make a note of it and return the global address without wrapping.
19358 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19359 Mod.getModuleFlag("import-call-optimization")) {
19360 assert(ForCall && "Should only enable import call optimization if we are "
19361 "lowering a call");
19362 *IsImpCall = true;
19363 return Result;
19364 }
19365
19366 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19367
19368 // With PIC, the address is actually $g + Offset.
19369 if (HasPICReg) {
19370 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19371 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19372 }
19373
19374 // For globals that require a load from a stub to get the address, emit the
19375 // load.
19376 if (NeedsLoad)
19377 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19378 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19379
19380 // If there was a non-zero offset that we didn't fold, create an explicit
19381 // addition for it.
19382 if (Offset != 0)
19383 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19384 DAG.getSignedConstant(Offset, dl, PtrVT));
19385
19386 return Result;
19387}
19388
19389SDValue
19390X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19391 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19392}
19393
19394static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19395 const EVT PtrVT, unsigned ReturnReg,
19396 unsigned char OperandFlags,
19397 bool LoadGlobalBaseReg = false,
19398 bool LocalDynamic = false) {
19399 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19400 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19401 SDLoc dl(GA);
19402 SDValue TGA;
19403 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19404 SDValue Chain = DAG.getEntryNode();
19405 SDValue Ret;
19406 if (LocalDynamic && UseTLSDESC) {
19407 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19408 // Reuse existing GetTLSADDR node if we can find it.
19409 if (TGA->hasOneUse()) {
19410 // TLSDESC uses TGA.
19411 SDNode *TLSDescOp = *TGA->user_begin();
19412 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19413 "Unexpected TLSDESC DAG");
19414 // CALLSEQ_END uses TGA via a chain and glue.
19415 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19416 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19417 "Unexpected TLSDESC DAG");
19418 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19419 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19420 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19421 "Unexpected TLSDESC DAG");
19422 Ret = SDValue(CopyFromRegOp, 0);
19423 }
19424 } else {
19425 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19426 GA->getOffset(), OperandFlags);
19427 }
19428
19429 if (!Ret) {
19430 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19431 : LocalDynamic ? X86ISD::TLSBASEADDR
19432 : X86ISD::TLSADDR;
19433
19434 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19435 if (LoadGlobalBaseReg) {
19436 SDValue InGlue;
19437 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19438 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19439 InGlue);
19440 InGlue = Chain.getValue(1);
19441 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19442 } else {
19443 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19444 }
19445 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19446
19447 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19448 MFI.setHasCalls(true);
19449
19450 SDValue Glue = Chain.getValue(1);
19451 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19452 }
19453
19454 if (!UseTLSDESC)
19455 return Ret;
19456
19457 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19458 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19459
19460 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19461 SDValue Offset =
19462 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19463 MachinePointerInfo(Ptr));
19464 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19465}
19466
19467// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
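// The resulting node sequence is emitted roughly as:
//   leal x@TLSGD(,%ebx,1), %eax
//   calll ___tls_get_addr@PLT
// with the variable's address returned in %eax.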
19468static SDValue
19469LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19470 const EVT PtrVT) {
19471 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19472 /*LoadGlobalBaseReg=*/true);
19473}
19474
19475// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19476static SDValue
19477LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19478 const EVT PtrVT) {
19479 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19480}
19481
19482// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19483static SDValue
19484LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19485 const EVT PtrVT) {
19486 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19487}
19488
19489static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19490 SelectionDAG &DAG, const EVT PtrVT,
19491 bool Is64Bit, bool Is64BitLP64) {
19492 SDLoc dl(GA);
19493
19494 // Get the start address of the TLS block for this module.
19495 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19496 .getInfo<X86MachineFunctionInfo>();
19497 MFI->incNumLocalDynamicTLSAccesses();
19498
19499 SDValue Base;
19500 if (Is64Bit) {
19501 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19502 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19503 /*LoadGlobalBaseReg=*/false,
19504 /*LocalDynamic=*/true);
19505 } else {
19506 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19507 /*LoadGlobalBaseReg=*/true,
19508 /*LocalDynamic=*/true);
19509 }
19510
19511 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19512 // of Base.
19513
19514 // Build x@dtpoff.
19515 unsigned char OperandFlags = X86II::MO_DTPOFF;
19516 unsigned WrapperKind = X86ISD::Wrapper;
19517 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19518 GA->getValueType(0),
19519 GA->getOffset(), OperandFlags);
19520 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19521
19522 // Add x@dtpoff with the base.
19523 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19524}
19525
19526// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19527static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19528 const EVT PtrVT, TLSModel::Model model,
19529 bool is64Bit, bool isPIC) {
19530 SDLoc dl(GA);
19531
19532 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19533 Value *Ptr = Constant::getNullValue(
19534 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19535
19536 SDValue ThreadPointer =
19537 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19538 MachinePointerInfo(Ptr));
19539
19540 unsigned char OperandFlags = 0;
19541 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19542 // initialexec.
19543 unsigned WrapperKind = X86ISD::Wrapper;
19544 if (model == TLSModel::LocalExec) {
19545 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19546 } else if (model == TLSModel::InitialExec) {
19547 if (is64Bit) {
19548 OperandFlags = X86II::MO_GOTTPOFF;
19549 WrapperKind = X86ISD::WrapperRIP;
19550 } else {
19551 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19552 }
19553 } else {
19554 llvm_unreachable("Unexpected model");
19555 }
19556
19557 // emit "addl x@ntpoff,%eax" (local exec)
19558 // or "addl x@indntpoff,%eax" (initial exec)
19559 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19560 SDValue TGA =
19561 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19562 GA->getOffset(), OperandFlags);
19563 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19564
19565 if (model == TLSModel::InitialExec) {
19566 if (isPIC && !is64Bit) {
19567 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19568 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19569 Offset);
19570 }
19571
19572 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19573 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19574 }
19575
19576 // The address of the thread local variable is the add of the thread
19577 // pointer with the offset of the variable.
19578 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19579}
19580
19581SDValue
19582X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19583
19584 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19585
19586 if (DAG.getTarget().useEmulatedTLS())
19587 return LowerToTLSEmulatedModel(GA, DAG);
19588
19589 const GlobalValue *GV = GA->getGlobal();
19590 EVT PtrVT = Op.getValueType();
19591 bool PositionIndependent = isPositionIndependent();
19592
19593 if (Subtarget.isTargetELF()) {
19594 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19595 switch (model) {
19596 case TLSModel::GeneralDynamic:
19597 if (Subtarget.is64Bit()) {
19598 if (Subtarget.isTarget64BitLP64())
19599 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19600 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19601 }
19602 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19603 case TLSModel::LocalDynamic:
19604 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19605 Subtarget.isTarget64BitLP64());
19606 case TLSModel::InitialExec:
19607 case TLSModel::LocalExec:
19608 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19609 PositionIndependent);
19610 }
19611 llvm_unreachable("Unknown TLS model.");
19612 }
19613
19614 if (Subtarget.isTargetDarwin()) {
19615 // Darwin only has one model of TLS. Lower to that.
19616 unsigned char OpFlag = 0;
19617 unsigned WrapperKind = 0;
19618
19619 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19620 // global base reg.
19621 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19622 if (PIC32) {
19623 OpFlag = X86II::MO_TLVP_PIC_BASE;
19624 WrapperKind = X86ISD::Wrapper;
19625 } else {
19626 OpFlag = X86II::MO_TLVP;
19627 WrapperKind = X86ISD::WrapperRIP;
19628 }
19629 SDLoc DL(Op);
19630 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19631 GA->getValueType(0),
19632 GA->getOffset(), OpFlag);
19633 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19634
19635 // With PIC32, the address is actually $g + Offset.
19636 if (PIC32)
19637 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19638 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19639 Offset);
19640
19641 // Lowering the machine isd will make sure everything is in the right
19642 // location.
19643 SDValue Chain = DAG.getEntryNode();
19644 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19645 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19646 SDValue Args[] = { Chain, Offset };
19647 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19648 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19649
19650 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19651 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19652 MFI.setAdjustsStack(true);
19653
19654 // And our return value (tls address) is in the standard call return value
19655 // location.
19656 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19657 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19658 }
19659
19660 if (Subtarget.isOSWindows()) {
19661 // Just use the implicit TLS architecture
19662 // Need to generate something similar to:
19663 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19664 // ; from TEB
19665 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19666 // mov rcx, qword [rdx+rcx*8]
19667 // mov eax, .tls$:tlsvar
19668 // [rax+rcx] contains the address
19669 // Windows 64bit: gs:0x58
19670 // Windows 32bit: fs:__tls_array
19671
19672 SDLoc dl(GA);
19673 SDValue Chain = DAG.getEntryNode();
19674
19675 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19676 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19677 // use its literal value of 0x2C.
19678 Value *Ptr = Constant::getNullValue(
19679 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19680 : PointerType::get(*DAG.getContext(), X86AS::FS));
19681
19682 SDValue TlsArray = Subtarget.is64Bit()
19683 ? DAG.getIntPtrConstant(0x58, dl)
19684 : (Subtarget.isTargetWindowsGNU()
19685 ? DAG.getIntPtrConstant(0x2C, dl)
19686 : DAG.getExternalSymbol("_tls_array", PtrVT));
19687
19688 SDValue ThreadPointer =
19689 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19690
19691 SDValue res;
19692 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19693 res = ThreadPointer;
19694 } else {
19695 // Load the _tls_index variable
19696 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19697 if (Subtarget.is64Bit())
19698 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19699 MachinePointerInfo(), MVT::i32);
19700 else
19701 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19702
19703 const DataLayout &DL = DAG.getDataLayout();
19704 SDValue Scale =
19705 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19706 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19707
19708 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19709 }
19710
19711 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19712
19713 // Get the offset of start of .tls section
19714 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19715 GA->getValueType(0),
19716 X86II::MO_SECREL);
19717 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19718
19719 // The address of the thread local variable is the add of the thread
19720 // pointer with the offset of the variable.
19721 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19722 }
19723
19724 llvm_unreachable("TLS not implemented for this target.");
19725}
19726
19727bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19728 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19729 const TargetMachine &TM = getTargetMachine();
19730 TLSModel::Model Model = TM.getTLSModel(&GV);
19731 switch (Model) {
19732 case TLSModel::LocalExec:
19733 case TLSModel::InitialExec:
19734 // We can include the %fs segment register in addressing modes.
19735 return true;
19736 case TLSModel::GeneralDynamic:
19737 case TLSModel::LocalDynamic:
19738 // These models do not result in %fs relative addresses unless
19739 // TLS descriptors are used.
19740 //
19741 // Even in the case of TLS descriptors we currently have no way to model
19742 // the difference between the %fs access and the computation needed for the
19743 // offset, and returning `true` for TLS-desc currently duplicates both,
19744 // which is detrimental :-/
19745 return false;
19746 }
19747 }
19748 return false;
19749}
19750
19751/// Lower SRA_PARTS and friends, which return two i32 values
19752/// and take a 2 x i32 value to shift plus a shift amount.
19753/// TODO: Can this be moved to general expansion code?
19754static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19755 SDValue Lo, Hi;
19756 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19757 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19758}
19759
19760// Try to use a packed vector operation to handle i64 on 32-bit targets when
19761// AVX512DQ is enabled.
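// For example, (f64 (sint_to_fp i64 X)) on a 32-bit target with AVX512DQ is
// built roughly as: insert X into a v8i64 (v4i64 with VLX), convert with the
// packed VCVTQQ2PD pattern, and extract element 0 of the result.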
19762static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19763 SelectionDAG &DAG,
19764 const X86Subtarget &Subtarget) {
19765 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19766 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19767 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19768 Op.getOpcode() == ISD::UINT_TO_FP) &&
19769 "Unexpected opcode!");
19770 bool IsStrict = Op->isStrictFPOpcode();
19771 unsigned OpNo = IsStrict ? 1 : 0;
19772 SDValue Src = Op.getOperand(OpNo);
19773 MVT SrcVT = Src.getSimpleValueType();
19774 MVT VT = Op.getSimpleValueType();
19775
19776 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19777 (VT != MVT::f32 && VT != MVT::f64))
19778 return SDValue();
19779
19780 // Pack the i64 into a vector, do the operation and extract.
19781
19782 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
19783 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19784 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19785 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19786
19787 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19788 if (IsStrict) {
19789 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19790 {Op.getOperand(0), InVec});
19791 SDValue Chain = CvtVec.getValue(1);
19792 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19793 DAG.getVectorIdxConstant(0, dl));
19794 return DAG.getMergeValues({Value, Chain}, dl);
19795 }
19796
19797 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19798
19799 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19800 DAG.getVectorIdxConstant(0, dl));
19801}
19802
19803// Try to use a packed vector operation to handle i64 on 32-bit targets.
19804static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19805 const X86Subtarget &Subtarget) {
19806 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19807 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19808 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19809 Op.getOpcode() == ISD::UINT_TO_FP) &&
19810 "Unexpected opcode!");
19811 bool IsStrict = Op->isStrictFPOpcode();
19812 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19813 MVT SrcVT = Src.getSimpleValueType();
19814 MVT VT = Op.getSimpleValueType();
19815
19816 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19817 return SDValue();
19818
19819 // Pack the i64 into a vector, do the operation and extract.
19820
19821 assert(Subtarget.hasFP16() && "Expected FP16");
19822
19823 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19824 if (IsStrict) {
19825 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19826 {Op.getOperand(0), InVec});
19827 SDValue Chain = CvtVec.getValue(1);
19828 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19829 DAG.getVectorIdxConstant(0, dl));
19830 return DAG.getMergeValues({Value, Chain}, dl);
19831 }
19832
19833 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19834
19835 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19836 DAG.getVectorIdxConstant(0, dl));
19837}
19838
19839static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19840 const X86Subtarget &Subtarget) {
19841 switch (Opcode) {
19842 case ISD::SINT_TO_FP:
19843 // TODO: Handle wider types with AVX/AVX512.
19844 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19845 return false;
19846 // CVTDQ2PS or (V)CVTDQ2PD
19847 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19848
19849 case ISD::UINT_TO_FP:
19850 // TODO: Handle wider types and i64 elements.
19851 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19852 return false;
19853 // VCVTUDQ2PS or VCVTUDQ2PD
19854 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19855
19856 default:
19857 return false;
19858 }
19859}
19860
19861/// Given a scalar cast operation that is extracted from a vector, try to
19862/// vectorize the cast op followed by extraction. This will avoid an expensive
19863/// round-trip between XMM and GPR.
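/// For example, (f32 (sint_to_fp (extractelement v4i32 V, 0))) can become
/// (extractelement (v4f32 (sint_to_fp V)), 0), i.e. a single CVTDQ2PS that
/// keeps the value in an XMM register instead of bouncing through a GPR.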
19864static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19865 SelectionDAG &DAG,
19866 const X86Subtarget &Subtarget) {
19867 // TODO: This could be enhanced to handle smaller integer types by peeking
19868 // through an extend.
19869 SDValue Extract = Cast.getOperand(0);
19870 MVT DestVT = Cast.getSimpleValueType();
19871 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19872 !isa<ConstantSDNode>(Extract.getOperand(1)))
19873 return SDValue();
19874
19875 // See if we have a 128-bit vector cast op for this type of cast.
19876 SDValue VecOp = Extract.getOperand(0);
19877 MVT FromVT = VecOp.getSimpleValueType();
19878 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19879 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19880 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19881 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19882 return SDValue();
19883
19884 // If we are extracting from a non-zero element, first shuffle the source
19885 // vector to allow extracting from element zero.
19886 if (!isNullConstant(Extract.getOperand(1))) {
19887 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19888 Mask[0] = Extract.getConstantOperandVal(1);
19889 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19890 }
19891 // If the source vector is wider than 128-bits, extract the low part. Do not
19892 // create an unnecessarily wide vector cast op.
19893 if (FromVT != Vec128VT)
19894 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19895
19896 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19897 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19898 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19899 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19900 DAG.getVectorIdxConstant(0, DL));
19901}
19902
19903/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19904/// try to vectorize the cast ops. This will avoid an expensive round-trip
19905/// between XMM and GPR.
19906static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19907 SelectionDAG &DAG,
19908 const X86Subtarget &Subtarget) {
19909 // TODO: Allow FP_TO_UINT.
19910 SDValue CastToInt = CastToFP.getOperand(0);
19911 MVT VT = CastToFP.getSimpleValueType();
19912 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19913 return SDValue();
19914
19915 MVT IntVT = CastToInt.getSimpleValueType();
19916 SDValue X = CastToInt.getOperand(0);
19917 MVT SrcVT = X.getSimpleValueType();
19918 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19919 return SDValue();
19920
19921 // See if we have 128-bit vector cast instructions for this type of cast.
19922 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19923 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19924 IntVT != MVT::i32)
19925 return SDValue();
19926
19927 unsigned SrcSize = SrcVT.getSizeInBits();
19928 unsigned IntSize = IntVT.getSizeInBits();
19929 unsigned VTSize = VT.getSizeInBits();
19930 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19931 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19932 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19933
19934 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19935 unsigned ToIntOpcode =
19936 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19937 unsigned ToFPOpcode =
19938 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19939
19940 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19941 //
19942 // We are not defining the high elements (for example, by zeroing them) because
19943 // that could nullify any performance advantage that we hoped to gain from
19944 // this vector op hack. We do not expect any adverse effects (like denorm
19945 // penalties) with cast ops.
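// Illustrative f32 case: for X = 7.9f this builds
//   v4f32 VecX       = <7.9f, undef, undef, undef>
//   v4i32 VCastToInt = cvttps2dq VecX       --> <7, ?, ?, ?>
//   v4f32 VCastToFP  = cvtdq2ps VCastToInt  --> <7.0f, ?, ?, ?>
// and extracts lane 0, staying in the XMM domain throughout.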
19946 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19947 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19948 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19949 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19950 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19951}
19952
19953static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19954 SelectionDAG &DAG,
19955 const X86Subtarget &Subtarget) {
19956 bool IsStrict = Op->isStrictFPOpcode();
19957 MVT VT = Op->getSimpleValueType(0);
19958 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19959
19960 if (Subtarget.hasDQI()) {
19961 assert(!Subtarget.hasVLX() && "Unexpected features");
19962
19963 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19964 Src.getSimpleValueType() == MVT::v4i64) &&
19965 "Unsupported custom type");
19966
19967 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19968 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19969 "Unexpected VT!");
19970 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19971
19972 // Need to concat with zero vector for strict fp to avoid spurious
19973 // exceptions.
19974 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19975 : DAG.getUNDEF(MVT::v8i64);
19976 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19977 DAG.getVectorIdxConstant(0, DL));
19978 SDValue Res, Chain;
19979 if (IsStrict) {
19980 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19981 {Op->getOperand(0), Src});
19982 Chain = Res.getValue(1);
19983 } else {
19984 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19985 }
19986
19987 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19988 DAG.getVectorIdxConstant(0, DL));
19989
19990 if (IsStrict)
19991 return DAG.getMergeValues({Res, Chain}, DL);
19992 return Res;
19993 }
19994
19995 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19996 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19997 if (VT != MVT::v4f32 || IsSigned)
19998 return SDValue();
19999
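// Unsigned v4i64 -> v4f32 without native support: lanes with the sign bit set
// have no equal signed value, so convert a halved value and double it.
// (Src >> 1) | (Src & 1) keeps the rounding-relevant low bit while halving;
// each lane is converted as signed i64, and the doubled (fadd) result is
// selected only for the negative-looking lanes.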
20000 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20001 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20002 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20003 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20004 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20005 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20006 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20007 SmallVector<SDValue, 4> SignCvts(4);
20008 SmallVector<SDValue, 4> Chains(4);
20009 for (int i = 0; i != 4; ++i) {
20010 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20011 DAG.getVectorIdxConstant(i, DL));
20012 if (IsStrict) {
20013 SignCvts[i] =
20014 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20015 {Op.getOperand(0), Elt});
20016 Chains[i] = SignCvts[i].getValue(1);
20017 } else {
20018 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20019 }
20020 }
20021 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20022
20023 SDValue Slow, Chain;
20024 if (IsStrict) {
20025 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20026 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20027 {Chain, SignCvt, SignCvt});
20028 Chain = Slow.getValue(1);
20029 } else {
20030 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20031 }
20032
20033 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20034 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20035
20036 if (IsStrict)
20037 return DAG.getMergeValues({Cvt, Chain}, DL);
20038
20039 return Cvt;
20040}
20041
20042static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20043 SelectionDAG &DAG) {
20044 bool IsStrict = Op->isStrictFPOpcode();
20045 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20046 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20047 MVT VT = Op.getSimpleValueType();
20048 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20049
20050 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20051 if (IsStrict)
20052 return DAG.getNode(
20053 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20054 {Chain,
20055 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20056 Rnd});
20057 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20058 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20059}
20060
20061static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20062 const X86Subtarget &Subtarget) {
20063 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20064 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20065 return true;
20066 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20067 return true;
20068 }
20069 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20070 return true;
20071 if (Subtarget.useAVX512Regs()) {
20072 if (VT == MVT::v16i32)
20073 return true;
20074 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20075 return true;
20076 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20077 return true;
20078 }
20079 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20080 (VT == MVT::v2i64 || VT == MVT::v4i64))
20081 return true;
20082 return false;
20083}
20084
20085SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20086 SelectionDAG &DAG) const {
20087 bool IsStrict = Op->isStrictFPOpcode();
20088 unsigned OpNo = IsStrict ? 1 : 0;
20089 SDValue Src = Op.getOperand(OpNo);
20090 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20091 MVT SrcVT = Src.getSimpleValueType();
20092 MVT VT = Op.getSimpleValueType();
20093 SDLoc dl(Op);
20094
20095 if (isSoftF16(VT, Subtarget))
20096 return promoteXINT_TO_FP(Op, dl, DAG);
20097 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20098 return Op;
20099
20100 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20101 return LowerWin64_INT128_TO_FP(Op, DAG);
20102
20103 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20104 return Extract;
20105
20106 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20107 return R;
20108
20109 if (SrcVT.isVector()) {
20110 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20111 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20112 // source for strict FP.
20113 if (IsStrict)
20114 return DAG.getNode(
20115 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20116 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20117 DAG.getUNDEF(SrcVT))});
20118 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20119 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20120 DAG.getUNDEF(SrcVT)));
20121 }
20122 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20123 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20124
20125 return SDValue();
20126 }
20127
20128 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20129 "Unknown SINT_TO_FP to lower!");
20130
20131 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20132
20133 // These are really Legal; return the operand so the caller accepts it as
20134 // Legal.
20135 if (SrcVT == MVT::i32 && UseSSEReg)
20136 return Op;
20137 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20138 return Op;
20139
20140 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20141 return V;
20142 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20143 return V;
20144
20145 // SSE doesn't have an i16 conversion so we need to promote.
20146 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20147 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20148 if (IsStrict)
20149 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20150 {Chain, Ext});
20151
20152 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20153 }
20154
20155 if (VT == MVT::f128 || !Subtarget.hasX87())
20156 return SDValue();
20157
20158 SDValue ValueToStore = Src;
20159 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20160 // Bitcasting to f64 here allows us to do a single 64-bit store from
20161 // an SSE register, avoiding the store forwarding penalty that would come
20162 // with two 32-bit stores.
20163 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20164
20165 unsigned Size = SrcVT.getStoreSize();
20166 Align Alignment(Size);
20167 MachineFunction &MF = DAG.getMachineFunction();
20168 auto PtrVT = getPointerTy(MF.getDataLayout());
20169 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20170 MachinePointerInfo MPI =
20171 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20172 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20173 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20174 std::pair<SDValue, SDValue> Tmp =
20175 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20176
20177 if (IsStrict)
20178 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20179
20180 return Tmp.first;
20181}
20182
20183std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20184 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20185 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20186 // Build the FILD
20187 SDVTList Tys;
20188 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20189 if (useSSE)
20190 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20191 else
20192 Tys = DAG.getVTList(DstVT, MVT::Other);
20193
20194 SDValue FILDOps[] = {Chain, Pointer};
20195 SDValue Result =
20196 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20197 Alignment, MachineMemOperand::MOLoad);
20198 Chain = Result.getValue(1);
20199
20200 if (useSSE) {
20201 MachineFunction &MF = DAG.getMachineFunction();
20202 unsigned SSFISize = DstVT.getStoreSize();
20203 int SSFI =
20204 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20205 auto PtrVT = getPointerTy(MF.getDataLayout());
20206 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20207 Tys = DAG.getVTList(MVT::Other);
20208 SDValue FSTOps[] = {Chain, Result, StackSlot};
20209 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20210 MachinePointerInfo::getFixedStack(MF, SSFI),
20211 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20212
20213 Chain =
20214 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20215 Result = DAG.getLoad(
20216 DstVT, DL, Chain, StackSlot,
20217 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20218 Chain = Result.getValue(1);
20219 }
20220
20221 return { Result, Chain };
20222}
20223
20224/// Horizontal vector math instructions may be slower than normal math with
20225/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20226/// implementation, and likely shuffle complexity of the alternate sequence.
20227static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20228 const X86Subtarget &Subtarget) {
20229 bool IsOptimizingSize = DAG.shouldOptForSize();
20230 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20231 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20232}
20233
20234/// 64-bit unsigned integer to double expansion.
20235static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20236 SelectionDAG &DAG,
20237 const X86Subtarget &Subtarget) {
20238 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20239 // when converting 0 while rounding toward negative infinity. The caller will
20240 // fall back to Expand (for when i64 is legal) or use FILD in 32-bit mode.
20241 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20242 // This algorithm is not obvious. Here is what we're trying to output:
20243 /*
20244 movq %rax, %xmm0
20245 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20246 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20247 #ifdef __SSE3__
20248 haddpd %xmm0, %xmm0
20249 #else
20250 pshufd $0x4e, %xmm0, %xmm1
20251 addpd %xmm1, %xmm0
20252 #endif
20253 */
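// Reading of the magic constants: 0x43300000 and 0x45300000 are the high words
// of the doubles 0x1.0p52 and 0x1.0p84. After the unpack, lane 0 holds the
// double 2^52 + lo32 and lane 1 holds 2^84 + hi32 * 2^32 (both exact), so
// subtracting c1 = { 2^52, 2^84 } and adding the two lanes reconstructs the
// full unsigned 64-bit value as a double.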
20254
20255 LLVMContext *Context = DAG.getContext();
20256
20257 // Build some magic constants.
20258 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20259 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20260 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20261 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20262
20263 SmallVector<Constant*,2> CV1;
20264 CV1.push_back(
20265 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20266 APInt(64, 0x4330000000000000ULL))));
20267 CV1.push_back(
20268 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20269 APInt(64, 0x4530000000000000ULL))));
20270 Constant *C1 = ConstantVector::get(CV1);
20271 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20272
20273 // Load the 64-bit value into an XMM register.
20274 SDValue XR1 =
20275 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20276 SDValue CLod0 = DAG.getLoad(
20277 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20278 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20279 SDValue Unpck1 =
20280 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20281
20282 SDValue CLod1 = DAG.getLoad(
20283 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20284 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20285 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20286 // TODO: Are there any fast-math-flags to propagate here?
20287 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20288 SDValue Result;
20289
20290 if (Subtarget.hasSSE3() &&
20291 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20292 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20293 } else {
20294 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20295 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20296 }
20297 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20298 DAG.getVectorIdxConstant(0, dl));
20299 return Result;
20300}
20301
20302/// 32-bit unsigned integer to float expansion.
20303static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20304 SelectionDAG &DAG,
20305 const X86Subtarget &Subtarget) {
20306 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20307 // FP constant to bias correct the final result.
20308 SDValue Bias = DAG.getConstantFP(
20309 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
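// 0x4330000000000000 is the bit pattern of 2^52. OR-ing the 32-bit value into
// the low mantissa bits below produces the exact double 2^52 + x, so
// subtracting the bias recovers x as a double.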
20310
20311 // Load the 32-bit value into an XMM register.
20312 SDValue Load =
20313 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20314
20315 // Zero out the upper parts of the register.
20316 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20317
20318 // Or the load with the bias.
20319 SDValue Or = DAG.getNode(
20320 ISD::OR, dl, MVT::v2i64,
20321 DAG.getBitcast(MVT::v2i64, Load),
20322 DAG.getBitcast(MVT::v2i64,
20323 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20324 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20325 DAG.getBitcast(MVT::v2f64, Or),
20326 DAG.getVectorIdxConstant(0, dl));
20327
20328 if (Op.getNode()->isStrictFPOpcode()) {
20329 // Subtract the bias.
20330 // TODO: Are there any fast-math-flags to propagate here?
20331 SDValue Chain = Op.getOperand(0);
20332 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20333 {Chain, Or, Bias});
20334
20335 if (Op.getValueType() == Sub.getValueType())
20336 return Sub;
20337
20338 // Handle final rounding.
20339 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20340 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20341
20342 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20343 }
20344
20345 // Subtract the bias.
20346 // TODO: Are there any fast-math-flags to propagate here?
20347 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20348
20349 // Handle final rounding.
20350 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20351}
20352
20353static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20354 SelectionDAG &DAG,
20355 const X86Subtarget &Subtarget) {
20356 if (Op.getSimpleValueType() != MVT::v2f64)
20357 return SDValue();
20358
20359 bool IsStrict = Op->isStrictFPOpcode();
20360
20361 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20362 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20363
20364 if (Subtarget.hasAVX512()) {
20365 if (!Subtarget.hasVLX()) {
20366 // Let generic type legalization widen this.
20367 if (!IsStrict)
20368 return SDValue();
20369 // Otherwise pad the integer input with 0s and widen the operation.
20370 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20371 DAG.getConstant(0, DL, MVT::v2i32));
20372 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20373 {Op.getOperand(0), N0});
20374 SDValue Chain = Res.getValue(1);
20375 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20376 DAG.getVectorIdxConstant(0, DL));
20377 return DAG.getMergeValues({Res, Chain}, DL);
20378 }
20379
20380 // Legalize to v4i32 type.
20381 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20382 DAG.getUNDEF(MVT::v2i32));
20383 if (IsStrict)
20384 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20385 {Op.getOperand(0), N0});
20386 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20387 }
20388
20389 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20390 // This gives us the floating point equivalent of 2^52 + the i32 integer
20391 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20392 // point leaving just our i32 integers in double format.
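// For example, a lane holding 7 becomes the i64 0x4330000000000007, which is
// the double 2^52 + 7; subtracting VBias (2^52) leaves exactly 7.0.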
20393 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20394 SDValue VBias = DAG.getConstantFP(
20395 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20396 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20397 DAG.getBitcast(MVT::v2i64, VBias));
20398 Or = DAG.getBitcast(MVT::v2f64, Or);
20399
20400 if (IsStrict)
20401 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20402 {Op.getOperand(0), Or, VBias});
20403 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20404}
20405
20406static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20407 SelectionDAG &DAG,
20408 const X86Subtarget &Subtarget) {
20409 bool IsStrict = Op->isStrictFPOpcode();
20410 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20411 MVT VecIntVT = V.getSimpleValueType();
20412 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20413 "Unsupported custom type");
20414
20415 if (Subtarget.hasAVX512()) {
20416 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20417 assert(!Subtarget.hasVLX() && "Unexpected features");
20418 MVT VT = Op->getSimpleValueType(0);
20419
20420 // v8i32->v8f64 is legal with AVX512 so just return it.
20421 if (VT == MVT::v8f64)
20422 return Op;
20423
20424 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20425 VT == MVT::v8f16) &&
20426 "Unexpected VT!");
20427 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20428 MVT WideIntVT = MVT::v16i32;
20429 if (VT == MVT::v4f64) {
20430 WideVT = MVT::v8f64;
20431 WideIntVT = MVT::v8i32;
20432 }
20433
20434 // Need to concat with zero vector for strict fp to avoid spurious
20435 // exceptions.
20436 SDValue Tmp =
20437 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20438 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20439 DAG.getVectorIdxConstant(0, DL));
20440 SDValue Res, Chain;
20441 if (IsStrict) {
20442 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20443 {Op->getOperand(0), V});
20444 Chain = Res.getValue(1);
20445 } else {
20446 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20447 }
20448
20449 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20450 DAG.getVectorIdxConstant(0, DL));
20451
20452 if (IsStrict)
20453 return DAG.getMergeValues({Res, Chain}, DL);
20454 return Res;
20455 }
20456
20457 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20458 Op->getSimpleValueType(0) == MVT::v4f64) {
20459 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20460 Constant *Bias = ConstantFP::get(
20461 *DAG.getContext(),
20462 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20463 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20464 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20465 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20466 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20467 SDValue VBias = DAG.getMemIntrinsicNode(
20468 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20469 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
20470 /*Alignment*/ Align(8), MachineMemOperand::MOLoad);
20471
20472 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20473 DAG.getBitcast(MVT::v4i64, VBias));
20474 Or = DAG.getBitcast(MVT::v4f64, Or);
20475
20476 if (IsStrict)
20477 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20478 {Op.getOperand(0), Or, VBias});
20479 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20480 }
20481
20482 // The algorithm is the following:
20483 // #ifdef __SSE4_1__
20484 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20485 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20486 // (uint4) 0x53000000, 0xaa);
20487 // #else
20488 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20489 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20490 // #endif
20491 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20492 // return (float4) lo + fhi;
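// Reading of the constants: 0x4b000000 is 2^23 and 0x53000000 is 2^39 as float
// bit patterns. lo is the exact float 2^23 + (v & 0xffff) and hi is
// 2^39 + (v >> 16) * 2^16, so subtracting 2^39 + 2^23 (encoded below as
// 0x53000080) from hi and adding lo cancels both biases, leaving the original
// 32-bit value.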
20493
20494 bool Is128 = VecIntVT == MVT::v4i32;
20495 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20496 // If we convert to something else than the supported type, e.g., to v4f64,
20497 // abort early.
20498 if (VecFloatVT != Op->getSimpleValueType(0))
20499 return SDValue();
20500
20501 // In the #ifdef/#else code, we have in common:
20502 // - The vector of constants:
20503 // -- 0x4b000000
20504 // -- 0x53000000
20505 // - A shift:
20506 // -- v >> 16
20507
20508 // Create the splat vector for 0x4b000000.
20509 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20510 // Create the splat vector for 0x53000000.
20511 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20512
20513 // Create the right shift.
20514 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20515 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20516
20517 SDValue Low, High;
20518 if (Subtarget.hasSSE41()) {
20519 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20520 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20521 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20522 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20523 // Low will be bitcasted right away, so do not bother bitcasting back to its
20524 // original type.
20525 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20526 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20527 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20528 // (uint4) 0x53000000, 0xaa);
20529 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20530 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20531 // High will be bitcasted right away, so do not bother bitcasting back to
20532 // its original type.
20533 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20534 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20535 } else {
20536 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20537 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20538 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20539 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20540
20541 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20542 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20543 }
20544
20545 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20546 SDValue VecCstFSub = DAG.getConstantFP(
20547 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20548
20549 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20550 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20551 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20552 // enabled. See PR24512.
20553 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20554 // TODO: Are there any fast-math-flags to propagate here?
20555 // (float4) lo;
20556 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20557 // return (float4) lo + fhi;
20558 if (IsStrict) {
20559 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20560 {Op.getOperand(0), HighBitcast, VecCstFSub});
20561 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20562 {FHigh.getValue(1), LowBitcast, FHigh});
20563 }
20564
20565 SDValue FHigh =
20566 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20567 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20568}
20569
20570static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20571 const X86Subtarget &Subtarget) {
20572 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20573 SDValue N0 = Op.getOperand(OpNo);
20574 MVT SrcVT = N0.getSimpleValueType();
20575
20576 switch (SrcVT.SimpleTy) {
20577 default:
20578 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20579 case MVT::v2i32:
20580 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20581 case MVT::v4i32:
20582 case MVT::v8i32:
20583 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20584 case MVT::v2i64:
20585 case MVT::v4i64:
20586 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20587 }
20588}
20589
20590SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20591 SelectionDAG &DAG) const {
20592 bool IsStrict = Op->isStrictFPOpcode();
20593 unsigned OpNo = IsStrict ? 1 : 0;
20594 SDValue Src = Op.getOperand(OpNo);
20595 SDLoc dl(Op);
20596 auto PtrVT = getPointerTy(DAG.getDataLayout());
20597 MVT SrcVT = Src.getSimpleValueType();
20598 MVT DstVT = Op->getSimpleValueType(0);
20599 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20600
20601 // Bail out when we don't have native conversion instructions.
20602 if (DstVT == MVT::f128)
20603 return SDValue();
20604
20605 if (isSoftF16(DstVT, Subtarget))
20606 return promoteXINT_TO_FP(Op, dl, DAG);
20607 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20608 return Op;
20609
20610 if (DstVT.isVector())
20611 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20612
20613 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20614 return LowerWin64_INT128_TO_FP(Op, DAG);
20615
20616 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20617 return Extract;
20618
20619 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20620 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20621 // Conversions from unsigned i32 to f32/f64 are legal,
20622 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20623 return Op;
20624 }
20625
20626 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20627 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20628 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20629 if (IsStrict)
20630 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20631 {Chain, Src});
20632 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20633 }
20634
20635 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20636 return V;
20637 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20638 return V;
20639
20640 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20641 // infinity. It produces -0.0, so disable under strictfp.
20642 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20643 !IsStrict)
20644 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20645 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20646 // negative infinity, so disable it under strictfp and use FILD instead.
20647 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20648 !IsStrict)
20649 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20650 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20651 (DstVT == MVT::f32 || DstVT == MVT::f64))
20652 return SDValue();
20653
20654 // Make a 64-bit buffer, and use it to build an FILD.
20655 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20656 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20657 Align SlotAlign(8);
20658 MachinePointerInfo MPI =
20659 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20660 if (SrcVT == MVT::i32) {
20661 SDValue OffsetSlot =
20662 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20663 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20664 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20665 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20666 std::pair<SDValue, SDValue> Tmp =
20667 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20668 if (IsStrict)
20669 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20670
20671 return Tmp.first;
20672 }
20673
20674 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20675 SDValue ValueToStore = Src;
20676 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20677 // Bitcasting to f64 here allows us to do a single 64-bit store from
20678 // an SSE register, avoiding the store forwarding penalty that would come
20679 // with two 32-bit stores.
20680 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20681 }
20682 SDValue Store =
20683 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20684 // For i64 source, we need to add the appropriate power of 2 if the input
20685 // was negative. We must be careful to do the computation in x87 extended
20686 // precision, not in SSE.
20687 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20688 SDValue Ops[] = {Store, StackSlot};
20689 SDValue Fild =
20690 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20691 SlotAlign, MachineMemOperand::MOLoad);
20692 Chain = Fild.getValue(1);
20693
20694 // Check whether the sign bit is set.
20695 SDValue SignSet = DAG.getSetCC(
20696 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20697 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20698
20699 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20700 APInt FF(64, 0x5F80000000000000ULL);
20701 SDValue FudgePtr =
20702 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20703 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20704
20705 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20706 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20707 SDValue Four = DAG.getIntPtrConstant(4, dl);
20708 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20709 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20710
20711 // Load the value out, extending it from f32 to f80.
20712 SDValue Fudge = DAG.getExtLoad(
20713 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20714 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20715 CPAlignment);
20716 Chain = Fudge.getValue(1);
20717 // Extend everything to 80 bits to force it to be done on x87.
20718 // TODO: Are there any fast-math-flags to propagate here?
20719 if (IsStrict) {
20720 unsigned Opc = ISD::STRICT_FADD;
20721 // Windows needs the precision control changed to 80bits around this add.
20722 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20723 Opc = X86ISD::STRICT_FP80_ADD;
20724
20725 SDValue Add =
20726 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20727 // STRICT_FP_ROUND can't handle equal types.
20728 if (DstVT == MVT::f80)
20729 return Add;
20730 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20731 {Add.getValue(1), Add,
20732 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20733 }
20734 unsigned Opc = ISD::FADD;
20735 // Windows needs the precision control changed to 80bits around this add.
20736 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20737 Opc = X86ISD::FP80_ADD;
20738
20739 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20740 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20741 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20742}
20743
20744// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20745// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20746// just return an SDValue().
20747// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20748// to i16, i32 or i64, and we lower it to a legal sequence and return the
20749// result.
20750SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20751 bool IsSigned,
20752 SDValue &Chain) const {
20753 bool IsStrict = Op->isStrictFPOpcode();
20754 SDLoc DL(Op);
20755
20756 EVT DstTy = Op.getValueType();
20757 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20758 EVT TheVT = Value.getValueType();
20759 auto PtrVT = getPointerTy(DAG.getDataLayout());
20760
20761 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20762 // f16 must be promoted before using the lowering in this routine.
20763 // fp128 does not use this lowering.
20764 return SDValue();
20765 }
20766
20767 // If using FIST to compute an unsigned i64, we'll need some fixup
20768 // to handle values above the maximum signed i64. A FIST is always
20769 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20770 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20771
20772 // FIXME: This does not generate an invalid exception if the input does not
20773 // fit in i32. PR44019
20774 if (!IsSigned && DstTy != MVT::i64) {
20775 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20776 // The low 32 bits of the fist result will have the correct uint32 result.
20777 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20778 DstTy = MVT::i64;
20779 }
20780
20781 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20782 DstTy.getSimpleVT() >= MVT::i16 &&
20783 "Unknown FP_TO_INT to lower!");
20784
20785 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20786 // stack slot.
20787 MachineFunction &MF = DAG.getMachineFunction();
20788 unsigned MemSize = DstTy.getStoreSize();
20789 int SSFI =
20790 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20791 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20792
20793 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20794
20795 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20796
20797 if (UnsignedFixup) {
20798 //
20799 // Conversion to unsigned i64 is implemented with a select,
20800 // depending on whether the source value fits in the range
20801 // of a signed i64. Let Thresh be the FP equivalent of
20802 // 0x8000000000000000ULL.
20803 //
20804 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20805 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20806 // FistSrc = (Value - FltOfs);
20807 // Fist-to-mem64 FistSrc
20808 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20809 // to XOR'ing the high 32 bits with Adjust.
20810 //
20811 // Being a power of 2, Thresh is exactly representable in all FP formats.
20812 // For X87 we'd like to use the smallest FP type for this constant, but
20813 // for DAG type consistency we have to match the FP operand type.
20814
20815 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20816 APFloat::opStatus Status = APFloat::opOK;
20817 bool LosesInfo = false;
20818 if (TheVT == MVT::f64)
20819 // The rounding mode is irrelevant as the conversion should be exact.
20820 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20821 &LosesInfo);
20822 else if (TheVT == MVT::f80)
20823 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20824 APFloat::rmNearestTiesToEven, &LosesInfo);
20825
20826 assert(Status == APFloat::opOK && !LosesInfo &&
20827 "FP conversion should have been exact");
20828
20829 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20830
20831 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20832 *DAG.getContext(), TheVT);
20833 SDValue Cmp;
20834 if (IsStrict) {
20835 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20836 /*IsSignaling*/ true);
20837 Chain = Cmp.getValue(1);
20838 } else {
20839 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20840 }
20841
20842 // Our preferred lowering of
20843 //
20844 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20845 //
20846 // is
20847 //
20848 // (Value >= Thresh) << 63
20849 //
20850 // but since we can get here after LegalOperations, DAGCombine might do the
20851 // wrong thing if we create a select. So, directly create the preferred
20852 // version.
20853 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20854 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20855 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20856
20857 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20858 DAG.getConstantFP(0.0, DL, TheVT));
20859
20860 if (IsStrict) {
20861 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20862 { Chain, Value, FltOfs });
20863 Chain = Value.getValue(1);
20864 } else
20865 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20866 }
20867
20868 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20869
20870 // FIXME: This causes a redundant load/store if the SSE-class value is already
20871 // in memory, such as if it is on the call stack.
20872 if (isScalarFPTypeInSSEReg(TheVT)) {
20873 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20874 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20875 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20876 SDValue Ops[] = { Chain, StackSlot };
20877
20878 unsigned FLDSize = TheVT.getStoreSize();
20879 assert(FLDSize <= MemSize && "Stack slot not big enough");
20880 MachineMemOperand *MMO = MF.getMachineMemOperand(
20881 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20882 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20883 Chain = Value.getValue(1);
20884 }
20885
20886 // Build the FP_TO_INT*_IN_MEM
20887 MachineMemOperand *MMO = MF.getMachineMemOperand(
20888 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20889 SDValue Ops[] = { Chain, Value, StackSlot };
20890 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20891 DAG.getVTList(MVT::Other),
20892 Ops, DstTy, MMO);
20893
20894 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20895 Chain = Res.getValue(1);
20896
20897 // If we need an unsigned fixup, XOR the result with adjust.
20898 if (UnsignedFixup)
20899 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20900
20901 return Res;
20902}
20903
20904static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20905 const X86Subtarget &Subtarget) {
20906 MVT VT = Op.getSimpleValueType();
20907 SDValue In = Op.getOperand(0);
20908 MVT InVT = In.getSimpleValueType();
20909 unsigned Opc = Op.getOpcode();
20910
20911 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20912 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20913 "Unexpected extension opcode");
20914 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20915 "Expected same number of elements");
20916 assert((VT.getVectorElementType() == MVT::i16 ||
20917 VT.getVectorElementType() == MVT::i32 ||
20918 VT.getVectorElementType() == MVT::i64) &&
20919 "Unexpected element type");
20920 assert((InVT.getVectorElementType() == MVT::i8 ||
20921 InVT.getVectorElementType() == MVT::i16 ||
20922 InVT.getVectorElementType() == MVT::i32) &&
20923 "Unexpected element type");
20924
20925 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20926
20927 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20928 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20929 return splitVectorIntUnary(Op, DAG, dl);
20930 }
20931
20932 if (Subtarget.hasInt256())
20933 return Op;
20934
20935 // Optimize vectors in AVX mode:
20936 //
20937 // v8i16 -> v8i32
20938 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20939 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20940 // Concat upper and lower parts.
20941 //
20942 // v4i32 -> v4i64
20943 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20944 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20945 // Concat upper and lower parts.
20946 //
20947 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20948 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20949
20950 // Short-circuit if we can determine that each 128-bit half is the same value.
20951 // Otherwise, this is difficult to match and optimize.
20952 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20953 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20954 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20955
20956 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20957 SDValue Undef = DAG.getUNDEF(InVT);
20958 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20959 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20960 OpHi = DAG.getBitcast(HalfVT, OpHi);
20961
20962 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20963}
20964
20965// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20966static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20967 const SDLoc &dl, SelectionDAG &DAG) {
20968 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20969 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20970 DAG.getVectorIdxConstant(0, dl));
20971 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20972 DAG.getVectorIdxConstant(8, dl));
20973 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20974 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20975 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20976 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20977}
20978
20979static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20980 const X86Subtarget &Subtarget,
20981 SelectionDAG &DAG) {
20982 MVT VT = Op->getSimpleValueType(0);
20983 SDValue In = Op->getOperand(0);
20984 MVT InVT = In.getSimpleValueType();
20985 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20986 unsigned NumElts = VT.getVectorNumElements();
20987
20988 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20989 // avoids a constant pool load.
20990 if (VT.getVectorElementType() != MVT::i8) {
20991 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20992 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20993 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20994 }
20995
20996 // Extend VT if BWI is not supported.
20997 MVT ExtVT = VT;
20998 if (!Subtarget.hasBWI()) {
20999 // If v16i32 is to be avoided, we'll need to split and concatenate.
21000 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21001 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21002
21003 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21004 }
21005
21006 // Widen to 512-bits if VLX is not supported.
21007 MVT WideVT = ExtVT;
21008 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21009 NumElts *= 512 / ExtVT.getSizeInBits();
21010 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21011 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21012 DAG.getVectorIdxConstant(0, DL));
21013 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21014 }
21015
21016 SDValue One = DAG.getConstant(1, DL, WideVT);
21017 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21018
21019 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21020
21021 // Truncate if we had to extend above.
21022 if (VT != ExtVT) {
21023 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21024 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21025 }
21026
21027 // Extract back to 128/256-bit if we widened.
21028 if (WideVT != VT)
21029 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21030 DAG.getVectorIdxConstant(0, DL));
21031
21032 return SelectedVal;
21033}
21034
21035static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21036 SelectionDAG &DAG) {
21037 SDValue In = Op.getOperand(0);
21038 MVT SVT = In.getSimpleValueType();
21039 SDLoc DL(Op);
21040
21041 if (SVT.getVectorElementType() == MVT::i1)
21042 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21043
21044 assert(Subtarget.hasAVX() && "Expected AVX support");
21045 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21046}
21047
21048/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21049/// It makes use of the fact that vectors with enough leading sign/zero bits
21050/// prevent the PACKSS/PACKUS from saturating the results.
21051/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21052/// within each 128-bit lane.
21053static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21054 const SDLoc &DL, SelectionDAG &DAG,
21055 const X86Subtarget &Subtarget) {
21056 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21057 "Unexpected PACK opcode");
21058 assert(DstVT.isVector() && "VT not a vector?");
21059
21060 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21061 if (!Subtarget.hasSSE2())
21062 return SDValue();
21063
21064 EVT SrcVT = In.getValueType();
21065
21066 // No truncation required, we might get here due to recursive calls.
21067 if (SrcVT == DstVT)
21068 return In;
21069
21070 unsigned NumElems = SrcVT.getVectorNumElements();
21071 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21072 return SDValue();
21073
21074 unsigned DstSizeInBits = DstVT.getSizeInBits();
21075 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21076 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21077 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21078
21079 LLVMContext &Ctx = *DAG.getContext();
21080 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21081 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21082
21083 // Pack to the largest type possible:
21084 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21085 EVT InVT = MVT::i16, OutVT = MVT::i8;
21086 if (SrcVT.getScalarSizeInBits() > 16 &&
21087 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21088 InVT = MVT::i32;
21089 OutVT = MVT::i16;
21090 }
21091
21092 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21093 // On pre-AVX512, pack the src in both halves to help value tracking.
21094 if (SrcSizeInBits <= 128) {
21095 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21096 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21097 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21098 SDValue LHS = DAG.getBitcast(InVT, In);
21099 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21100 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21101 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21102 Res = DAG.getBitcast(PackedVT, Res);
21103 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21104 }
21105
21106 // Split lower/upper subvectors.
21107 SDValue Lo, Hi;
21108 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21109
21110 // If Hi is undef, then don't bother packing it and widen the result instead.
21111 if (Hi.isUndef()) {
21112 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21113 if (SDValue Res =
21114 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21115 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21116 }
21117
21118 unsigned SubSizeInBits = SrcSizeInBits / 2;
21119 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21120 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21121
21122 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21123 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21124 Lo = DAG.getBitcast(InVT, Lo);
21125 Hi = DAG.getBitcast(InVT, Hi);
21126 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21127 return DAG.getBitcast(DstVT, Res);
21128 }
21129
21130 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21131 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21132 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21133 Lo = DAG.getBitcast(InVT, Lo);
21134 Hi = DAG.getBitcast(InVT, Hi);
21135 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21136
21137 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21138 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21139 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21140 SmallVector<int, 64> Mask;
21141 int Scale = 64 / OutVT.getScalarSizeInBits();
21142 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21143 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21144
21145 if (DstVT.is256BitVector())
21146 return DAG.getBitcast(DstVT, Res);
21147
21148 // If 512bit -> 128bit truncate another stage.
21149 Res = DAG.getBitcast(PackedVT, Res);
21150 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21151 }
21152
21153 // Recursively pack lower/upper subvectors, concat result and pack again.
21154 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21155
21156 if (PackedVT.is128BitVector()) {
21157 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21158 // type legalization.
21159 SDValue Res =
21160 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21161 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21162 }
21163
21164 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21165 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21166 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21167 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21168 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21169}
21170
21171/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21172/// e.g. trunc <8 x i32> X to <8 x i16> -->
21173/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21174/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21175static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21176 const X86Subtarget &Subtarget,
21177 SelectionDAG &DAG) {
21178 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21179 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21180}
21181
21182/// Truncate using inreg sign extension and X86ISD::PACKSS.
21183static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21184 const X86Subtarget &Subtarget,
21185 SelectionDAG &DAG) {
21186 EVT SrcVT = In.getValueType();
21187 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21188 DAG.getValueType(DstVT));
21189 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21190}
21191
21192/// Helper to determine if \p In truncated to \p DstVT has the necessary
21193/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21194/// possibly by converting a SRL node to SRA for sign extension.
21195static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21196 SDValue In, const SDLoc &DL,
21197 SelectionDAG &DAG,
21198 const X86Subtarget &Subtarget,
21199 const SDNodeFlags Flags = SDNodeFlags()) {
21200 // Requires SSE2.
21201 if (!Subtarget.hasSSE2())
21202 return SDValue();
21203
21204 EVT SrcVT = In.getValueType();
21205 EVT DstSVT = DstVT.getVectorElementType();
21206 EVT SrcSVT = SrcVT.getVectorElementType();
21207 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21208 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21209
21210 // Check we have a truncation suited for PACKSS/PACKUS.
21211 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21212 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21213 return SDValue();
21214
21215 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21216 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21217
21218 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21219 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21220 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21221 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21222 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21223 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21224 return SDValue();
21225
21226 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21227 // split this for packing.
21228 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21229 !isFreeToSplitVector(In, DAG) &&
21230 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21231 return SDValue();
21232
21233 // Don't lower truncation on AVX512 targets as multiple stages of PACK nodes.
21234 if (Subtarget.hasAVX512() && NumStages > 1)
21235 return SDValue();
21236
21237 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21238 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21239
21240 // Truncate with PACKUS if we are truncating a vector with leading zero
21241 // bits that extend all the way to the packed/truncated value.
21242 // e.g. Masks, zext_in_reg, etc.
21243 // Pre-SSE41 we can only use PACKUSWB.
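// For example, a v8i32 value produced by (and X, 0xffff) has at least 16 known
// leading zero bits per element, so packing to v8i16 with PACKUSDW cannot
// saturate.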
21244 KnownBits Known = DAG.computeKnownBits(In);
21245 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21246 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21247 PackOpcode = X86ISD::PACKUS;
21248 return In;
21249 }
21250
21251 // Truncate with PACKSS if we are truncating a vector with sign-bits
21252 // that extend all the way to the packed/truncated value.
21253 // e.g. Comparison result, sext_in_reg, etc.
21254 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21255
21256 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21257 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21258 // see through BITCASTs later on and combines/simplifications can't then use
21259 // it.
21260 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21261 !Subtarget.hasAVX512())
21262 return SDValue();
21263
21264 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21265 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21266 MinSignBits < NumSignBits) {
21267 PackOpcode = X86ISD::PACKSS;
21268 return In;
21269 }
21270
21271 // If we have a srl that only generates signbits that we will discard in
21272 // the truncation then we can use PACKSS by converting the srl to a sra.
21273 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21274 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21275 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21276 if (*ShAmt == MinSignBits) {
21277 PackOpcode = X86ISD::PACKSS;
21278 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21279 }
21280 }
21281
21282 return SDValue();
21283}
21284
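// --- Illustrative aside (editor sketch, not part of this file) ------------
// Why matchTruncateWithPACK insists on enough sign bits / leading zeros:
// PACKSS/PACKUS pack with *saturation*, so they only equal a plain
// truncation when the wide value is already sign- or zero-extended.
// Minimal scalar model; the helper names are invented.
#include <algorithm>
#include <cstdint>

static int8_t packssElt(int16_t X) {               // PACKSSWB, one lane
  return (int8_t)std::clamp<int>(X, -128, 127);    // signed saturation
}
static uint8_t packusElt(int16_t X) {              // PACKUSWB, one lane
  return (uint8_t)std::clamp<int>(X, 0, 255);      // unsigned saturation
}
// packssElt(-2) == (int8_t)-2, but packssElt(300) == 127, not the truncated
// value 44 -- hence the ComputeNumSignBits / computeKnownBits checks above.
// --- end of aside ----------------------------------------------------------
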
21285 /// This function lowers a vector truncation of 'extended sign-bits' or
21286 /// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
21287 /// into X86ISD::PACKSS/PACKUS operations.
21288 static SDValue LowerTruncateVecPackWithSignBits(
21289     MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21290 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21291 MVT SrcVT = In.getSimpleValueType();
21292 MVT DstSVT = DstVT.getVectorElementType();
21293 MVT SrcSVT = SrcVT.getVectorElementType();
21294 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21295 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21296 return SDValue();
21297
21298 // If the upper half of the source is undef, then attempt to split and
21299 // only truncate the lower half.
21300 if (DstVT.getSizeInBits() >= 128) {
21301 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21302 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21303 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21304 Subtarget, DAG))
21305 return widenSubVector(Res, false, Subtarget, DAG, DL,
21306 DstVT.getSizeInBits());
21307 }
21308 }
21309
21310 unsigned PackOpcode;
21311 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21312 Subtarget, Flags))
21313 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21314
21315 return SDValue();
21316}
21317
21318/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21319/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21320 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21321                                     const X86Subtarget &Subtarget,
21322 SelectionDAG &DAG) {
21323 MVT SrcVT = In.getSimpleValueType();
21324 MVT DstSVT = DstVT.getVectorElementType();
21325 MVT SrcSVT = SrcVT.getVectorElementType();
21326 unsigned NumElems = DstVT.getVectorNumElements();
21327 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21328 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21329 NumElems >= 8))
21330 return SDValue();
21331
21332   // SSSE3's pshufb results in fewer instructions in the cases below.
21333 if (Subtarget.hasSSSE3() && NumElems == 8) {
21334 if (SrcSVT == MVT::i16)
21335 return SDValue();
21336 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21337 return SDValue();
21338 }
21339
21340 // If the upper half of the source is undef, then attempt to split and
21341 // only truncate the lower half.
21342 if (DstVT.getSizeInBits() >= 128) {
21343 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21344 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21345 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21346 return widenSubVector(Res, false, Subtarget, DAG, DL,
21347 DstVT.getSizeInBits());
21348 }
21349 }
21350
21351 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21352 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21353 // truncate 2 x v4i32 to v8i16.
21354 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21355 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21356
21357 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21358 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21359
21360 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21361 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21362 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21363 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21364 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21365 }
21366
21367 return SDValue();
21368}
21369
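// --- Illustrative aside (editor sketch, not part of this file) ------------
// LowerTruncateVecPack relies on packing in halving stages (e.g. i32->i16
// then i16->i8). Scalar model of the two PACKSS stages; it only equals a
// plain truncation when the input already fits the destination type, which
// is the situation the callers arrange. The helper name is invented.
#include <algorithm>
#include <cstdint>

static int8_t truncViaTwoPackStages(int32_t X) {
  int16_t Stage1 = (int16_t)std::clamp<int32_t>(X, INT16_MIN, INT16_MAX); // PACKSSDW
  int8_t Stage2 = (int8_t)std::clamp<int16_t>(Stage1, INT8_MIN, INT8_MAX); // PACKSSWB
  return Stage2; // e.g. truncViaTwoPackStages(-5) == -5
}
// --- end of aside ----------------------------------------------------------
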
21370 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21371                                   SelectionDAG &DAG,
21372 const X86Subtarget &Subtarget) {
21373 MVT VT = Op.getSimpleValueType();
21374 SDValue In = Op.getOperand(0);
21375 MVT InVT = In.getSimpleValueType();
21376 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21377
21378 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21379 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21380 if (InVT.getScalarSizeInBits() <= 16) {
21381 if (Subtarget.hasBWI()) {
21382 // legal, will go to VPMOVB2M, VPMOVW2M
21383 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21384 // We need to shift to get the lsb into sign position.
21385         // Shifting packed bytes is not supported natively, so bitcast to words.
21386 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21387 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21388 DAG.getBitcast(ExtVT, In),
21389 DAG.getConstant(ShiftInx, DL, ExtVT));
21390 In = DAG.getBitcast(InVT, In);
21391 }
21392 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21393 In, ISD::SETGT);
21394 }
21395 // Use TESTD/Q, extended vector to packed dword/qword.
21396 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21397 "Unexpected vector type.");
21398 unsigned NumElts = InVT.getVectorNumElements();
21399 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21400 // We need to change to a wider element type that we have support for.
21401 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21402 // For 16 element vectors we extend to v16i32 unless we are explicitly
21403 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21404 // we need to split into two 8 element vectors which we can extend to v8i32,
21405 // truncate and concat the results. There's an additional complication if
21406 // the original type is v16i8. In that case we can't split the v16i8
21407 // directly, so we need to shuffle high elements to low and use
21408 // sign_extend_vector_inreg.
21409 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21410 SDValue Lo, Hi;
21411 if (InVT == MVT::v16i8) {
21412 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21413 Hi = DAG.getVectorShuffle(
21414 InVT, DL, In, In,
21415 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21416 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21417 } else {
21418 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21419 Lo = extract128BitVector(In, 0, DAG, DL);
21420 Hi = extract128BitVector(In, 8, DAG, DL);
21421 }
21422 // We're split now, just emit two truncates and a concat. The two
21423 // truncates will trigger legalization to come back to this function.
21424 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21425 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21426 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21427 }
21428 // We either have 8 elements or we're allowed to use 512-bit vectors.
21429 // If we have VLX, we want to use the narrowest vector that can get the
21430 // job done so we use vXi32.
21431 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21432 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21433 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21434 InVT = ExtVT;
21435 ShiftInx = InVT.getScalarSizeInBits() - 1;
21436 }
21437
21438 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21439 // We need to shift to get the lsb into sign position.
21440 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21441 DAG.getConstant(ShiftInx, DL, InVT));
21442 }
21443 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21444 if (Subtarget.hasDQI())
21445 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21446 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21447}
21448
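// --- Illustrative aside (editor sketch, not part of this file) ------------
// The vXi1 truncation above moves the LSB into the sign bit and then does a
// signed compare against zero (VPMOVW2M / SETGT 0). Scalar model for one
// i16 lane; the name is invented.
#include <cstdint>

static bool truncToI1(int16_t X) {
  int16_t Shifted = (int16_t)((uint16_t)X << 15); // LSB is now the sign bit
  return 0 > Shifted;                             // true iff the LSB was set
}
// --- end of aside ----------------------------------------------------------
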
21449SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21450 SDLoc DL(Op);
21451 MVT VT = Op.getSimpleValueType();
21452 SDValue In = Op.getOperand(0);
21453 MVT InVT = In.getSimpleValueType();
21454   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21455          "Invalid TRUNCATE operation");
21456
21457 // If we're called by the type legalizer, handle a few cases.
21458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21459 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21460 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21461 VT.is128BitVector() && Subtarget.hasAVX512()) {
21462 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21463 "Unexpected subtarget!");
21464 // The default behavior is to truncate one step, concatenate, and then
21465 // truncate the remainder. We'd rather produce two 64-bit results and
21466 // concatenate those.
21467 SDValue Lo, Hi;
21468 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21469
21470 EVT LoVT, HiVT;
21471 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21472
21473 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21474 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21475 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21476 }
21477
21478 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21479 if (!Subtarget.hasAVX512() ||
21480 (InVT.is512BitVector() && VT.is256BitVector()))
21481       if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21482               VT, In, DL, Subtarget, DAG, Op->getFlags()))
21483 return SignPack;
21484
21485 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21486 if (!Subtarget.hasAVX512())
21487 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21488
21489 // Otherwise let default legalization handle it.
21490 return SDValue();
21491 }
21492
21493 if (VT.getVectorElementType() == MVT::i1)
21494 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21495
21496 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21497 // concat from subvectors to use VPTRUNC etc.
21498 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21499     if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21500             VT, In, DL, Subtarget, DAG, Op->getFlags()))
21501 return SignPack;
21502
21503 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21504 if (Subtarget.hasAVX512()) {
21505 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21506 assert(VT == MVT::v32i8 && "Unexpected VT!");
21507 return splitVectorIntUnary(Op, DAG, DL);
21508 }
21509
21510     // Word to byte only under BWI. Otherwise we have to promote to v16i32
21511 // and then truncate that. But we should only do that if we haven't been
21512 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21513 // handled by isel patterns.
21514 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21515 Subtarget.canExtendTo512DQ())
21516 return Op;
21517 }
21518
21519 // Handle truncation of V256 to V128 using shuffles.
21520 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21521
21522 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21523 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21524 if (Subtarget.hasInt256()) {
21525 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21526 In = DAG.getBitcast(MVT::v8i32, In);
21527 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21528 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21529 DAG.getVectorIdxConstant(0, DL));
21530 }
21531
21532 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21533 DAG.getVectorIdxConstant(0, DL));
21534 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21535 DAG.getVectorIdxConstant(2, DL));
21536 static const int ShufMask[] = {0, 2, 4, 6};
21537 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21538 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21539 }
21540
21541 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21542 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21543 if (Subtarget.hasInt256()) {
21544 // The PSHUFB mask:
21545 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21546 -1, -1, -1, -1, -1, -1, -1, -1,
21547 16, 17, 20, 21, 24, 25, 28, 29,
21548 -1, -1, -1, -1, -1, -1, -1, -1 };
21549 In = DAG.getBitcast(MVT::v32i8, In);
21550 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21551 In = DAG.getBitcast(MVT::v4i64, In);
21552
21553 static const int ShufMask2[] = {0, 2, -1, -1};
21554 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21555 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21556 DAG.getVectorIdxConstant(0, DL));
21557 return DAG.getBitcast(MVT::v8i16, In);
21558 }
21559
21560 return Subtarget.hasSSE41()
21561 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21562 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21563 }
21564
21565 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21566 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21567
21568 llvm_unreachable("All 256->128 cases should have been handled above!");
21569}
21570
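// --- Illustrative aside (editor sketch, not part of this file) ------------
// The 256->128-bit integer truncations above amount to "keep the low half of
// each wide element", i.e. a shuffle selecting the even 32-bit words on a
// little-endian layout -- the {0, 2, 4, 6} masks used with VPERMD/PSHUFD.
// Scalar model of v4i64 -> v4i32; assumes little-endian, name is invented.
#include <array>
#include <cstdint>
#include <cstring>

static std::array<uint32_t, 4>
truncV4i64ToV4i32(const std::array<uint64_t, 4> &In) {
  std::array<uint32_t, 8> Words;
  std::memcpy(Words.data(), In.data(), sizeof(In));
  return {Words[0], Words[2], Words[4], Words[6]};
}
// --- end of aside ----------------------------------------------------------
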
21571// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21572// behaves on out of range inputs to generate optimized conversions.
21573 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21574                                     SelectionDAG &DAG,
21575 const X86Subtarget &Subtarget) {
21576 MVT SrcVT = Src.getSimpleValueType();
21577 unsigned DstBits = VT.getScalarSizeInBits();
21578 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21579
21580 // Calculate the converted result for values in the range 0 to
21581 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21582 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21583 SDValue Big =
21584 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21585 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21586 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21587
21588 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21589 // and only if the value was out of range. So we can use that
21590   // as our indicator that we'd rather use "Big" instead of "Small".
21591 //
21592 // Use "Small" if "IsOverflown" has all bits cleared
21593 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21594
21595 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21596 // use the slightly slower blendv select instead.
21597 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21598 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21599 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21600 }
21601
21602 SDValue IsOverflown =
21603 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21604 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21605 return DAG.getNode(ISD::OR, dl, VT, Small,
21606 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21607}
21608
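// --- Illustrative aside (editor sketch, not part of this file) ------------
// Scalar model of expandFP_TO_UINT_SSE for one f32 lane. cvttModel mimics
// the documented SSE behaviour of returning the "integer indefinite" value
// 0x80000000 for out-of-range inputs; the right shift of a negative value is
// assumed to be arithmetic (true on x86). Names are invented.
#include <cstdint>

static int32_t cvttModel(float X) {                  // CVTTPS2DQ, one lane
  if (!(X >= -2147483648.0f && X < 2147483648.0f))
    return INT32_MIN;                                // integer indefinite
  return (int32_t)X;
}

static uint32_t fpToUintSSE(float X) {
  int32_t Small = cvttModel(X);                      // valid for [0, 2^31)
  int32_t Big = cvttModel(X - 2147483648.0f);        // valid for [2^31, 2^32)
  int32_t IsOverflown = Small >> 31;                 // all-ones iff Small overflowed
  return (uint32_t)(Small | (Big & IsOverflown));    // Small, or 0x80000000 | Big
}
// --- end of aside ----------------------------------------------------------
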
21609SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21610 bool IsStrict = Op->isStrictFPOpcode();
21611 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21612 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21613 bool HasVLX = Subtarget.hasVLX();
21614 MVT VT = Op->getSimpleValueType(0);
21615 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21616 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21617 MVT SrcVT = Src.getSimpleValueType();
21618 SDLoc dl(Op);
21619
21620 SDValue Res;
21621 if (isSoftF16(SrcVT, Subtarget)) {
21622 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21623 if (IsStrict)
21624 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21625 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21626 {NVT, MVT::Other}, {Chain, Src})});
21627 return DAG.getNode(Op.getOpcode(), dl, VT,
21628 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21629 } else if (isTypeLegal(SrcVT) &&
21630 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21631 return Op;
21632 }
21633
21634 if (VT.isVector()) {
21635 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21636 MVT ResVT = MVT::v4i32;
21637 MVT TruncVT = MVT::v4i1;
21638 unsigned Opc;
21639 if (IsStrict)
21640       Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21641     else
21642 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21643
21644 if (!IsSigned && !HasVLX) {
21645 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21646 // Widen to 512-bits.
21647 ResVT = MVT::v8i32;
21648 TruncVT = MVT::v8i1;
21649 Opc = Op.getOpcode();
21650 // Need to concat with zero vector for strict fp to avoid spurious
21651 // exceptions.
21652 // TODO: Should we just do this for non-strict as well?
21653 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21654 : DAG.getUNDEF(MVT::v8f64);
21655 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21656 DAG.getVectorIdxConstant(0, dl));
21657 }
21658 if (IsStrict) {
21659 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21660 Chain = Res.getValue(1);
21661 } else {
21662 Res = DAG.getNode(Opc, dl, ResVT, Src);
21663 }
21664
21665 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21666 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21667 DAG.getVectorIdxConstant(0, dl));
21668 if (IsStrict)
21669 return DAG.getMergeValues({Res, Chain}, dl);
21670 return Res;
21671 }
21672
21673 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21674 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21675 VT == MVT::v32i16)
21676 return Op;
21677
21678 MVT ResVT = VT;
21679 MVT EleVT = VT.getVectorElementType();
21680 if (EleVT != MVT::i64)
21681 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21682
21683 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21684 SDValue Tmp =
21685 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21686 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21687 Ops[0] = Src;
21688 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21689 }
21690
21691 if (!HasVLX) {
21692 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21693 // Widen to 512-bits.
21694 unsigned IntSize = EleVT.getSizeInBits();
21695 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21696 ResVT = MVT::getVectorVT(EleVT, Num);
21697 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21698 Subtarget, DAG, dl);
21699 }
21700
21701 if (IsStrict) {
21702 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21703                               : X86ISD::STRICT_CVTTP2UI,
21704                           dl, {ResVT, MVT::Other}, {Chain, Src});
21705 Chain = Res.getValue(1);
21706 } else {
21707 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21708 ResVT, Src);
21709 }
21710
21711 // TODO: Need to add exception check code for strict FP.
21712 if (EleVT.getSizeInBits() < 16) {
21713 if (HasVLX)
21714 ResVT = MVT::getVectorVT(EleVT, 8);
21715 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21716 }
21717
21718 if (ResVT != VT)
21719 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21720 DAG.getVectorIdxConstant(0, dl));
21721
21722 if (IsStrict)
21723 return DAG.getMergeValues({Res, Chain}, dl);
21724 return Res;
21725 }
21726
21727 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21728 if (VT.getVectorElementType() == MVT::i16) {
21729 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21730 SrcVT.getVectorElementType() == MVT::f64) &&
21731 "Expected f32/f64 vector!");
21732 MVT NVT = VT.changeVectorElementType(MVT::i32);
21733 if (IsStrict) {
21734 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21735                                    : ISD::STRICT_FP_TO_UINT,
21736                           dl, {NVT, MVT::Other}, {Chain, Src});
21737 Chain = Res.getValue(1);
21738 } else {
21739 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21740 NVT, Src);
21741 }
21742
21743 // TODO: Need to add exception check code for strict FP.
21744 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21745
21746 if (IsStrict)
21747 return DAG.getMergeValues({Res, Chain}, dl);
21748 return Res;
21749 }
21750
21751 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21752 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21753 assert(!IsSigned && "Expected unsigned conversion!");
21754 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21755 return Op;
21756 }
21757
21758 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21759 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21760 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21761 Subtarget.useAVX512Regs()) {
21762 assert(!IsSigned && "Expected unsigned conversion!");
21763 assert(!Subtarget.hasVLX() && "Unexpected features!");
21764 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21765 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21766 // Need to concat with zero vector for strict fp to avoid spurious
21767 // exceptions.
21768 // TODO: Should we just do this for non-strict as well?
21769 SDValue Tmp =
21770 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21771 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21772 DAG.getVectorIdxConstant(0, dl));
21773
21774 if (IsStrict) {
21775 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21776 {Chain, Src});
21777 Chain = Res.getValue(1);
21778 } else {
21779 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21780 }
21781
21782 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21783 DAG.getVectorIdxConstant(0, dl));
21784
21785 if (IsStrict)
21786 return DAG.getMergeValues({Res, Chain}, dl);
21787 return Res;
21788 }
21789
21790 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21791 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21792 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21793 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21794 assert(!Subtarget.hasVLX() && "Unexpected features!");
21795 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21796 // Need to concat with zero vector for strict fp to avoid spurious
21797 // exceptions.
21798 // TODO: Should we just do this for non-strict as well?
21799 SDValue Tmp =
21800 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21801 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21802 DAG.getVectorIdxConstant(0, dl));
21803
21804 if (IsStrict) {
21805 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21806 {Chain, Src});
21807 Chain = Res.getValue(1);
21808 } else {
21809 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21810 }
21811
21812 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21813 DAG.getVectorIdxConstant(0, dl));
21814
21815 if (IsStrict)
21816 return DAG.getMergeValues({Res, Chain}, dl);
21817 return Res;
21818 }
21819
21820 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21821 if (!Subtarget.hasVLX()) {
21822       // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
21823       // type legalizer and then widened again by vector op legalization.
21824 if (!IsStrict)
21825 return SDValue();
21826
21827 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21828 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21829 {Src, Zero, Zero, Zero});
21830 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21831 {Chain, Tmp});
21832 SDValue Chain = Tmp.getValue(1);
21833 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21834 DAG.getVectorIdxConstant(0, dl));
21835 return DAG.getMergeValues({Tmp, Chain}, dl);
21836 }
21837
21838 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21839 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21840 DAG.getUNDEF(MVT::v2f32));
21841 if (IsStrict) {
21842 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21843                                  : X86ISD::STRICT_CVTTP2UI;
21844       return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21845 }
21846 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21847 return DAG.getNode(Opc, dl, VT, Tmp);
21848 }
21849
21850 // Generate optimized instructions for pre AVX512 unsigned conversions from
21851 // vXf32 to vXi32.
21852 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21853 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21854 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21855 assert(!IsSigned && "Expected unsigned conversion!");
21856 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21857 }
21858
21859 return SDValue();
21860 }
21861
21862 assert(!VT.isVector());
21863
21864 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21865
21866 if (!IsSigned && UseSSEReg) {
21867 // Conversions from f32/f64 with AVX512 should be legal.
21868 if (Subtarget.hasAVX512())
21869 return Op;
21870
21871 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21872 // behaves on out of range inputs to generate optimized conversions.
21873 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21874 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21875 unsigned DstBits = VT.getScalarSizeInBits();
21876 APInt UIntLimit = APInt::getSignMask(DstBits);
21877 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21878 DAG.getConstant(UIntLimit, dl, VT));
21879 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21880
21881 // Calculate the converted result for values in the range:
21882 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21883 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21884 SDValue Small =
21885 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21886 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21887 SDValue Big = DAG.getNode(
21888 X86ISD::CVTTS2SI, dl, VT,
21889 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21890 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21891
21892 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21893 // and only if the value was out of range. So we can use that
21894       // as our indicator that we'd rather use "Big" instead of "Small".
21895 //
21896 // Use "Small" if "IsOverflown" has all bits cleared
21897 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21898 SDValue IsOverflown = DAG.getNode(
21899 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21900 return DAG.getNode(ISD::OR, dl, VT, Small,
21901 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21902 }
21903
21904 // Use default expansion for i64.
21905 if (VT == MVT::i64)
21906 return SDValue();
21907
21908 assert(VT == MVT::i32 && "Unexpected VT!");
21909
21910 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21911 // FIXME: This does not generate an invalid exception if the input does not
21912 // fit in i32. PR44019
21913 if (Subtarget.is64Bit()) {
21914 if (IsStrict) {
21915 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21916 {Chain, Src});
21917 Chain = Res.getValue(1);
21918 } else
21919 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21920
21921 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21922 if (IsStrict)
21923 return DAG.getMergeValues({Res, Chain}, dl);
21924 return Res;
21925 }
21926
21927 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21928 // use fisttp which will be handled later.
21929 if (!Subtarget.hasSSE3())
21930 return SDValue();
21931 }
21932
21933 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21934 // FIXME: This does not generate an invalid exception if the input does not
21935 // fit in i16. PR44019
21936 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21937 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21938 if (IsStrict) {
21939 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21940 {Chain, Src});
21941 Chain = Res.getValue(1);
21942 } else
21943 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21944
21945 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21946 if (IsStrict)
21947 return DAG.getMergeValues({Res, Chain}, dl);
21948 return Res;
21949 }
21950
21951 // If this is a FP_TO_SINT using SSEReg we're done.
21952 if (UseSSEReg && IsSigned)
21953 return Op;
21954
21955 // fp128 needs to use a libcall.
21956 if (SrcVT == MVT::f128) {
21957 RTLIB::Libcall LC;
21958 if (IsSigned)
21959 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21960 else
21961 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21962
21963 MakeLibCallOptions CallOptions;
21964 std::pair<SDValue, SDValue> Tmp =
21965 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21966
21967 if (IsStrict)
21968 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21969
21970 return Tmp.first;
21971 }
21972
21973 // Fall back to X87.
21974 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21975 if (IsStrict)
21976 return DAG.getMergeValues({V, Chain}, dl);
21977 return V;
21978 }
21979
21980 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21981}
21982
21983SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21984 SelectionDAG &DAG) const {
21985 SDValue Src = Op.getOperand(0);
21986 EVT DstVT = Op.getSimpleValueType();
21987 MVT SrcVT = Src.getSimpleValueType();
21988
21989 if (SrcVT.isVector())
21990 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21991
21992 if (SrcVT == MVT::f16)
21993 return SDValue();
21994
21995 // If the source is in an SSE register, the node is Legal.
21996 if (isScalarFPTypeInSSEReg(SrcVT))
21997 return Op;
21998
21999 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22000}
22001
22002SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22003 SelectionDAG &DAG) const {
22004 EVT DstVT = N->getValueType(0);
22005 SDValue Src = N->getOperand(0);
22006 EVT SrcVT = Src.getValueType();
22007
22008 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22009 // f16 must be promoted before using the lowering in this routine.
22010 // fp128 does not use this lowering.
22011 return SDValue();
22012 }
22013
22014 SDLoc DL(N);
22015 SDValue Chain = DAG.getEntryNode();
22016
22017 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22018
22019 // If we're converting from SSE, the stack slot needs to hold both types.
22020 // Otherwise it only needs to hold the DstVT.
22021 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22022 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22023 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22024 MachinePointerInfo MPI =
22025       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22026
22027 if (UseSSE) {
22028 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22029 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22030 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22031 SDValue Ops[] = { Chain, StackPtr };
22032
22033 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22034 /*Align*/ std::nullopt,
22035                                   MachineMemOperand::MOLoad);
22036     Chain = Src.getValue(1);
22037 }
22038
22039 SDValue StoreOps[] = { Chain, Src, StackPtr };
22040 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22041 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22042                                  MachineMemOperand::MOStore);
22043
22044 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22045}
22046
22047SDValue
22048X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22049 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22050 // but making use of X86 specifics to produce better instruction sequences.
22051 SDNode *Node = Op.getNode();
22052 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22053 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22054 SDLoc dl(SDValue(Node, 0));
22055 SDValue Src = Node->getOperand(0);
22056
22057 // There are three types involved here: SrcVT is the source floating point
22058 // type, DstVT is the type of the result, and TmpVT is the result of the
22059 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22060 // DstVT).
22061 EVT SrcVT = Src.getValueType();
22062 EVT DstVT = Node->getValueType(0);
22063 EVT TmpVT = DstVT;
22064
22065 // This code is only for floats and doubles. Fall back to generic code for
22066 // anything else.
22067 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22068 return SDValue();
22069
22070 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22071 unsigned SatWidth = SatVT.getScalarSizeInBits();
22072 unsigned DstWidth = DstVT.getScalarSizeInBits();
22073 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22074 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22075 "Expected saturation width smaller than result width");
22076
22077 // Promote result of FP_TO_*INT to at least 32 bits.
22078 if (TmpWidth < 32) {
22079 TmpVT = MVT::i32;
22080 TmpWidth = 32;
22081 }
22082
22083 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22084 // us to use a native signed conversion instead.
22085 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22086 TmpVT = MVT::i64;
22087 TmpWidth = 64;
22088 }
22089
22090 // If the saturation width is smaller than the size of the temporary result,
22091 // we can always use signed conversion, which is native.
22092 if (SatWidth < TmpWidth)
22093 FpToIntOpcode = ISD::FP_TO_SINT;
22094
22095 // Determine minimum and maximum integer values and their corresponding
22096 // floating-point values.
22097 APInt MinInt, MaxInt;
22098 if (IsSigned) {
22099 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22100 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22101 } else {
22102 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22103 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22104 }
22105
22106 const fltSemantics &Sem = SrcVT.getFltSemantics();
22107 APFloat MinFloat(Sem);
22108 APFloat MaxFloat(Sem);
22109
22110 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22111 MinInt, IsSigned, APFloat::rmTowardZero);
22112 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22113 MaxInt, IsSigned, APFloat::rmTowardZero);
22114 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22115 && !(MaxStatus & APFloat::opStatus::opInexact);
22116
22117 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22118 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22119
22120 // If the integer bounds are exactly representable as floats, emit a
22121 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22122 if (AreExactFloatBounds) {
22123 if (DstVT != TmpVT) {
22124 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22125 SDValue MinClamped = DAG.getNode(
22126 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22127 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22128 SDValue BothClamped = DAG.getNode(
22129 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22130 // Convert clamped value to integer.
22131 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22132
22133 // NaN will become INDVAL, with the top bit set and the rest zero.
22134 // Truncation will discard the top bit, resulting in zero.
22135 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22136 }
22137
22138 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22139 SDValue MinClamped = DAG.getNode(
22140 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22141 // Clamp by MaxFloat from above. NaN cannot occur.
22142 SDValue BothClamped = DAG.getNode(
22143 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22144 // Convert clamped value to integer.
22145 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22146
22147 if (!IsSigned) {
22148 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22149 // which is zero.
22150 return FpToInt;
22151 }
22152
22153 // Otherwise, select zero if Src is NaN.
22154 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22155 return DAG.getSelectCC(
22156 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22157 }
22158
22159 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22160 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22161
22162 // Result of direct conversion, which may be selected away.
22163 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22164
22165 if (DstVT != TmpVT) {
22166 // NaN will become INDVAL, with the top bit set and the rest zero.
22167 // Truncation will discard the top bit, resulting in zero.
22168 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22169 }
22170
22171 SDValue Select = FpToInt;
22172 // For signed conversions where we saturate to the same size as the
22173 // result type of the fptoi instructions, INDVAL coincides with integer
22174 // minimum, so we don't need to explicitly check it.
22175 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22176 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22177 // MinInt if Src is NaN.
22178 Select = DAG.getSelectCC(
22179 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22180 }
22181
22182 // If Src OGT MaxFloat, select MaxInt.
22183 Select = DAG.getSelectCC(
22184 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22185
22186 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22187 // is already zero. The promoted case was already handled above.
22188 if (!IsSigned || DstVT != TmpVT) {
22189 return Select;
22190 }
22191
22192 // Otherwise, select 0 if Src is NaN.
22193 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22194 return DAG.getSelectCC(
22195 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22196}
22197
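// --- Illustrative aside (editor sketch, not part of this file) ------------
// Scalar model of the "exact float bounds" path above for an f64 -> i32
// FP_TO_SINT_SAT: both i32 bounds are exactly representable as doubles, so
// clamp first, convert with the native truncating conversion, and map NaN to
// zero (the final SETUO select). The name is invented.
#include <cmath>
#include <cstdint>

static int32_t fpToSint32Sat(double X) {
  const double MinFloat = -2147483648.0;    // INT32_MIN, exact in double
  const double MaxFloat = 2147483647.0;     // INT32_MAX, exact in double
  if (std::isnan(X))
    return 0;
  double Clamped = X < MinFloat ? MinFloat : (X > MaxFloat ? MaxFloat : X);
  return (int32_t)Clamped;                  // in range by construction, no UB
}
// --- end of aside ----------------------------------------------------------
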
22198SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22199 bool IsStrict = Op->isStrictFPOpcode();
22200
22201 SDLoc DL(Op);
22202 MVT VT = Op.getSimpleValueType();
22203 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22204 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22205 MVT SVT = In.getSimpleValueType();
22206
22207 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22208 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22209 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22210 !Subtarget.getTargetTriple().isOSDarwin()))
22211 return SDValue();
22212
22213 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22214 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22215 return Op;
22216
22217 if (SVT == MVT::f16) {
22218 if (Subtarget.hasFP16())
22219 return Op;
22220
22221 if (VT != MVT::f32) {
22222 if (IsStrict)
22223 return DAG.getNode(
22224 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22225 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22226 {MVT::f32, MVT::Other}, {Chain, In})});
22227
22228 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22229 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22230 }
22231
22232 if (!Subtarget.hasF16C()) {
22233 if (!Subtarget.getTargetTriple().isOSDarwin())
22234 return SDValue();
22235
22236 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22237
22238 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22239 TargetLowering::CallLoweringInfo CLI(DAG);
22240 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22241
22242 In = DAG.getBitcast(MVT::i16, In);
22243       TargetLowering::ArgListTy Args;
22244       TargetLowering::ArgListEntry Entry(
22245 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22246 Entry.IsSExt = false;
22247 Entry.IsZExt = true;
22248 Args.push_back(Entry);
22249
22250       SDValue Callee = DAG.getExternalSymbol(
22251           getLibcallName(RTLIB::FPEXT_F16_F32),
22252           getPointerTy(DAG.getDataLayout()));
22253       CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22254 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22255 std::move(Args));
22256
22257 SDValue Res;
22258 std::tie(Res,Chain) = LowerCallTo(CLI);
22259 if (IsStrict)
22260 Res = DAG.getMergeValues({Res, Chain}, DL);
22261
22262 return Res;
22263 }
22264
22265 In = DAG.getBitcast(MVT::i16, In);
22266 SDValue Res;
22267 if (IsStrict) {
22268 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22269 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22270 DAG.getVectorIdxConstant(0, DL));
22271 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22272 {Chain, In});
22273 Chain = Res.getValue(1);
22274 } else {
22275 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22276 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22277 DAG.getUNDEF(MVT::v4i32), In,
22278 DAG.getVectorIdxConstant(0, DL));
22279 In = DAG.getBitcast(MVT::v8i16, In);
22280 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22281 DAG.getTargetConstant(4, DL, MVT::i32));
22282 }
22283 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22284 DAG.getVectorIdxConstant(0, DL));
22285 if (IsStrict)
22286 return DAG.getMergeValues({Res, Chain}, DL);
22287 return Res;
22288 }
22289
22290 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22291 return Op;
22292
22293 if (SVT.getVectorElementType() == MVT::f16) {
22294 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22295 return Op;
22296 assert(Subtarget.hasF16C() && "Unexpected features!");
22297 if (SVT == MVT::v2f16)
22298 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22299 DAG.getUNDEF(MVT::v2f16));
22300 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22301 DAG.getUNDEF(MVT::v4f16));
22302 if (IsStrict)
22303 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22304 {Op->getOperand(0), Res});
22305 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22306 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22307 return Op;
22308 }
22309
22310 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22311
22312 SDValue Res =
22313 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22314 if (IsStrict)
22315 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22316 {Op->getOperand(0), Res});
22317 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22318}
22319
22320SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22321 bool IsStrict = Op->isStrictFPOpcode();
22322
22323 SDLoc DL(Op);
22324 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22325 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22326 MVT VT = Op.getSimpleValueType();
22327 MVT SVT = In.getSimpleValueType();
22328
22329 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22330 return SDValue();
22331
22332 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22333 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22334 if (!Subtarget.getTargetTriple().isOSDarwin())
22335 return SDValue();
22336
22337 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22338 TargetLowering::CallLoweringInfo CLI(DAG);
22339 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22340
22341     TargetLowering::ArgListTy Args;
22342     TargetLowering::ArgListEntry Entry(
22343 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22344 Entry.IsSExt = false;
22345 Entry.IsZExt = true;
22346 Args.push_back(Entry);
22347
22348     SDValue Callee = DAG.getExternalSymbol(
22349         getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22350                                        : RTLIB::FPROUND_F32_F16),
22351         getPointerTy(DAG.getDataLayout()));
22352     CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22353 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22354 std::move(Args));
22355
22356 SDValue Res;
22357 std::tie(Res, Chain) = LowerCallTo(CLI);
22358
22359 Res = DAG.getBitcast(MVT::f16, Res);
22360
22361 if (IsStrict)
22362 Res = DAG.getMergeValues({Res, Chain}, DL);
22363
22364 return Res;
22365 }
22366
22367 if (VT.getScalarType() == MVT::bf16) {
22368 if (SVT.getScalarType() == MVT::f32 &&
22369 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22370 Subtarget.hasAVXNECONVERT()))
22371 return Op;
22372 return SDValue();
22373 }
22374
22375 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22376 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22377 return SDValue();
22378
22379 if (VT.isVector())
22380 return Op;
22381
22382 SDValue Res;
22383     SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22384                                         MVT::i32);
22385 if (IsStrict) {
22386 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22387 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22388 DAG.getVectorIdxConstant(0, DL));
22389 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22390 {Chain, Res, Rnd});
22391 Chain = Res.getValue(1);
22392 } else {
22393 // FIXME: Should we use zeros for upper elements for non-strict?
22394 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22395 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22396 }
22397
22398 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22399 DAG.getVectorIdxConstant(0, DL));
22400 Res = DAG.getBitcast(MVT::f16, Res);
22401
22402 if (IsStrict)
22403 return DAG.getMergeValues({Res, Chain}, DL);
22404
22405 return Res;
22406 }
22407
22408 return Op;
22409}
22410
22411 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22412   bool IsStrict = Op->isStrictFPOpcode();
22413 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22414 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22415 "Unexpected VT!");
22416
22417 SDLoc dl(Op);
22418 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22419 DAG.getConstant(0, dl, MVT::v8i16), Src,
22420 DAG.getVectorIdxConstant(0, dl));
22421
22422 SDValue Chain;
22423 if (IsStrict) {
22424 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22425 {Op.getOperand(0), Res});
22426 Chain = Res.getValue(1);
22427 } else {
22428 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22429 }
22430
22431 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22432 DAG.getVectorIdxConstant(0, dl));
22433
22434 if (IsStrict)
22435 return DAG.getMergeValues({Res, Chain}, dl);
22436
22437 return Res;
22438}
22439
22440 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22441   bool IsStrict = Op->isStrictFPOpcode();
22442 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22443 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22444 "Unexpected VT!");
22445
22446 SDLoc dl(Op);
22447 SDValue Res, Chain;
22448 if (IsStrict) {
22449 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22450 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22451 DAG.getVectorIdxConstant(0, dl));
22452 Res = DAG.getNode(
22453 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22454 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22455 Chain = Res.getValue(1);
22456 } else {
22457 // FIXME: Should we use zeros for upper elements for non-strict?
22458 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22459 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22460 DAG.getTargetConstant(4, dl, MVT::i32));
22461 }
22462
22463 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22464 DAG.getVectorIdxConstant(0, dl));
22465
22466 if (IsStrict)
22467 return DAG.getMergeValues({Res, Chain}, dl);
22468
22469 return Res;
22470}
22471
22472SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22473 SelectionDAG &DAG) const {
22474 SDLoc DL(Op);
22475
22476 MVT SVT = Op.getOperand(0).getSimpleValueType();
22477 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22478 Subtarget.hasAVXNECONVERT())) {
22479 SDValue Res;
22480 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22481 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22482 Res = DAG.getBitcast(MVT::v8i16, Res);
22483 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22484 DAG.getVectorIdxConstant(0, DL));
22485 }
22486
22487 MakeLibCallOptions CallOptions;
22488 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22489 SDValue Res =
22490 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22491 return DAG.getBitcast(MVT::i16, Res);
22492}
22493
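// --- Illustrative aside (editor sketch, not part of this file) ------------
// bf16 is the top 16 bits of an f32. Scalar model of the round-to-nearest-
// even conversion that CVTNEPS2BF16 performs (NaN handling omitted for
// brevity); the name is invented.
#include <cstdint>
#include <cstring>

static uint16_t f32ToBf16RNE(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Bias = 0x7FFF + ((Bits >> 16) & 1); // round to nearest, ties to even
  return (uint16_t)((Bits + Bias) >> 16);
}
// --- end of aside ----------------------------------------------------------
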
22494/// Depending on uarch and/or optimizing for size, we might prefer to use a
22495/// vector operation in place of the typical scalar operation.
22496 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22497                                          SelectionDAG &DAG,
22498 const X86Subtarget &Subtarget) {
22499 // If both operands have other uses, this is probably not profitable.
22500 SDValue LHS = Op.getOperand(0);
22501 SDValue RHS = Op.getOperand(1);
22502 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22503 return Op;
22504
22505 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22506 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22507 if (IsFP && !Subtarget.hasSSE3())
22508 return Op;
22509 if (!IsFP && !Subtarget.hasSSSE3())
22510 return Op;
22511
22512 // Extract from a common vector.
22513 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22514 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22515 LHS.getOperand(0) != RHS.getOperand(0) ||
22516 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22517 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22518 !shouldUseHorizontalOp(true, DAG, Subtarget))
22519 return Op;
22520
22521 // Allow commuted 'hadd' ops.
22522 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22523 unsigned HOpcode;
22524 switch (Op.getOpcode()) {
22525 // clang-format off
22526 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22527 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22528 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22529 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22530 default:
22531 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22532 // clang-format on
22533 }
22534 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22535 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22536 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22537 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22538 std::swap(LExtIndex, RExtIndex);
22539
22540 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22541 return Op;
22542
22543 SDValue X = LHS.getOperand(0);
22544 EVT VecVT = X.getValueType();
22545 unsigned BitWidth = VecVT.getSizeInBits();
22546 unsigned NumLanes = BitWidth / 128;
22547 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22548 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22549 "Not expecting illegal vector widths here");
22550
22551 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22552 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22553 if (BitWidth == 256 || BitWidth == 512) {
22554 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22555 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22556 LExtIndex %= NumEltsPerLane;
22557 }
22558
22559 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22560 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22561 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22562 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22563 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22564 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22565 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22566}
22567
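// --- Illustrative aside (editor sketch, not part of this file) ------------
// Scalar model of HADDPS: each result element is the sum of an adjacent
// pair, so extract(X,0) + extract(X,1) equals element 0 of haddps(X, X),
// which is exactly the rewrite performed above. The name is invented.
#include <array>

static std::array<float, 4> haddps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// --- end of aside ----------------------------------------------------------
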
22568/// Depending on uarch and/or optimizing for size, we might prefer to use a
22569/// vector operation in place of the typical scalar operation.
22570SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22571 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22572 "Only expecting float/double");
22573 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22574}
22575
22576/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22577/// This mode isn't supported in hardware on X86. But as long as we aren't
22578/// compiling with trapping math, we can emulate this with
22579/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22580 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22581   SDValue N0 = Op.getOperand(0);
22582 SDLoc dl(Op);
22583 MVT VT = Op.getSimpleValueType();
22584
22585 // N0 += copysign(nextafter(0.5, 0.0), N0)
22586 const fltSemantics &Sem = VT.getFltSemantics();
22587 bool Ignored;
22588 APFloat Point5Pred = APFloat(0.5f);
22589 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22590 Point5Pred.next(/*nextDown*/true);
22591
22592 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22593 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22594 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22595
22596 // Truncate the result to remove fraction.
22597 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22598}
22599
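// --- Illustrative aside (editor sketch, not part of this file) ------------
// Scalar model of the FROUND emulation above: round-half-away-from-zero via
// trunc(X + copysign(nextafter(0.5, 0.0), X)). Assumes non-trapping math;
// the name is invented.
#include <cmath>

static float roundHalfAwayFromZero(float X) {
  float Adder = std::copysign(std::nextafterf(0.5f, 0.0f), X);
  return std::truncf(X + Adder);            // e.g. 0.5f -> 1.0f, -2.5f -> -3.0f
}
// --- end of aside ----------------------------------------------------------
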
22600/// The only differences between FABS and FNEG are the mask and the logic op.
22601/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22602 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22603   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22604 "Wrong opcode for lowering FABS or FNEG.");
22605
22606 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22607
22608 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22609 // into an FNABS. We'll lower the FABS after that if it is still in use.
22610 if (IsFABS)
22611 for (SDNode *User : Op->users())
22612 if (User->getOpcode() == ISD::FNEG)
22613 return Op;
22614
22615 SDLoc dl(Op);
22616 MVT VT = Op.getSimpleValueType();
22617
22618 bool IsF128 = (VT == MVT::f128);
22619 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22620          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22621          "Unexpected type in LowerFABSorFNEG");
22622
22623 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22624 // decide if we should generate a 16-byte constant mask when we only need 4 or
22625 // 8 bytes for the scalar case.
22626
22627 // There are no scalar bitwise logical SSE/AVX instructions, so we
22628 // generate a 16-byte vector constant and logic op even for the scalar case.
22629 // Using a 16-byte mask allows folding the load of the mask with
22630 // the logic op, so it can save (~4 bytes) on code size.
22631 bool IsFakeVector = !VT.isVector() && !IsF128;
22632 MVT LogicVT = VT;
22633 if (IsFakeVector)
22634 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22635 : (VT == MVT::f32) ? MVT::v4f32
22636 : MVT::v8f16;
22637
22638 unsigned EltBits = VT.getScalarSizeInBits();
22639 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22640 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22641 APInt::getSignMask(EltBits);
22642 const fltSemantics &Sem = VT.getFltSemantics();
22643 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22644
22645 SDValue Op0 = Op.getOperand(0);
22646 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22647 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22648 IsFNABS ? X86ISD::FOR :
22649                                            X86ISD::FXOR;
22650   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22651
22652 if (VT.isVector() || IsF128)
22653 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22654
22655 // For the scalar case extend to a 128-bit vector, perform the logic op,
22656 // and extract the scalar result back out.
22657 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22658 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22659 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22660 DAG.getVectorIdxConstant(0, dl));
22661}
22662
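// --- Illustrative aside (editor sketch, not part of this file) ------------
// Bit-level scalar model of the masks built by LowerFABSorFNEG: FABS clears
// the sign bit (AND 0x7f...f), FNEG flips it (XOR 0x80...0), and the folded
// FNEG(FABS(x)) sets it (OR). Names are invented.
#include <cstdint>
#include <cstring>

static float fabsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7FFFFFFFu;                      // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}
static float fnegViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;                      // APInt::getSignMask(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}
// --- end of aside ----------------------------------------------------------
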
22663 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22664   SDValue Mag = Op.getOperand(0);
22665 SDValue Sign = Op.getOperand(1);
22666 SDLoc dl(Op);
22667
22668 // If the sign operand is smaller, extend it first.
22669 MVT VT = Op.getSimpleValueType();
22670 if (Sign.getSimpleValueType().bitsLT(VT))
22671 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22672
22673 // And if it is bigger, shrink it first.
22674 if (Sign.getSimpleValueType().bitsGT(VT))
22675 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22676 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22677
22678 // At this point the operands and the result should have the same
22679 // type, and that won't be f80 since that is not custom lowered.
22680 bool IsF128 = (VT == MVT::f128);
22681 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22682          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22683          "Unexpected type in LowerFCOPYSIGN");
22684
22685 const fltSemantics &Sem = VT.getFltSemantics();
22686
22687 // Perform all scalar logic operations as 16-byte vectors because there are no
22688 // scalar FP logic instructions in SSE.
22689 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22690 // unnecessary splats, but we might miss load folding opportunities. Should
22691 // this decision be based on OptimizeForSize?
22692 bool IsFakeVector = !VT.isVector() && !IsF128;
22693 MVT LogicVT = VT;
22694 if (IsFakeVector)
22695 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22696 : (VT == MVT::f32) ? MVT::v4f32
22697 : MVT::v8f16;
22698
22699 // The mask constants are automatically splatted for vector types.
22700 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22701 SDValue SignMask = DAG.getConstantFP(
22702 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22703 SDValue MagMask = DAG.getConstantFP(
22704 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22705
22706 // First, clear all bits but the sign bit from the second operand (sign).
22707 if (IsFakeVector)
22708 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22709 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22710
22711 // Next, clear the sign bit from the first operand (magnitude).
22712 // TODO: If we had general constant folding for FP logic ops, this check
22713 // wouldn't be necessary.
22714 SDValue MagBits;
22715 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22716 APFloat APF = Op0CN->getValueAPF();
22717 APF.clearSign();
22718 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22719 } else {
22720 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22721 if (IsFakeVector)
22722 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22723 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22724 }
22725
22726 // OR the magnitude value with the sign bit.
22727 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22728 return !IsFakeVector ? Or
22729 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22730 DAG.getVectorIdxConstant(0, dl));
22731}
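// Rough shape of the result for scalar f64 copysign(mag, sgn) on SSE2:
//   andpd sgn, <0x8000000000000000 splat>   ; keep only the sign bit
//   andpd mag, <0x7FFFFFFFFFFFFFFF splat>   ; drop the sign bit
//   orpd  mag, sgn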
22732
22733 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22734 SDValue N0 = Op.getOperand(0);
22735 SDLoc dl(Op);
22736 MVT VT = Op.getSimpleValueType();
22737
22738 MVT OpVT = N0.getSimpleValueType();
22739 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22740 "Unexpected type for FGETSIGN");
22741
22742 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22743 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22744 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22745 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22746 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22747 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22748 return Res;
22749}
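// E.g. FGETSIGN of an f32 becomes "and (movmskps (scalar_to_vector x)), 1":
// MOVMSK gathers the per-lane sign bits into a GPR and the AND keeps only
// bit 0, the sign of the original scalar.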
22750
22751/// Helper for attempting to create a X86ISD::BT node.
22752static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22753 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22754 // instruction. Since the shift amount is in-range-or-undefined, we know
22755 // that doing a bittest on the i32 value is ok. We extend to i32 because
22756 // the encoding for the i16 version is larger than the i32 version.
22757 // Also promote i16 to i32 for performance / code size reasons.
22758 if (Src.getValueType().getScalarSizeInBits() < 32)
22759 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22760
22761 // No legal type found, give up.
22762 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22763 return SDValue();
22764
22765 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22766 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22767 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22768 // known to be zero.
22769 if (Src.getValueType() == MVT::i64 &&
22770 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22771 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22772
22773 // If the operand types disagree, extend the shift amount to match. Since
22774 // BT ignores high bits (like shifts) we can use anyextend.
22775 if (Src.getValueType() != BitNo.getValueType()) {
22776 // Peek through a mask/modulo operation.
22777 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22778 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22779 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22780 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22781 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22782 BitNo.getOperand(0)),
22783 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22784 BitNo.getOperand(1)));
22785 else
22786 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22787 }
22788
22789 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22790}
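// Example of what this enables: "(x >> n) & 1" tested against zero can be
// selected as "bt x, n" followed by a setb/setae of the carry flag, instead
// of a shift, an and and a test.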
22791
22792/// Helper for creating a X86ISD::SETCC node.
22793 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22794 SelectionDAG &DAG) {
22795 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22796 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22797}
22798
22799/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22800/// recognizable memcmp expansion.
22801static bool isOrXorXorTree(SDValue X, bool Root = true) {
22802 if (X.getOpcode() == ISD::OR)
22803 return isOrXorXorTree(X.getOperand(0), false) &&
22804 isOrXorXorTree(X.getOperand(1), false);
22805 if (Root)
22806 return false;
22807 return X.getOpcode() == ISD::XOR;
22808}
22809
22810/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22811/// expansion.
22812template <typename F>
22813 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22814 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22815 SDValue Op0 = X.getOperand(0);
22816 SDValue Op1 = X.getOperand(1);
22817 if (X.getOpcode() == ISD::OR) {
22818 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22819 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22820 if (VecVT != CmpVT)
22821 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22822 if (HasPT)
22823 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22824 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22825 }
22826 if (X.getOpcode() == ISD::XOR) {
22827 SDValue A = SToV(Op0);
22828 SDValue B = SToV(Op1);
22829 if (VecVT != CmpVT)
22830 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22831 if (HasPT)
22832 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22833 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22834 }
22835 llvm_unreachable("Impossible");
22836}
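// For a 32-byte memcmp-against-zero the tree looks like
//   or (xor A, B), (xor C, D)
// and each XOR leaf becomes either a plain vector XOR (when PTEST is
// available) or a vector compare, with the ORs combining the per-pair results
// before a single PTEST/MOVMSK/KORTEST at the root.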
22837
22838/// Try to map a 128-bit or larger integer comparison to vector instructions
22839/// before type legalization splits it up into chunks.
22840 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22841 ISD::CondCode CC,
22842 const SDLoc &DL,
22843 SelectionDAG &DAG,
22844 const X86Subtarget &Subtarget) {
22845 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22846
22847 // We're looking for an oversized integer equality comparison.
22848 EVT OpVT = X.getValueType();
22849 unsigned OpSize = OpVT.getSizeInBits();
22850 if (!OpVT.isScalarInteger() || OpSize < 128)
22851 return SDValue();
22852
22853 // Ignore a comparison with zero because that gets special treatment in
22854 // EmitTest(). But make an exception for the special case of a pair of
22855 // logically-combined vector-sized operands compared to zero. This pattern may
22856 // be generated by the memcmp expansion pass with oversized integer compares
22857 // (see PR33325).
22858 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22859 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22860 return SDValue();
22861
22862 // Don't perform this combine if constructing the vector will be expensive.
22863 auto IsVectorBitCastCheap = [](SDValue X) {
22864 X = peekThroughBitcasts(X);
22865 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22866 X.getOpcode() == ISD::LOAD;
22867 };
22868 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22869 !IsOrXorXorTreeCCZero)
22870 return SDValue();
22871
22872 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22873 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22874 // Otherwise use PCMPEQ (plus AND) and mask testing.
22875 bool NoImplicitFloatOps =
22876 DAG.getMachineFunction().getFunction().hasFnAttribute(
22877 Attribute::NoImplicitFloat);
22878 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22879 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22880 (OpSize == 256 && Subtarget.hasAVX()) ||
22881 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22882 bool HasPT = Subtarget.hasSSE41();
22883
22884 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22885 // vector registers are essentially free. (Technically, widening registers
22886 // prevents load folding, but the tradeoff is worth it.)
22887 bool PreferKOT = Subtarget.preferMaskRegisters();
22888 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22889
22890 EVT VecVT = MVT::v16i8;
22891 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22892 if (OpSize == 256) {
22893 VecVT = MVT::v32i8;
22894 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22895 }
22896 EVT CastVT = VecVT;
22897 bool NeedsAVX512FCast = false;
22898 if (OpSize == 512 || NeedZExt) {
22899 if (Subtarget.hasBWI()) {
22900 VecVT = MVT::v64i8;
22901 CmpVT = MVT::v64i1;
22902 if (OpSize == 512)
22903 CastVT = VecVT;
22904 } else {
22905 VecVT = MVT::v16i32;
22906 CmpVT = MVT::v16i1;
22907 CastVT = OpSize == 512 ? VecVT
22908 : OpSize == 256 ? MVT::v8i32
22909 : MVT::v4i32;
22910 NeedsAVX512FCast = true;
22911 }
22912 }
22913
22914 auto ScalarToVector = [&](SDValue X) -> SDValue {
22915 bool TmpZext = false;
22916 EVT TmpCastVT = CastVT;
22917 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22918 SDValue OrigX = X.getOperand(0);
22919 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22920 if (OrigSize < OpSize) {
22921 if (OrigSize == 128) {
22922 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22923 X = OrigX;
22924 TmpZext = true;
22925 } else if (OrigSize == 256) {
22926 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22927 X = OrigX;
22928 TmpZext = true;
22929 }
22930 }
22931 }
22932 X = DAG.getBitcast(TmpCastVT, X);
22933 if (!NeedZExt && !TmpZext)
22934 return X;
22935 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22936 DAG.getConstant(0, DL, VecVT), X,
22937 DAG.getVectorIdxConstant(0, DL));
22938 };
22939
22940 SDValue Cmp;
22941 if (IsOrXorXorTreeCCZero) {
22942 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22943 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22944 // Use 2 vector equality compares and 'and' the results before doing a
22945 // MOVMSK.
22946 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22947 } else {
22948 SDValue VecX = ScalarToVector(X);
22949 SDValue VecY = ScalarToVector(Y);
22950 if (VecVT != CmpVT) {
22951 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22952 } else if (HasPT) {
22953 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22954 } else {
22955 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22956 }
22957 }
22958 // AVX512 should emit a setcc that will lower to kortest.
22959 if (VecVT != CmpVT) {
22960 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22961 : CmpVT == MVT::v32i1 ? MVT::i32
22962 : MVT::i16;
22963 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22964 DAG.getConstant(0, DL, KRegVT), CC);
22965 }
22966 if (HasPT) {
22967 SDValue BCCmp =
22968 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22969 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22970 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22971 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22972 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22973 }
22974 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22975 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22976 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22977 assert(Cmp.getValueType() == MVT::v16i8 &&
22978 "Non 128-bit vector on pre-SSE41 target");
22979 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22980 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22981 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22982 }
22983
22984 return SDValue();
22985}
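// Rough code shapes produced by this combine for "i128 a == b":
//   SSE2:             pcmpeqb x, y; pmovmskb eax, x; cmp eax, 0xFFFF; sete al
//   SSE4.1:           pxor x, y; ptest x, x; sete al
//   AVX-512 (no BWI): vpcmpneqd k0, x, y; kortestw k0, k0; sete al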
22986
22987/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22988/// style scalarized (associative) reduction patterns. Partial reductions
22989/// are supported when the pointer SrcMask is non-null.
22990/// TODO - move this to SelectionDAG?
22991 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22992 SmallVectorImpl<SDValue> &SrcOps,
22993 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22994 SmallVector<SDValue, 8> Opnds;
22995 DenseMap<SDValue, APInt> SrcOpMap;
22996 EVT VT = MVT::Other;
22997
22998 // Recognize a special case where a vector is casted into wide integer to
22999 // test all 0s.
23000 assert(Op.getOpcode() == unsigned(BinOp) &&
23001 "Unexpected bit reduction opcode");
23002 Opnds.push_back(Op.getOperand(0));
23003 Opnds.push_back(Op.getOperand(1));
23004
23005 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23006 SDValue I = Opnds[Slot];
23007 // BFS traverse all BinOp operands.
23008 if (I->getOpcode() == unsigned(BinOp)) {
23009 Opnds.push_back(I->getOperand(0));
23010 Opnds.push_back(I->getOperand(1));
23011 // Re-evaluate the number of nodes to be traversed.
23012 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23013 continue;
23014 }
23015
23016 // Quit if this is not an EXTRACT_VECTOR_ELT.
23017 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23018 return false;
23019
23020 // Quit if the index is not a constant.
23021 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23022 if (!Idx)
23023 return false;
23024
23025 SDValue Src = I->getOperand(0);
23026 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23027 if (M == SrcOpMap.end()) {
23028 VT = Src.getValueType();
23029 // Quit if not the same type.
23030 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23031 return false;
23032 unsigned NumElts = VT.getVectorNumElements();
23033 APInt EltCount = APInt::getZero(NumElts);
23034 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23035 SrcOps.push_back(Src);
23036 }
23037
23038 // Quit if element already used.
23039 unsigned CIdx = Idx->getZExtValue();
23040 if (M->second[CIdx])
23041 return false;
23042 M->second.setBit(CIdx);
23043 }
23044
23045 if (SrcMask) {
23046 // Collect the source partial masks.
23047 for (SDValue &SrcOp : SrcOps)
23048 SrcMask->push_back(SrcOpMap[SrcOp]);
23049 } else {
23050 // Quit if not all elements are used.
23051 for (const auto &I : SrcOpMap)
23052 if (!I.second.isAllOnes())
23053 return false;
23054 }
23055
23056 return true;
23057}
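// Example of a pattern accepted here, an OR "anyof" reduction of v4i32 %v:
//   or (or (extractelt %v, 0), (extractelt %v, 1)),
//      (or (extractelt %v, 2), (extractelt %v, 3))
// All four lanes of %v are used exactly once, so SrcOps == {%v} and the
// optional partial mask for %v is all-ones.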
23058
23059// Helper function for comparing all bits of two vectors.
23060 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23061 ISD::CondCode CC, const APInt &OriginalMask,
23062 const X86Subtarget &Subtarget,
23063 SelectionDAG &DAG, X86::CondCode &X86CC) {
23064 EVT VT = LHS.getValueType();
23065 unsigned ScalarSize = VT.getScalarSizeInBits();
23066 if (OriginalMask.getBitWidth() != ScalarSize) {
23067 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23068 return SDValue();
23069 }
23070
23071 // Quit if not convertible to a legal scalar or 128/256-bit vector.
23073 return SDValue();
23074
23075 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23076 if (VT.isFloatingPoint())
23077 return SDValue();
23078
23079 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23080 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23081
23082 APInt Mask = OriginalMask;
23083
23084 auto MaskBits = [&](SDValue Src) {
23085 if (Mask.isAllOnes())
23086 return Src;
23087 EVT SrcVT = Src.getValueType();
23088 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23089 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23090 };
23091
23092 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23093 if (VT.getSizeInBits() < 128) {
23094 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23095 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23096 if (IntVT != MVT::i64)
23097 return SDValue();
23098 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23099 MVT::i32, MVT::i32);
23100 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23101 MVT::i32, MVT::i32);
23102 SDValue Lo =
23103 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23104 SDValue Hi =
23105 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23106 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23107 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23108 DAG.getConstant(0, DL, MVT::i32));
23109 }
23110 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23111 DAG.getBitcast(IntVT, MaskBits(LHS)),
23112 DAG.getBitcast(IntVT, MaskBits(RHS)));
23113 }
23114
23115 // Without PTEST, a masked v2i64 or-reduction is not faster than
23116 // scalarization.
23117 bool UseKORTEST = Subtarget.useAVX512Regs();
23118 bool UsePTEST = Subtarget.hasSSE41();
23119 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23120 return SDValue();
23121
23122 // Split down to 128/256/512-bit vector.
23123 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23124
23125 // If the input vector has vector elements wider than the target test size,
23126 // then cast to <X x i64> so it will safely split.
23127 if (ScalarSize > TestSize) {
23128 if (!Mask.isAllOnes())
23129 return SDValue();
23130 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23131 LHS = DAG.getBitcast(VT, LHS);
23132 RHS = DAG.getBitcast(VT, RHS);
23133 Mask = APInt::getAllOnes(64);
23134 }
23135
23136 if (VT.getSizeInBits() > TestSize) {
23137 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23138 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23139 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23140 while (VT.getSizeInBits() > TestSize) {
23141 auto Split = DAG.SplitVector(LHS, DL);
23142 VT = Split.first.getValueType();
23143 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23144 }
23145 RHS = DAG.getAllOnesConstant(DL, VT);
23146 } else if (!UsePTEST && !KnownRHS.isZero()) {
23147 // MOVMSK Special Case:
23148 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23149 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23150 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23151 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23152 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23153 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23154 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23155 V = DAG.getSExtOrTrunc(V, DL, VT);
23156 while (VT.getSizeInBits() > TestSize) {
23157 auto Split = DAG.SplitVector(V, DL);
23158 VT = Split.first.getValueType();
23159 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23160 }
23161 V = DAG.getNOT(DL, V, VT);
23162 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23163 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23164 DAG.getConstant(0, DL, MVT::i32));
23165 } else {
23166 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23167 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23168 while (VT.getSizeInBits() > TestSize) {
23169 auto Split = DAG.SplitVector(V, DL);
23170 VT = Split.first.getValueType();
23171 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23172 }
23173 LHS = V;
23174 RHS = DAG.getConstant(0, DL, VT);
23175 }
23176 }
23177
23178 if (UseKORTEST && VT.is512BitVector()) {
23179 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23180 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23181 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23182 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23183 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23184 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23185 }
23186
23187 if (UsePTEST) {
23188 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23189 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23190 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23191 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23192 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23193 }
23194
23195 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23196 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23197 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23198 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23199 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23200 V = DAG.getNOT(DL, V, MaskVT);
23201 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23202 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23203 DAG.getConstant(0, DL, MVT::i32));
23204}
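// Summary of the strategies above for a full-width (all-ones mask) compare:
//   SSE2:    pcmpeqX, invert, pmovmskb, "cmp msk, 0"  (equal <=> msk == 0)
//   SSE4.1:  pxor + "ptest v, v"                      (ZF set <=> equal)
//   AVX-512: vpcmpneq + kortest                       (ZF set <=> equal)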
23205
23206 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23207// to CMP(MOVMSK(PCMPEQB(X,Y))).
23208 static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23209 ISD::CondCode CC, const SDLoc &DL,
23210 const X86Subtarget &Subtarget,
23211 SelectionDAG &DAG,
23212 X86::CondCode &X86CC) {
23213 SDValue Op = OrigLHS;
23214
23215 bool CmpNull;
23216 APInt Mask;
23217 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23218 CmpNull = isNullConstant(OrigRHS);
23219 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23220 return SDValue();
23221
23222 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23223 return SDValue();
23224
23225 // Check whether we're masking/truncating an OR-reduction result, in which
23226 // case track the masked bits.
23227 // TODO: Add CmpAllOnes support.
23228 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23229 if (CmpNull) {
23230 switch (Op.getOpcode()) {
23231 case ISD::TRUNCATE: {
23232 SDValue Src = Op.getOperand(0);
23233 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23234 Op.getScalarValueSizeInBits());
23235 Op = Src;
23236 break;
23237 }
23238 case ISD::AND: {
23239 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23240 Mask = Cst->getAPIntValue();
23241 Op = Op.getOperand(0);
23242 }
23243 break;
23244 }
23245 }
23246 }
23247 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23248 CC = ISD::SETEQ;
23249 CmpNull = true;
23250 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23251 } else {
23252 return SDValue();
23253 }
23254
23255 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23256
23257 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23258 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23259 SmallVector<SDValue, 8> VecIns;
23260 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23261 EVT VT = VecIns[0].getValueType();
23262 assert(llvm::all_of(VecIns,
23263 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23264 "Reduction source vector mismatch");
23265
23266 // Quit if not splittable to scalar/128/256/512-bit vector.
23268 return SDValue();
23269
23270 // If more than one full vector is evaluated, AND/OR them first before
23271 // PTEST.
23272 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23273 Slot += 2, e += 1) {
23274 // Each iteration will AND/OR 2 nodes and append the result until there is
23275 // only 1 node left, i.e. the final value of all vectors.
23276 SDValue LHS = VecIns[Slot];
23277 SDValue RHS = VecIns[Slot + 1];
23278 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23279 }
23280
23281 return LowerVectorAllEqual(DL, VecIns.back(),
23282 CmpNull ? DAG.getConstant(0, DL, VT)
23283 : DAG.getAllOnesConstant(DL, VT),
23284 CC, Mask, Subtarget, DAG, X86CC);
23285 }
23286
23287 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23288 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23289 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23290 ISD::NodeType BinOp;
23291 if (SDValue Match =
23292 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23293 EVT MatchVT = Match.getValueType();
23294 return LowerVectorAllEqual(DL, Match,
23295 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23296 : DAG.getAllOnesConstant(DL, MatchVT),
23297 CC, Mask, Subtarget, DAG, X86CC);
23298 }
23299 }
23300
23301 if (Mask.isAllOnes()) {
23302 assert(!Op.getValueType().isVector() &&
23303 "Illegal vector type for reduction pattern");
23304 SDValue Src = peekThroughBitcasts(Op);
23305 if (Src.getValueType().isFixedLengthVector() &&
23306 Src.getValueType().getScalarType() == MVT::i1) {
23307 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23308 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23309 if (Src.getOpcode() == ISD::SETCC) {
23310 SDValue LHS = Src.getOperand(0);
23311 SDValue RHS = Src.getOperand(1);
23312 EVT LHSVT = LHS.getValueType();
23313 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23314 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23316 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23317 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23318 X86CC);
23319 }
23320 }
23321 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23322 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23323 // Peek through truncation, mask the LSB and compare against zero/LSB.
23324 if (Src.getOpcode() == ISD::TRUNCATE) {
23325 SDValue Inner = Src.getOperand(0);
23326 EVT InnerVT = Inner.getValueType();
23328 unsigned BW = InnerVT.getScalarSizeInBits();
23329 APInt SrcMask = APInt(BW, 1);
23330 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23331 return LowerVectorAllEqual(DL, Inner,
23332 DAG.getConstant(Cmp, DL, InnerVT), CC,
23333 SrcMask, Subtarget, DAG, X86CC);
23334 }
23335 }
23336 }
23337 }
23338
23339 return SDValue();
23340}
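// Typical inputs matched above:
//   anyof: icmp ne (or  (extractelt %v, 0), (extractelt %v, 1)), 0
//   allof: icmp eq (and (extractelt %v, 0), (extractelt %v, 1)), -1
//   mask:  icmp eq (bitcast <16 x i1> %k to i16), -1
// each of which is funnelled into LowerVectorAllEqual with a suitable
// element mask.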
23341
23342/// return true if \c Op has a use that doesn't just read flags.
23343 static bool hasNonFlagsUse(SDValue Op) {
23344 for (SDUse &Use : Op->uses()) {
23345 SDNode *User = Use.getUser();
23346 unsigned UOpNo = Use.getOperandNo();
23347 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23348 // Look past truncate.
23349 UOpNo = User->use_begin()->getOperandNo();
23350 User = User->use_begin()->getUser();
23351 }
23352
23353 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23354 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23355 return true;
23356 }
23357 return false;
23358}
23359
23360// Transform to an x86-specific ALU node with flags if there is a chance of
23361// using an RMW op or only the flags are used. Otherwise, leave
23362// the node alone and emit a 'cmp' or 'test' instruction.
23363 static bool isProfitableToUseFlagOp(SDValue Op) {
23364 for (SDNode *U : Op->users())
23365 if (U->getOpcode() != ISD::CopyToReg &&
23366 U->getOpcode() != ISD::SETCC &&
23367 U->getOpcode() != ISD::STORE)
23368 return false;
23369
23370 return true;
23371}
23372
23373/// Emit nodes that will be selected as "test Op0,Op0", or something
23374/// equivalent.
23375 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23376 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23377 // CF and OF aren't always set the way we want. Determine which
23378 // of these we need.
23379 bool NeedCF = false;
23380 bool NeedOF = false;
23381 switch (X86CC) {
23382 default: break;
23383 case X86::COND_A: case X86::COND_AE:
23384 case X86::COND_B: case X86::COND_BE:
23385 NeedCF = true;
23386 break;
23387 case X86::COND_G: case X86::COND_GE:
23388 case X86::COND_L: case X86::COND_LE:
23389 case X86::COND_O: case X86::COND_NO: {
23390 // Check if we really need to set the
23391 // Overflow flag. If NoSignedWrap is present
23392 // that is not actually needed.
23393 switch (Op->getOpcode()) {
23394 case ISD::ADD:
23395 case ISD::SUB:
23396 case ISD::MUL:
23397 case ISD::SHL:
23398 if (Op.getNode()->getFlags().hasNoSignedWrap())
23399 break;
23400 [[fallthrough]];
23401 default:
23402 NeedOF = true;
23403 break;
23404 }
23405 break;
23406 }
23407 }
23408 // See if we can use the EFLAGS value from the operand instead of
23409 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23410 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23411 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23412 // Emit a CMP with 0, which is the TEST pattern.
23413 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23414 DAG.getConstant(0, dl, Op.getValueType()));
23415 }
23416 unsigned Opcode = 0;
23417 unsigned NumOperands = 0;
23418
23419 SDValue ArithOp = Op;
23420
23421 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23422 // which may be the result of a CAST. We use the variable 'Op', which is the
23423 // non-casted variable when we check for possible users.
23424 switch (ArithOp.getOpcode()) {
23425 case ISD::AND:
23426 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23427 // because a TEST instruction will be better.
23428 if (!hasNonFlagsUse(Op))
23429 break;
23430
23431 [[fallthrough]];
23432 case ISD::ADD:
23433 case ISD::SUB:
23434 case ISD::OR:
23435 case ISD::XOR:
23436 if (!isProfitableToUseFlagOp(Op))
23437 break;
23438
23439 // Otherwise use a regular EFLAGS-setting instruction.
23440 switch (ArithOp.getOpcode()) {
23441 // clang-format off
23442 default: llvm_unreachable("unexpected operator!");
23443 case ISD::ADD: Opcode = X86ISD::ADD; break;
23444 case ISD::SUB: Opcode = X86ISD::SUB; break;
23445 case ISD::XOR: Opcode = X86ISD::XOR; break;
23446 case ISD::AND: Opcode = X86ISD::AND; break;
23447 case ISD::OR: Opcode = X86ISD::OR; break;
23448 // clang-format on
23449 }
23450
23451 NumOperands = 2;
23452 break;
23453 case X86ISD::ADD:
23454 case X86ISD::SUB:
23455 case X86ISD::OR:
23456 case X86ISD::XOR:
23457 case X86ISD::AND:
23458 return SDValue(Op.getNode(), 1);
23459 case ISD::SSUBO:
23460 case ISD::USUBO: {
23461 // SSUBO/USUBO will become an X86ISD::SUB and we can use its Z flag.
23462 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23463 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23464 Op->getOperand(1)).getValue(1);
23465 }
23466 default:
23467 break;
23468 }
23469
23470 if (Opcode == 0) {
23471 // Emit a CMP with 0, which is the TEST pattern.
23472 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23473 DAG.getConstant(0, dl, Op.getValueType()));
23474 }
23475 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23476 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23477
23478 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23479 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23480 return SDValue(New.getNode(), 1);
23481}
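// For example, when "(x & y) != 0" is branched on and the AND result also has
// a non-flag user, the code above reuses a flag-producing X86ISD::AND so a
// single "and" feeds both the value and the branch; if the AND is only read
// for its flags, a plain test against zero is preferred instead.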
23482
23483/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23484/// equivalent.
23485 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23486 const SDLoc &dl, SelectionDAG &DAG,
23487 const X86Subtarget &Subtarget) {
23488 if (isNullConstant(Op1))
23489 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23490
23491 EVT CmpVT = Op0.getValueType();
23492
23493 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23494 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23495
23496 // Only promote the compare up to I32 if it is a 16 bit operation
23497 // with an immediate. 16 bit immediates are to be avoided unless the target
23498 // isn't slowed down by length changing prefixes, we're optimizing for
23499 // codesize or the comparison is with a folded load.
23500 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23501 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23503 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23504 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23505 // Don't do this if the immediate can fit in 8-bits.
23506 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23507 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23508 unsigned ExtendOp =
23509 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23510 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23511 // For equality comparisons try to use SIGN_EXTEND if the input was
23512 // truncate from something with enough sign bits.
23513 if (Op0.getOpcode() == ISD::TRUNCATE) {
23514 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23515 ExtendOp = ISD::SIGN_EXTEND;
23516 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23517 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23518 ExtendOp = ISD::SIGN_EXTEND;
23519 }
23520 }
23521
23522 CmpVT = MVT::i32;
23523 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23524 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23525 }
23526 }
23527
23528 // Try to shrink i64 compares if the input has enough zero bits.
23529 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23530 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23531 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23532 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23533 CmpVT = MVT::i32;
23534 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23535 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23536 }
23537
23538 // Try to shrink all i64 compares if the inputs are representable as signed
23539 // i32.
23540 if (CmpVT == MVT::i64 &&
23541 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23542 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23543 CmpVT = MVT::i32;
23544 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23545 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23546 }
23547
23548 // 0-x == y --> x+y == 0
23549 // 0-x != y --> x+y != 0
23550 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23551 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23552 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23553 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23554 return Add.getValue(1);
23555 }
23556
23557 // x == 0-y --> x+y == 0
23558 // x != 0-y --> x+y != 0
23559 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23560 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23561 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23562 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23563 return Add.getValue(1);
23564 }
23565
23566 // If we already have an XOR of the ops, use that to check for equality.
23567 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23568 unsigned X86Opc = X86ISD::SUB;
23569 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23570 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23571 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23572 X86Opc = X86ISD::XOR;
23573
23574 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23575 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23576 return CmpOp.getValue(1);
23577}
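// Worked example of the i16 widening above: "cmp ax, 1000" needs a 16-bit
// immediate and therefore a length-changing operand-size prefix, which is
// slow to decode on several cores, so the compare is widened to
// "movzx eax, ax" (or movsx, for equality of values with enough sign bits)
// followed by "cmp eax, 1000".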
23578
23583
23584bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23585 SDNode *N, SDValue, SDValue IntPow2) const {
23586 if (N->getOpcode() == ISD::FDIV)
23587 return true;
23588
23589 EVT FPVT = N->getValueType(0);
23590 EVT IntVT = IntPow2.getValueType();
23591
23592 // This indicates a non-free bitcast.
23593 // TODO: This is probably overly conservative as we will need to scale the
23594 // integer vector anyways for the int->fp cast.
23595 if (FPVT.isVector() &&
23596 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23597 return false;
23598
23599 return true;
23600}
23601
23602/// Check if replacement of SQRT with RSQRT should be disabled.
23603bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23604 EVT VT = Op.getValueType();
23605
23606 // We don't need to replace SQRT with RSQRT for half type.
23607 if (VT.getScalarType() == MVT::f16)
23608 return true;
23609
23610 // We never want to use both SQRT and RSQRT instructions for the same input.
23611 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23612 return false;
23613
23614 if (VT.isVector())
23615 return Subtarget.hasFastVectorFSQRT();
23616 return Subtarget.hasFastScalarFSQRT();
23617}
23618
23619/// The minimum architected relative accuracy is 2^-12. We need one
23620/// Newton-Raphson step to have a good float result (24 bits of precision).
23621SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23622 SelectionDAG &DAG, int Enabled,
23623 int &RefinementSteps,
23624 bool &UseOneConstNR,
23625 bool Reciprocal) const {
23626 SDLoc DL(Op);
23627 EVT VT = Op.getValueType();
23628
23629 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23630 // It is likely not profitable to do this for f64 because a double-precision
23631 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23632 // instructions: convert to single, rsqrtss, convert back to double, refine
23633 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23634 // along with FMA, this could be a throughput win.
23635 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23636 // after legalize types.
23637 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23638 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23639 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23640 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23641 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23642 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23643 RefinementSteps = 1;
23644
23645 UseOneConstNR = false;
23646 // There is no FSQRT for 512-bits, but there is RSQRT14.
23647 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23648 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23649 if (RefinementSteps == 0 && !Reciprocal)
23650 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23651 return Estimate;
23652 }
23653
23654 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23655 Subtarget.hasFP16()) {
23656 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23657 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23658 RefinementSteps = 0;
23659
23660 if (VT == MVT::f16) {
23662 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23663 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23664 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23665 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23666 }
23667
23668 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23669 }
23670 return SDValue();
23671}
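// The refinement applied by the generic combiner to this estimate is the
// standard Newton-Raphson step for 1/sqrt(x):
//   est' = est * (1.5 - 0.5 * x * est * est)
// which roughly doubles the number of correct bits (about 12 -> 24 for f32).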
23672
23673/// The minimum architected relative accuracy is 2^-12. We need one
23674/// Newton-Raphson step to have a good float result (24 bits of precision).
23675SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23676 int Enabled,
23677 int &RefinementSteps) const {
23678 SDLoc DL(Op);
23679 EVT VT = Op.getValueType();
23680
23681 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23682 // It is likely not profitable to do this for f64 because a double-precision
23683 // reciprocal estimate with refinement on x86 prior to FMA requires
23684 // 15 instructions: convert to single, rcpss, convert back to double, refine
23685 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23686 // along with FMA, this could be a throughput win.
23687
23688 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23689 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23690 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23691 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23692 // Enable estimate codegen with 1 refinement step for vector division.
23693 // Scalar division estimates are disabled because they break too much
23694 // real-world code. These defaults are intended to match GCC behavior.
23695 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23696 return SDValue();
23697
23698 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23699 RefinementSteps = 1;
23700
23701 // There is no FSQRT for 512-bits, but there is RCP14.
23702 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23703 return DAG.getNode(Opcode, DL, VT, Op);
23704 }
23705
23706 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23707 Subtarget.hasFP16()) {
23708 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23709 RefinementSteps = 0;
23710
23711 if (VT == MVT::f16) {
23713 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23714 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23715 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23716 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23717 }
23718
23719 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23720 }
23721 return SDValue();
23722}
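// Likewise, the generic refinement for the reciprocal estimate is
//   est' = est * (2.0 - x * est)
// so one step after rcpss/rcpps takes roughly 12 good bits to nearly full
// single precision.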
23723
23724/// If we have at least two divisions that use the same divisor, convert to
23725/// multiplication by a reciprocal. This may need to be adjusted for a given
23726/// CPU if a division's cost is not at least twice the cost of a multiplication.
23727/// This is because we still need one division to calculate the reciprocal and
23728/// then we need two multiplies by that reciprocal as replacements for the
23729/// original divisions.
23731 return 2;
23732}
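// E.g. under the allow-reciprocal fast-math flag:
//   a/d + b/d  ==>  r = 1.0/d;  a*r + b*r
// one divide and two multiplies instead of two divides, which only pays off
// because the threshold above requires at least two divisions by d.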
23733
23734SDValue
23735X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23736 SelectionDAG &DAG,
23737 SmallVectorImpl<SDNode *> &Created) const {
23738 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23739 if (isIntDivCheap(N->getValueType(0), Attr))
23740 return SDValue(N,0); // Lower SDIV as SDIV
23741
23742 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23743 "Unexpected divisor!");
23744
23745 // Only perform this transform if CMOV is supported otherwise the select
23746 // below will become a branch.
23747 if (!Subtarget.canUseCMOV())
23748 return SDValue();
23749
23750 // fold (sdiv X, pow2)
23751 EVT VT = N->getValueType(0);
23752 // FIXME: Support i8.
23753 if (VT != MVT::i16 && VT != MVT::i32 &&
23754 !(Subtarget.is64Bit() && VT == MVT::i64))
23755 return SDValue();
23756
23757 // If the divisor is 2 or -2, the default expansion is better.
23758 if (Divisor == 2 ||
23759 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23760 return SDValue();
23761
23762 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23763}
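// Rough shape of the CMOV-based expansion built by buildSDIVPow2WithCMov for
// "sdiv i32 %x, 8" (x in edi):
//   lea    eax, [rdi + 7]     ; x + (8 - 1), the bias for negative x
//   test   edi, edi
//   cmovns eax, edi           ; keep x itself when it is non-negative
//   sar    eax, 3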
23764
23765/// Result of 'and' is compared against zero. Change to a BT node if possible.
23766/// Returns the BT node and the condition code needed to use it.
23768 SelectionDAG &DAG, X86::CondCode &X86CC) {
23769 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23770 SDValue Op0 = And.getOperand(0);
23771 SDValue Op1 = And.getOperand(1);
23772 if (Op0.getOpcode() == ISD::TRUNCATE)
23773 Op0 = Op0.getOperand(0);
23774 if (Op1.getOpcode() == ISD::TRUNCATE)
23775 Op1 = Op1.getOperand(0);
23776
23777 SDValue Src, BitNo;
23778 if (Op1.getOpcode() == ISD::SHL)
23779 std::swap(Op0, Op1);
23780 if (Op0.getOpcode() == ISD::SHL) {
23781 if (isOneConstant(Op0.getOperand(0))) {
23782 // If we looked past a truncate, check that it's only truncating away
23783 // known zeros.
23784 unsigned BitWidth = Op0.getValueSizeInBits();
23785 unsigned AndBitWidth = And.getValueSizeInBits();
23786 if (BitWidth > AndBitWidth) {
23787 KnownBits Known = DAG.computeKnownBits(Op0);
23788 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23789 return SDValue();
23790 }
23791 Src = Op1;
23792 BitNo = Op0.getOperand(1);
23793 }
23794 } else if (Op1.getOpcode() == ISD::Constant) {
23795 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23796 uint64_t AndRHSVal = AndRHS->getZExtValue();
23797 SDValue AndLHS = Op0;
23798
23799 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23800 Src = AndLHS.getOperand(0);
23801 BitNo = AndLHS.getOperand(1);
23802 } else {
23803 // Use BT if the immediate can't be encoded in a TEST instruction or we
23804 // are optimizing for size and the immediate won't fit in a byte.
23805 bool OptForSize = DAG.shouldOptForSize();
23806 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23807 isPowerOf2_64(AndRHSVal)) {
23808 Src = AndLHS;
23809 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23810 Src.getValueType());
23811 }
23812 }
23813 }
23814
23815 // No patterns found, give up.
23816 if (!Src.getNode())
23817 return SDValue();
23818
23819 // Remove any bit flip.
23820 if (isBitwiseNot(Src)) {
23821 Src = Src.getOperand(0);
23822 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23823 }
23824
23825 // Attempt to create the X86ISD::BT node.
23826 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23827 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23828 return BT;
23829 }
23830
23831 return SDValue();
23832}
23833
23834// Check if pre-AVX condcode can be performed by a single FCMP op.
23835static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23836 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23837}
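// Pre-AVX cmpps/cmppd only encode predicates 0-7, so SETUEQ (unord || eq) and
// SETONE (ord && ne) each need two compares combined with FOR/FAND (see
// LowerVSETCC below); every other predicate maps to a single compare,
// possibly with swapped operands.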
23838
23839/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23840/// CMPs.
23841static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23842 SDValue &Op1, bool &IsAlwaysSignaling) {
23843 unsigned SSECC;
23844 bool Swap = false;
23845
23846 // SSE Condition code mapping:
23847 // 0 - EQ
23848 // 1 - LT
23849 // 2 - LE
23850 // 3 - UNORD
23851 // 4 - NEQ
23852 // 5 - NLT
23853 // 6 - NLE
23854 // 7 - ORD
23855 switch (SetCCOpcode) {
23856 // clang-format off
23857 default: llvm_unreachable("Unexpected SETCC condition");
23858 case ISD::SETOEQ:
23859 case ISD::SETEQ: SSECC = 0; break;
23860 case ISD::SETOGT:
23861 case ISD::SETGT: Swap = true; [[fallthrough]];
23862 case ISD::SETLT:
23863 case ISD::SETOLT: SSECC = 1; break;
23864 case ISD::SETOGE:
23865 case ISD::SETGE: Swap = true; [[fallthrough]];
23866 case ISD::SETLE:
23867 case ISD::SETOLE: SSECC = 2; break;
23868 case ISD::SETUO: SSECC = 3; break;
23869 case ISD::SETUNE:
23870 case ISD::SETNE: SSECC = 4; break;
23871 case ISD::SETULE: Swap = true; [[fallthrough]];
23872 case ISD::SETUGE: SSECC = 5; break;
23873 case ISD::SETULT: Swap = true; [[fallthrough]];
23874 case ISD::SETUGT: SSECC = 6; break;
23875 case ISD::SETO: SSECC = 7; break;
23876 case ISD::SETUEQ: SSECC = 8; break;
23877 case ISD::SETONE: SSECC = 12; break;
23878 // clang-format on
23879 }
23880 if (Swap)
23881 std::swap(Op0, Op1);
23882
23883 switch (SetCCOpcode) {
23884 default:
23885 IsAlwaysSignaling = true;
23886 break;
23887 case ISD::SETEQ:
23888 case ISD::SETOEQ:
23889 case ISD::SETUEQ:
23890 case ISD::SETNE:
23891 case ISD::SETONE:
23892 case ISD::SETUNE:
23893 case ISD::SETO:
23894 case ISD::SETUO:
23895 IsAlwaysSignaling = false;
23896 break;
23897 }
23898
23899 return SSECC;
23900}
23901
23902/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23903/// concatenate the result back.
23905 SelectionDAG &DAG, const SDLoc &dl) {
23906 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23907 "Unsupported VTs!");
23908 SDValue CC = DAG.getCondCode(Cond);
23909
23910 // Extract the LHS Lo/Hi vectors
23911 SDValue LHS1, LHS2;
23912 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23913
23914 // Extract the RHS Lo/Hi vectors
23915 SDValue RHS1, RHS2;
23916 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23917
23918 // Issue the operation on the smaller types and concatenate the result back
23919 EVT LoVT, HiVT;
23920 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23921 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23922 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23923 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23924}
23925
23926 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23927 SelectionDAG &DAG) {
23928 SDValue Op0 = Op.getOperand(0);
23929 SDValue Op1 = Op.getOperand(1);
23930 SDValue CC = Op.getOperand(2);
23931 MVT VT = Op.getSimpleValueType();
23932 assert(VT.getVectorElementType() == MVT::i1 &&
23933 "Cannot set masked compare for this operation");
23934
23935 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23936
23937 // Prefer SETGT over SETLT.
23938 if (SetCCOpcode == ISD::SETLT) {
23939 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23940 std::swap(Op0, Op1);
23941 }
23942
23943 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23944}
23945
23946/// Given a buildvector constant, return a new vector constant with each element
23947/// incremented or decremented. If incrementing or decrementing would result in
23948/// unsigned overflow or underflow or this is not a simple vector constant,
23949/// return an empty value.
23950 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23951 bool NSW) {
23952 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23953 if (!BV || !V.getValueType().isSimple())
23954 return SDValue();
23955
23956 MVT VT = V.getSimpleValueType();
23957 MVT EltVT = VT.getVectorElementType();
23958 unsigned NumElts = VT.getVectorNumElements();
23959 SmallVector<SDValue, 16> NewVecC;
23960 SDLoc DL(V);
23961 for (unsigned i = 0; i < NumElts; ++i) {
23962 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23963 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23964 return SDValue();
23965
23966 // Avoid overflow/underflow.
23967 const APInt &EltC = Elt->getAPIntValue();
23968 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23969 return SDValue();
23970 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23971 (!IsInc && EltC.isMinSignedValue())))
23972 return SDValue();
23973
23974 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23975 }
23976
23977 return DAG.getBuildVector(VT, DL, NewVecC);
23978}
23979
23980/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23981/// Op0 u<= Op1:
23982/// t = psubus Op0, Op1
23983/// pcmpeq t, <0..0>
23984 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23985 ISD::CondCode Cond, const SDLoc &dl,
23986 const X86Subtarget &Subtarget,
23987 SelectionDAG &DAG) {
23988 if (!Subtarget.hasSSE2())
23989 return SDValue();
23990
23991 MVT VET = VT.getVectorElementType();
23992 if (VET != MVT::i8 && VET != MVT::i16)
23993 return SDValue();
23994
23995 switch (Cond) {
23996 default:
23997 return SDValue();
23998 case ISD::SETULT: {
23999 // If the comparison is against a constant we can turn this into a
24000 // setule. With psubus, setule does not require a swap. This is
24001 // beneficial because the constant in the register is no longer
24002 // clobbered as the destination, so it can be hoisted out of a loop.
24003 // Only do this pre-AVX since vpcmp* is no longer destructive.
24004 if (Subtarget.hasAVX())
24005 return SDValue();
24006 SDValue ULEOp1 =
24007 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24008 if (!ULEOp1)
24009 return SDValue();
24010 Op1 = ULEOp1;
24011 break;
24012 }
24013 case ISD::SETUGT: {
24014 // If the comparison is against a constant, we can turn this into a setuge.
24015 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24016 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24017 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24018 SDValue UGEOp1 =
24019 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24020 if (!UGEOp1)
24021 return SDValue();
24022 Op1 = Op0;
24023 Op0 = UGEOp1;
24024 break;
24025 }
24026 // Psubus is better than flip-sign because it requires no inversion.
24027 case ISD::SETUGE:
24028 std::swap(Op0, Op1);
24029 break;
24030 case ISD::SETULE:
24031 break;
24032 }
24033
24034 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24035 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24036 DAG.getConstant(0, dl, VT));
24037}
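// For the constant cases above: "x u< C" is rewritten as "x u<= (C-1)" and
// "x u> C" as "(C+1) u<= x", so the adjusted constant ends up as the
// subtrahend of the PSUBUS and, pre-AVX, is not clobbered as the destination
// operand of the destructive instruction.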
24038
24039static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24040 SelectionDAG &DAG) {
24041 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24042 Op.getOpcode() == ISD::STRICT_FSETCCS;
24043 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24044 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24045 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24046 MVT VT = Op->getSimpleValueType(0);
24047 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24048 MVT OpVT = Op0.getSimpleValueType();
24049 SDLoc dl(Op);
24050
24051 if (OpVT.isFloatingPoint()) {
24052 MVT EltVT = OpVT.getVectorElementType();
24053 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24054 EltVT == MVT::f64);
24055
24056 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24057 if (isSoftF16(EltVT, Subtarget)) {
24058 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24059 return SDValue();
24060
24061 // Break 256-bit FP vector compare into smaller ones.
24062 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24063 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24064
24065 // Break 512-bit FP vector compare into smaller ones.
24066 if (OpVT.is512BitVector())
24067 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24068
24069 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24070 if (IsStrict) {
24071 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24072 {Chain, Op0});
24073 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24074 {Chain, Op1});
24075 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24076 {Chain, Op0, Op1, CC});
24077 }
24078 MVT DVT = VT.getVectorElementType() == MVT::i16
24079 ? VT.changeVectorElementType(MVT::i32)
24080 : VT;
24081 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24082 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24083 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24084 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24085 }
24086
24087 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24088
24089 // If we have a strict compare with a vXi1 result and the input is 128/256
24090 // bits we can't use a masked compare unless we have VLX. If we use a wider
24091 // compare like we do for non-strict, we might trigger spurious exceptions
24092 // from the upper elements. Instead emit a AVX compare and convert to mask.
24093 unsigned Opc;
24094 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24095 (!IsStrict || Subtarget.hasVLX() ||
24096 Op0.getSimpleValueType().is512BitVector())) {
24097 #ifndef NDEBUG
24098 unsigned Num = VT.getVectorNumElements();
24099 assert(Num <= 16 ||
24100 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24101#endif
24102 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24103 } else {
24104 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24105 // The SSE/AVX packed FP comparison nodes are defined with a
24106 // floating-point vector result that matches the operand type. This allows
24107 // them to work with an SSE1 target (integer vector types are not legal).
24108 VT = Op0.getSimpleValueType();
24109 }
24110
24111 SDValue Cmp;
24112 bool IsAlwaysSignaling;
24113 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24114 if (!Subtarget.hasAVX()) {
24115 // TODO: We could use following steps to handle a quiet compare with
24116 // signaling encodings.
24117 // 1. Get ordered masks from a quiet ISD::SETO
24118 // 2. Use the masks to mask potential unordered elements in operand A, B
24119 // 3. Get the compare results of masked A, B
24120 // 4. Calculating final result using the mask and result from 3
24121 // But currently, we just fall back to scalar operations.
24122 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24123 return SDValue();
24124
24125 // Insert an extra signaling instruction to raise exception.
24126 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24127 SDValue SignalCmp = DAG.getNode(
24128 Opc, dl, {VT, MVT::Other},
24129 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24130 // FIXME: It seems we need to update the flags of all new strict nodes.
24131 // Otherwise, mayRaiseFPException in MI will return false due to
24132 // NoFPExcept = false by default. However, I didn't find it in other
24133 // patches.
24134 SignalCmp->setFlags(Op->getFlags());
24135 Chain = SignalCmp.getValue(1);
24136 }
24137
24138 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24139 // emit two comparisons and a logic op to tie them together.
24140 if (!cheapX86FSETCC_SSE(Cond)) {
24141 // LLVM predicate is SETUEQ or SETONE.
24142 unsigned CC0, CC1;
24143 unsigned CombineOpc;
24144 if (Cond == ISD::SETUEQ) {
24145 CC0 = 3; // UNORD
24146 CC1 = 0; // EQ
24147 CombineOpc = X86ISD::FOR;
24148 } else {
24149 assert(Cond == ISD::SETONE);
24150 CC0 = 7; // ORD
24151 CC1 = 4; // NEQ
24152 CombineOpc = X86ISD::FAND;
24153 }
24154
24155 SDValue Cmp0, Cmp1;
24156 if (IsStrict) {
24157 Cmp0 = DAG.getNode(
24158 Opc, dl, {VT, MVT::Other},
24159 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24160 Cmp1 = DAG.getNode(
24161 Opc, dl, {VT, MVT::Other},
24162 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24163 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24164 Cmp1.getValue(1));
24165 } else {
24166 Cmp0 = DAG.getNode(
24167 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24168 Cmp1 = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24170 }
24171 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24172 } else {
24173 if (IsStrict) {
24174 Cmp = DAG.getNode(
24175 Opc, dl, {VT, MVT::Other},
24176 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24177 Chain = Cmp.getValue(1);
24178 } else
24179 Cmp = DAG.getNode(
24180 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24181 }
24182 } else {
24183 // Handle all other FP comparisons here.
24184 if (IsStrict) {
24185 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24186 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24187 Cmp = DAG.getNode(
24188 Opc, dl, {VT, MVT::Other},
24189 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24190 Chain = Cmp.getValue(1);
24191 } else
24192 Cmp = DAG.getNode(
24193 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24194 }
24195
24196 if (VT.getFixedSizeInBits() >
24197 Op.getSimpleValueType().getFixedSizeInBits()) {
24198 // We emitted a compare with an XMM/YMM result. Finish converting to a
24199 // mask register using a vptestm.
24201 Cmp = DAG.getBitcast(CastVT, Cmp);
24202 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24203 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24204 } else {
24205 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24206 // the result type of SETCC. The bitcast is expected to be optimized
24207 // away during combining/isel.
24208 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24209 }
24210
24211 if (IsStrict)
24212 return DAG.getMergeValues({Cmp, Chain}, dl);
24213
24214 return Cmp;
24215 }
24216
24217 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24218
24219 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24220 assert(VTOp0 == Op1.getSimpleValueType() &&
24221 "Expected operands with same type!");
24222   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24223          "Invalid number of packed elements for source and destination!");
24224
24225 // The non-AVX512 code below works under the assumption that source and
24226 // destination types are the same.
24227 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24228 "Value types for source and destination must be the same!");
24229
24230 // The result is boolean, but operands are int/float
24231 if (VT.getVectorElementType() == MVT::i1) {
24232     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
24233     // but there is no compare instruction for i8 and i16 elements in KNL.
24234 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24235 "Unexpected operand type");
24236 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24237 }
24238
24239 // Lower using XOP integer comparisons.
24240 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24241 // Translate compare code to XOP PCOM compare mode.
24242 unsigned CmpMode = 0;
24243 switch (Cond) {
24244 // clang-format off
24245 default: llvm_unreachable("Unexpected SETCC condition");
24246 case ISD::SETULT:
24247 case ISD::SETLT: CmpMode = 0x00; break;
24248 case ISD::SETULE:
24249 case ISD::SETLE: CmpMode = 0x01; break;
24250 case ISD::SETUGT:
24251 case ISD::SETGT: CmpMode = 0x02; break;
24252 case ISD::SETUGE:
24253 case ISD::SETGE: CmpMode = 0x03; break;
24254 case ISD::SETEQ: CmpMode = 0x04; break;
24255 case ISD::SETNE: CmpMode = 0x05; break;
24256 // clang-format on
24257 }
24258
24259 // Are we comparing unsigned or signed integers?
24260 unsigned Opc =
24261         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24262
24263 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24264 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24265 }
24266
24267 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24268 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
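  // e.g. (and x, 8) != 0 becomes (and x, 8) == 8, which avoids a NOT of the
  // PCMPEQ result.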
24269   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24270     SDValue BC0 = peekThroughBitcasts(Op0);
24271 if (BC0.getOpcode() == ISD::AND &&
24273 /*AllowUndefs=*/false)) {
24274 Cond = ISD::SETEQ;
24275 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24276 }
24277 }
24278
24279 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24280 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24281 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24282     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24283     if (C1 && C1->getAPIntValue().isPowerOf2()) {
24284 unsigned BitWidth = VT.getScalarSizeInBits();
24285 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24286
24287 SDValue Result = Op0.getOperand(0);
24288 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24289 DAG.getConstant(ShiftAmt, dl, VT));
24290 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24291 DAG.getConstant(BitWidth - 1, dl, VT));
24292 return Result;
24293 }
24294 }
24295
24296 // Break 256-bit integer vector compare into smaller ones.
24297 if (VT.is256BitVector() && !Subtarget.hasInt256())
24298 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24299
24300 // Break 512-bit integer vector compare into smaller ones.
24301 // TODO: Try harder to use VPCMPx + VPMOV2x?
24302 if (VT.is512BitVector())
24303 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24304
24305 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24306 // not-of-PCMPEQ:
24307 // X != INT_MIN --> X >s INT_MIN
24308 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24309 // +X != 0 --> +X >s 0
24310 APInt ConstValue;
24311 if (Cond == ISD::SETNE &&
24312 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24313 if (ConstValue.isMinSignedValue())
24314 Cond = ISD::SETGT;
24315 else if (ConstValue.isMaxSignedValue())
24316 Cond = ISD::SETLT;
24317 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24318 Cond = ISD::SETGT;
24319 }
24320
24321 // If both operands are known non-negative, then an unsigned compare is the
24322 // same as a signed compare and there's no need to flip signbits.
24323 // TODO: We could check for more general simplifications here since we're
24324 // computing known bits.
24325 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24326 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24327
24328 // Special case: Use min/max operations for unsigned compares.
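  // e.g. (setule x, y) becomes pcmpeq(x, umin(x, y)): x == umin(x, y) iff x <= y.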
24329 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24330   if (ISD::isUnsignedIntSetCC(Cond) &&
24331       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24332 TLI.isOperationLegal(ISD::UMIN, VT)) {
24333 // If we have a constant operand, increment/decrement it and change the
24334 // condition to avoid an invert.
24335 if (Cond == ISD::SETUGT) {
24336 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24337 if (SDValue UGTOp1 =
24338 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24339 Op1 = UGTOp1;
24340 Cond = ISD::SETUGE;
24341 }
24342 }
24343 if (Cond == ISD::SETULT) {
24344 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24345 if (SDValue ULTOp1 =
24346 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24347 Op1 = ULTOp1;
24348 Cond = ISD::SETULE;
24349 }
24350 }
24351 bool Invert = false;
24352 unsigned Opc;
24353 switch (Cond) {
24354 // clang-format off
24355 default: llvm_unreachable("Unexpected condition code");
24356 case ISD::SETUGT: Invert = true; [[fallthrough]];
24357 case ISD::SETULE: Opc = ISD::UMIN; break;
24358 case ISD::SETULT: Invert = true; [[fallthrough]];
24359 case ISD::SETUGE: Opc = ISD::UMAX; break;
24360 // clang-format on
24361 }
24362
24363 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24364 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24365
24366 // If the logical-not of the result is required, perform that now.
24367 if (Invert)
24368 Result = DAG.getNOT(dl, Result, VT);
24369
24370 return Result;
24371 }
24372
24373 // Try to use SUBUS and PCMPEQ.
24374 if (FlipSigns)
24375 if (SDValue V =
24376 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24377 return V;
24378
24379 // We are handling one of the integer comparisons here. Since SSE only has
24380 // GT and EQ comparisons for integer, swapping operands and multiple
24381 // operations may be required for some comparisons.
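  // e.g. SETLT(x,y) is lowered as PCMPGT(y,x) (swap), and SETLE(x,y) as
  // NOT(PCMPGT(x,y)) (invert).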
24382   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24383                                                             : X86ISD::PCMPGT;
24384   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24385               Cond == ISD::SETGE || Cond == ISD::SETUGE;
24386   bool Invert = Cond == ISD::SETNE ||
24387                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24388
24389 if (Swap)
24390 std::swap(Op0, Op1);
24391
24392 // Check that the operation in question is available (most are plain SSE2,
24393 // but PCMPGTQ and PCMPEQQ have different requirements).
24394 if (VT == MVT::v2i64) {
24395 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24396 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24397
24398 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24399 // the odd elements over the even elements.
24400 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24401 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24402 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24403
24404 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24405 static const int MaskHi[] = { 1, 1, 3, 3 };
24406 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24407
24408 return DAG.getBitcast(VT, Result);
24409 }
24410
24411 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24412 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24413 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24414
24415 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24416 static const int MaskHi[] = { 1, 1, 3, 3 };
24417 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24418
24419 return DAG.getBitcast(VT, Result);
24420 }
24421
24422 // If the i64 elements are sign-extended enough to be representable as i32
24423 // then we can compare the lower i32 bits and splat.
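      // The even (low) i32 lanes hold the correct result; splatting them with
      // MaskLo {0,0,2,2} fills both halves of each i64 element.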
24424 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24425 DAG.ComputeNumSignBits(Op1) > 32) {
24426 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24427 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24428
24429 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24430 static const int MaskLo[] = {0, 0, 2, 2};
24431 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24432
24433 return DAG.getBitcast(VT, Result);
24434 }
24435
24436 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24437 // bits of the inputs before performing those operations. The lower
24438 // compare is always unsigned.
24439 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24440 : 0x0000000080000000ULL,
24441 dl, MVT::v2i64);
24442
24443 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24444 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24445
24446 // Cast everything to the right type.
24447 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24448 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24449
24450 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24451 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24452 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24453
24454 // Create masks for only the low parts/high parts of the 64 bit integers.
24455 static const int MaskHi[] = { 1, 1, 3, 3 };
24456 static const int MaskLo[] = { 0, 0, 2, 2 };
24457 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24458 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24459 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24460
24461 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24462 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24463
24464 if (Invert)
24465 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24466
24467 return DAG.getBitcast(VT, Result);
24468 }
24469
24470 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24471 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24472 // pcmpeqd + pshufd + pand.
24473 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24474
24475 // First cast everything to the right type.
24476 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24477 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24478
24479 // Do the compare.
24480 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24481
24482 // Make sure the lower and upper halves are both all-ones.
24483 static const int Mask[] = { 1, 0, 3, 2 };
24484 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24485 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24486
24487 if (Invert)
24488 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24489
24490 return DAG.getBitcast(VT, Result);
24491 }
24492 }
24493
24494 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24495 // bits of the inputs before performing those operations.
24496 if (FlipSigns) {
24497 MVT EltVT = VT.getVectorElementType();
24498     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24499                                  VT);
24500 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24501 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24502 }
24503
24504 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24505
24506 // If the logical-not of the result is required, perform that now.
24507 if (Invert)
24508 Result = DAG.getNOT(dl, Result, VT);
24509
24510 return Result;
24511}
24512
24513// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24514 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24515                               const SDLoc &dl, SelectionDAG &DAG,
24516 const X86Subtarget &Subtarget,
24517 SDValue &X86CC) {
24518 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24519
24520 // Must be a bitcast from vXi1.
24521 if (Op0.getOpcode() != ISD::BITCAST)
24522 return SDValue();
24523
24524 Op0 = Op0.getOperand(0);
24525 MVT VT = Op0.getSimpleValueType();
24526 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24527 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24528 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24529 return SDValue();
24530
24531 X86::CondCode X86Cond;
24532 if (isNullConstant(Op1)) {
24533 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24534 } else if (isAllOnesConstant(Op1)) {
24535 // C flag is set for all ones.
24536 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24537 } else
24538 return SDValue();
24539
24540   // If the input is an AND, we can combine its operands into the KTEST.
24541 bool KTestable = false;
24542 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24543 KTestable = true;
24544 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24545 KTestable = true;
24546 if (!isNullConstant(Op1))
24547 KTestable = false;
24548 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24549 SDValue LHS = Op0.getOperand(0);
24550 SDValue RHS = Op0.getOperand(1);
24551 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24552 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24553 }
24554
24555   // If the input is an OR, we can combine its operands into the KORTEST.
24556 SDValue LHS = Op0;
24557 SDValue RHS = Op0;
24558 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24559 LHS = Op0.getOperand(0);
24560 RHS = Op0.getOperand(1);
24561 }
24562
24563 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24564 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24565}
24566
24567/// Emit flags for the given setcc condition and operands. Also returns the
24568/// corresponding X86 condition code constant in X86CC.
24569SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24570 ISD::CondCode CC, const SDLoc &dl,
24571 SelectionDAG &DAG,
24572 SDValue &X86CC) const {
24573 // Equality Combines.
24574 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24575 X86::CondCode X86CondCode;
24576
24577 // Optimize to BT if possible.
24578 // Lower (X & (1 << N)) == 0 to BT(X, N).
24579 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24580 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24581 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24582 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24583 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24584 return BT;
24585 }
24586 }
24587
24588 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24589 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24590 X86CondCode)) {
24591 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24592 return CmpZ;
24593 }
24594
24595 // Try to lower using KORTEST or KTEST.
24596 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24597 return Test;
24598
24599 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24600 // of these.
24601 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24602 // If the input is a setcc, then reuse the input setcc or use a new one
24603 // with the inverted condition.
24604 if (Op0.getOpcode() == X86ISD::SETCC) {
24605 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24606
24607 X86CC = Op0.getOperand(0);
24608 if (Invert) {
24609 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24610 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24611 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24612 }
24613
24614 return Op0.getOperand(1);
24615 }
24616 }
24617
24618 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24619 // overflow.
24620 if (isMinSignedConstant(Op1)) {
24621 EVT VT = Op0.getValueType();
24622 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24623 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24624         X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24625         X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24626 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24627 DAG.getConstant(0, dl, VT), Op0);
24628 return SDValue(Neg.getNode(), 1);
24629 }
24630 }
24631
24632     // Try to use the carry flag from the add in place of a separate CMP for:
24633 // (seteq (add X, -1), -1). Similar for setne.
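    // ADD X, -1 sets CF iff X != 0, so EQ maps to COND_AE (no carry) and NE
    // maps to COND_B.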
24634 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24635 Op0.getOperand(1) == Op1) {
24636 if (isProfitableToUseFlagOp(Op0)) {
24637 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24638
24639 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24640 Op0.getOperand(1));
24641 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24642 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24643 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24644 return SDValue(New.getNode(), 1);
24645 }
24646 }
24647 }
24648
24649   X86::CondCode CondCode =
24650       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24651 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24652
24653 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24654 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24655 return EFLAGS;
24656}
24657
24658SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24659
24660 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24661 Op.getOpcode() == ISD::STRICT_FSETCCS;
24662 MVT VT = Op->getSimpleValueType(0);
24663
24664 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24665
24666 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24667 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24668 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24669 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24670 SDLoc dl(Op);
24671 ISD::CondCode CC =
24672 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24673
24674 if (isSoftF16(Op0.getValueType(), Subtarget))
24675 return SDValue();
24676
24677 // Handle f128 first, since one possible outcome is a normal integer
24678 // comparison which gets handled by emitFlagsForSetcc.
24679 if (Op0.getValueType() == MVT::f128) {
24680 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24681 Op.getOpcode() == ISD::STRICT_FSETCCS);
24682
24683 // If softenSetCCOperands returned a scalar, use it.
24684 if (!Op1.getNode()) {
24685 assert(Op0.getValueType() == Op.getValueType() &&
24686 "Unexpected setcc expansion!");
24687 if (IsStrict)
24688 return DAG.getMergeValues({Op0, Chain}, dl);
24689 return Op0;
24690 }
24691 }
24692
24693 if (Op0.getSimpleValueType().isInteger()) {
24694 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24695 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24696 // this may translate to less uops depending on uarch implementation. The
24697 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24698 // canonicalize to that CondCode.
24699 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24700 // encoding size - so it must either already be a i8 or i32 immediate, or it
24701 // shrinks down to that. We don't do this for any i64's to avoid additional
24702 // constant materializations.
24703 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
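    // e.g. (setugt x, 9) becomes (setuge x, 10); SETAE/JAE only reads CF,
    // while SETA/JA reads both CF and ZF.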
24704 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24705 const APInt &Op1Val = Op1C->getAPIntValue();
24706 if (!Op1Val.isZero()) {
24707 // Ensure the constant+1 doesn't overflow.
24708 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24709 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24710 APInt Op1ValPlusOne = Op1Val + 1;
24711 if (Op1ValPlusOne.isSignedIntN(32) &&
24712 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24713 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24714             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24715                                             : ISD::CondCode::SETUGE;
24716           }
24717 }
24718 }
24719 }
24720
24721 SDValue X86CC;
24722 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24723 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24724 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24725 }
24726
24727 if (Subtarget.hasAVX10_2()) {
24728 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24729 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24730 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24731 if (Op0.getSimpleValueType() != MVT::f80) {
24732 SDValue Res = getSETCC(
24733 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24734 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24735 }
24736 }
24737 }
24738 // Handle floating point.
24739 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24740 if (CondCode == X86::COND_INVALID)
24741 return SDValue();
24742
24743 SDValue EFLAGS;
24744 if (IsStrict) {
24745 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24746     EFLAGS =
24747         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24748                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24749 Chain = EFLAGS.getValue(1);
24750 } else {
24751 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24752 }
24753
24754 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24755 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24756 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24757}
24758
24759SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24760 SDValue LHS = Op.getOperand(0);
24761 SDValue RHS = Op.getOperand(1);
24762 SDValue Carry = Op.getOperand(2);
24763 SDValue Cond = Op.getOperand(3);
24764 SDLoc DL(Op);
24765
24766 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24767   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24768
24769 // Recreate the carry if needed.
24770 EVT CarryVT = Carry.getValueType();
24771 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24772 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24773
24774 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24775 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24776 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24777}
24778
24779// This function returns three things: the arithmetic computation itself
24780// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24781// flag and the condition code define the case in which the arithmetic
24782// computation overflows.
24783 static std::pair<SDValue, SDValue>
24784 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24785 assert(Op.getResNo() == 0 && "Unexpected result number!");
24786 SDValue Value, Overflow;
24787 SDValue LHS = Op.getOperand(0);
24788 SDValue RHS = Op.getOperand(1);
24789 unsigned BaseOp = 0;
24790 SDLoc DL(Op);
24791 switch (Op.getOpcode()) {
24792 default: llvm_unreachable("Unknown ovf instruction!");
24793 case ISD::SADDO:
24794 BaseOp = X86ISD::ADD;
24795 Cond = X86::COND_O;
24796 break;
24797 case ISD::UADDO:
24798 BaseOp = X86ISD::ADD;
24799     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24800     break;
24801 case ISD::SSUBO:
24802 BaseOp = X86ISD::SUB;
24803 Cond = X86::COND_O;
24804 break;
24805 case ISD::USUBO:
24806 BaseOp = X86ISD::SUB;
24807 Cond = X86::COND_B;
24808 break;
24809 case ISD::SMULO:
24810 BaseOp = X86ISD::SMUL;
24811 Cond = X86::COND_O;
24812 break;
24813 case ISD::UMULO:
24814 BaseOp = X86ISD::UMUL;
24815 Cond = X86::COND_O;
24816 break;
24817 }
24818
24819 if (BaseOp) {
24820 // Also sets EFLAGS.
24821 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24822 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24823 Overflow = Value.getValue(1);
24824 }
24825
24826 return std::make_pair(Value, Overflow);
24827}
24828
24829 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24830   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24831 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24832 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24833 // has only one use.
24834 SDLoc DL(Op);
24835   X86::CondCode Cond;
24836   SDValue Value, Overflow;
24837 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24838
24839 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24840 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24841 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24842}
24843
24844 /// Return true if opcode is an X86 logical comparison.
24845 static bool isX86LogicalCmp(SDValue Op) {
24846   unsigned Opc = Op.getOpcode();
24847 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24848 Opc == X86ISD::FCMP)
24849 return true;
24850 if (Op.getResNo() == 1 &&
24851 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24852        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24853        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24854 return true;
24855
24856 return false;
24857}
24858
24859 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24860   if (V.getOpcode() != ISD::TRUNCATE)
24861 return false;
24862
24863 SDValue VOp0 = V.getOperand(0);
24864 unsigned InBits = VOp0.getValueSizeInBits();
24865 unsigned Bits = V.getValueSizeInBits();
24866 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24867}
24868
24869// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24870 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24871                                       unsigned X86CC, const SDLoc &DL,
24872 SelectionDAG &DAG,
24873 const X86Subtarget &Subtarget) {
24874 EVT CmpVT = CmpVal.getValueType();
24875 EVT VT = LHS.getValueType();
24876 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24877 return SDValue();
24878
24879 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24880 isOneConstant(CmpVal.getOperand(1))) {
24881 auto SplatLSB = [&](EVT SplatVT) {
24882       // We need a mask of all zeros or all ones with the same size as the
24883       // other operands.
24884 SDValue Neg = CmpVal;
24885 if (CmpVT.bitsGT(SplatVT))
24886 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24887 else if (CmpVT.bitsLT(SplatVT))
24888 Neg = DAG.getNode(
24889 ISD::AND, DL, SplatVT,
24890 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24891 DAG.getConstant(1, DL, SplatVT));
24892 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24893 };
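    // SplatLSB produces 0 when the LSB of CmpVal is clear and all-ones when it
    // is set, at the requested width.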
24894
24895 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24896     if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24897       return SplatLSB(VT);
24898
24899 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24900 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24901         isa<ConstantSDNode>(RHS)) {
24902       SDValue Mask = SplatLSB(VT);
24903 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24904 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24905 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24906 }
24907
24908 SDValue Src1, Src2;
24909 auto isIdentityPatternZero = [&]() {
24910 switch (RHS.getOpcode()) {
24911 default:
24912 break;
24913 case ISD::OR:
24914 case ISD::XOR:
24915 case ISD::ADD:
24916 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24917 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24918 Src2 = LHS;
24919 return true;
24920 }
24921 break;
24922 case ISD::SHL:
24923 case ISD::SRA:
24924 case ISD::SRL:
24925 case ISD::SUB:
24926 if (RHS.getOperand(0) == LHS) {
24927 Src1 = RHS.getOperand(1);
24928 Src2 = LHS;
24929 return true;
24930 }
24931 break;
24932 }
24933 return false;
24934 };
24935
24936 auto isIdentityPatternOnes = [&]() {
24937 switch (LHS.getOpcode()) {
24938 default:
24939 break;
24940 case ISD::AND:
24941 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24942 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24943 Src2 = RHS;
24944 return true;
24945 }
24946 break;
24947 }
24948 return false;
24949 };
24950
24951 // Convert 'identity' patterns (iff X is 0 or 1):
24952 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24953 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24954 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24959 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24960 SDValue Mask = SplatLSB(Src1.getValueType());
24961 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24962 Src1); // Mask & z
24963 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24964 }
24965 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24966 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24967 SDValue Mask = SplatLSB(VT);
24968 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24969 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24970 }
24971 }
24972
24973 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24974       (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24975     SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24976     SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24977
24978 // 'X - 1' sets the carry flag if X == 0.
24979 // '0 - X' sets the carry flag if X != 0.
24980 // Convert the carry flag to a -1/0 mask with sbb:
24981 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24982 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24983 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24984 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24985 SDValue Sub;
24986 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24987 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24988 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24989 } else {
24990 SDValue One = DAG.getConstant(1, DL, CmpVT);
24991 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24992 }
24993 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24994 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24995 Sub.getValue(1));
24996 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24997 }
24998
24999 return SDValue();
25000}
25001
25002SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25003 bool AddTest = true;
25004 SDValue Cond = Op.getOperand(0);
25005 SDValue Op1 = Op.getOperand(1);
25006 SDValue Op2 = Op.getOperand(2);
25007 SDLoc DL(Op);
25008 MVT VT = Op1.getSimpleValueType();
25009 SDValue CC;
25010
25011 if (isSoftF16(VT, Subtarget)) {
25012 MVT NVT = VT.changeTypeToInteger();
25013 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25014 DAG.getBitcast(NVT, Op1),
25015 DAG.getBitcast(NVT, Op2)));
25016 }
25017
25018 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25019 // are available or VBLENDV if AVX is available.
25020 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25021 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25022 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25023 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25024 bool IsAlwaysSignaling;
25025 unsigned SSECC =
25026 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25027 CondOp0, CondOp1, IsAlwaysSignaling);
25028
25029 if (Subtarget.hasAVX512()) {
25030 SDValue Cmp =
25031 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25032 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25033 assert(!VT.isVector() && "Not a scalar type?");
25034 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25035 }
25036
25037 if (SSECC < 8 || Subtarget.hasAVX()) {
25038 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25039 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25040
25041 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25042 // instead of 3 logic instructions for size savings and potentially speed.
25043 // Unfortunately, there is no scalar form of VBLENDV.
25044 //
25045 // If either operand is a +0.0 constant, don't try this. We can expect to
25046 // optimize away at least one of the logic instructions later in that
25047 // case, so that sequence would be faster than a variable blend.
25048 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25049 !isNullFPConstant(Op2)) {
25050 // Convert to vectors, do a VSELECT, and convert back to scalar.
25051 // All of the conversions should be optimized away.
25052 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25053 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25054 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25055 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25056
25057 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25058 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25059
25060 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25061
25062 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25063 DAG.getVectorIdxConstant(0, DL));
25064 }
25065 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25066 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25067 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25068 }
25069 }
25070
25071 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25072 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25073 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25074 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25075 }
25076
25077 if (Cond.getOpcode() == ISD::SETCC &&
25078 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25079 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25080 Cond = NewCond;
25081 // If the condition was updated, it's possible that the operands of the
25082 // select were also updated (for example, EmitTest has a RAUW). Refresh
25083 // the local references to the select operands in case they got stale.
25084 Op1 = Op.getOperand(1);
25085 Op2 = Op.getOperand(2);
25086 }
25087 }
25088
25089 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25090 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25091 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25092 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25093 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25094 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25095 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25096 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25097 if (Cond.getOpcode() == X86ISD::SETCC &&
25098 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25099 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25100 SDValue Cmp = Cond.getOperand(1);
25101 SDValue CmpOp0 = Cmp.getOperand(0);
25102 unsigned CondCode = Cond.getConstantOperandVal(0);
25103
25104 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25105 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25106     // handling to keep the CMP with 0. This should be removed by
25107 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25108 // cttz_zero_undef.
25109 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25110 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25111 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25112 };
25113 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25114 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25115 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25116 // Keep Cmp.
25117 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25118 DL, DAG, Subtarget)) {
25119 return R;
25120 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25121 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25122 ((CondCode == X86::COND_S) || // smin(x, 0)
25123 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25124 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25125 //
25126 // If the comparison is testing for a positive value, we have to invert
25127 // the sign bit mask, so only do that transform if the target has a
25128 // bitwise 'and not' instruction (the invert is free).
25129 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25130 unsigned ShCt = VT.getSizeInBits() - 1;
25131 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25132 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25133 if (CondCode == X86::COND_G)
25134 Shift = DAG.getNOT(DL, Shift, VT);
25135 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25136 }
25137 }
25138
25139 // Look past (and (setcc_carry (cmp ...)), 1).
25140 if (Cond.getOpcode() == ISD::AND &&
25141 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25142 isOneConstant(Cond.getOperand(1)))
25143 Cond = Cond.getOperand(0);
25144
25145 // Attempt to fold "raw cond" cases by treating them as:
25146 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25147 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25148 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25149 Subtarget))
25150 return R;
25151
25152 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25153 // setting operand in place of the X86ISD::SETCC.
25154 unsigned CondOpcode = Cond.getOpcode();
25155 if (CondOpcode == X86ISD::SETCC ||
25156 CondOpcode == X86ISD::SETCC_CARRY) {
25157 CC = Cond.getOperand(0);
25158
25159 SDValue Cmp = Cond.getOperand(1);
25160 bool IllegalFPCMov = false;
25161 if (VT.isFloatingPoint() && !VT.isVector() &&
25162 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25163 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25164
25165 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25166 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25167 Cond = Cmp;
25168 AddTest = false;
25169 }
25170 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25171 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25172 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25173 SDValue Value;
25174 X86::CondCode X86Cond;
25175 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25176
25177 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25178 AddTest = false;
25179 }
25180
25181 if (AddTest) {
25182 // Look past the truncate if the high bits are known zero.
25183     if (isTruncWithZeroHighBitsInput(Cond, DAG))
25184       Cond = Cond.getOperand(0);
25185
25186 // We know the result of AND is compared against zero. Try to match
25187 // it to BT.
25188 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25189 X86::CondCode X86CondCode;
25190 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25191 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25192 Cond = BT;
25193 AddTest = false;
25194 }
25195 }
25196 }
25197
25198 if (AddTest) {
25199 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25200 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25201 }
25202
25203 // a < b ? -1 : 0 -> RES = ~setcc_carry
25204 // a < b ? 0 : -1 -> RES = setcc_carry
25205 // a >= b ? -1 : 0 -> RES = setcc_carry
25206 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25207 if (Cond.getOpcode() == X86ISD::SUB) {
25208 unsigned CondCode = CC->getAsZExtVal();
25209
25210 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25211 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25212 (isNullConstant(Op1) || isNullConstant(Op2))) {
25213 SDValue Res =
25214 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25215 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25216 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25217 return DAG.getNOT(DL, Res, Res.getValueType());
25218 return Res;
25219 }
25220 }
25221
25222 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25223 // widen the cmov and push the truncate through. This avoids introducing a new
25224 // branch during isel and doesn't add any extensions.
25225 if (Op.getValueType() == MVT::i8 &&
25226 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25227 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25228 if (T1.getValueType() == T2.getValueType() &&
25229 // Exclude CopyFromReg to avoid partial register stalls.
25230 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25231 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25232 CC, Cond);
25233 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25234 }
25235 }
25236
25237 // Or finally, promote i8 cmovs if we have CMOV,
25238 // or i16 cmovs if it won't prevent folding a load.
25239 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25240 // legal, but EmitLoweredSelect() can not deal with these extensions
25241 // being inserted between two CMOV's. (in i16 case too TBN)
25242 // https://bugs.llvm.org/show_bug.cgi?id=40974
25243 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25244 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25245 !X86::mayFoldLoad(Op2, Subtarget))) {
25246 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25247 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25248 SDValue Ops[] = { Op2, Op1, CC, Cond };
25249 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25250 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25251 }
25252
25253 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25254 // condition is true.
25255 SDValue Ops[] = { Op2, Op1, CC, Cond };
25256 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25257}
25258
25260 const X86Subtarget &Subtarget,
25261 SelectionDAG &DAG) {
25262 MVT VT = Op->getSimpleValueType(0);
25263 SDValue In = Op->getOperand(0);
25264 MVT InVT = In.getSimpleValueType();
25265 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25266 MVT VTElt = VT.getVectorElementType();
25267 unsigned NumElts = VT.getVectorNumElements();
25268
25269 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25270 MVT ExtVT = VT;
25271 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25272 // If v16i32 is to be avoided, we'll need to split and concatenate.
25273 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25274 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25275
25276 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25277 }
25278
25279 // Widen to 512-bits if VLX is not supported.
25280 MVT WideVT = ExtVT;
25281 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25282 NumElts *= 512 / ExtVT.getSizeInBits();
25283 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25284 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25285 DAG.getVectorIdxConstant(0, dl));
25286 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25287 }
25288
25289 SDValue V;
25290 MVT WideEltVT = WideVT.getVectorElementType();
25291 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25292 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25293 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25294 } else {
25295 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25296 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25297 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25298 }
25299
25300 // Truncate if we had to extend i16/i8 above.
25301 if (VT != ExtVT) {
25302 WideVT = MVT::getVectorVT(VTElt, NumElts);
25303 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25304 }
25305
25306 // Extract back to 128/256-bit if we widened.
25307 if (WideVT != VT)
25308 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25309 DAG.getVectorIdxConstant(0, dl));
25310
25311 return V;
25312}
25313
25315 SelectionDAG &DAG) {
25316 SDValue In = Op->getOperand(0);
25317 MVT InVT = In.getSimpleValueType();
25318 SDLoc DL(Op);
25319
25320 if (InVT.getVectorElementType() == MVT::i1)
25321 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25322
25323 assert(Subtarget.hasAVX() && "Expected AVX support");
25324 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25325}
25326
25327// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25328// For sign extend this needs to handle all vector sizes and SSE4.1 and
25329// non-SSE4.1 targets. For zero extend this should only handle inputs of
25330// MVT::v64i8 when BWI is not supported, but AVX512 is.
25331 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25332                                         const X86Subtarget &Subtarget,
25333 SelectionDAG &DAG) {
25334 SDValue In = Op->getOperand(0);
25335 MVT VT = Op->getSimpleValueType(0);
25336 MVT InVT = In.getSimpleValueType();
25337
25338 MVT SVT = VT.getVectorElementType();
25339 MVT InSVT = InVT.getVectorElementType();
25341
25342 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25343 return SDValue();
25344 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25345 return SDValue();
25346 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25347 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25348 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25349 return SDValue();
25350
25351 SDLoc dl(Op);
25352 unsigned Opc = Op.getOpcode();
25353 unsigned NumElts = VT.getVectorNumElements();
25354
25355 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25356 // For 512-bit vectors, we need 128-bits or 256-bits.
25357 if (InVT.getSizeInBits() > 128) {
25358 // Input needs to be at least the same number of elements as output, and
25359 // at least 128-bits.
25360 int InSize = InSVT.getSizeInBits() * NumElts;
25361 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25362 InVT = In.getSimpleValueType();
25363 }
25364
25365 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25366 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25367 // need to be handled here for 256/512-bit results.
25368 if (Subtarget.hasInt256()) {
25369 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25370
25371 if (InVT.getVectorNumElements() != NumElts)
25372 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25373
25374 // FIXME: Apparently we create inreg operations that could be regular
25375 // extends.
25376 unsigned ExtOpc =
25377         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25378                                              : ISD::ZERO_EXTEND;
25379     return DAG.getNode(ExtOpc, dl, VT, In);
25380 }
25381
25382 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25383 if (Subtarget.hasAVX()) {
25384 assert(VT.is256BitVector() && "256-bit vector expected");
25385 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25386 int HalfNumElts = HalfVT.getVectorNumElements();
25387
25388 unsigned NumSrcElts = InVT.getVectorNumElements();
25389 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25390 for (int i = 0; i != HalfNumElts; ++i)
25391 HiMask[i] = HalfNumElts + i;
25392
25393 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25394 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25395 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25396 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25397 }
25398
25399 // We should only get here for sign extend.
25400 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25401 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25402 unsigned InNumElts = InVT.getVectorNumElements();
25403
25404 // If the source elements are already all-signbits, we don't need to extend,
25405 // just splat the elements.
25406 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25407 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25408 unsigned Scale = InNumElts / NumElts;
25409 SmallVector<int, 16> ShuffleMask;
25410 for (unsigned I = 0; I != NumElts; ++I)
25411 ShuffleMask.append(Scale, I);
25412 return DAG.getBitcast(VT,
25413 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25414 }
25415
25416 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25417 SDValue Curr = In;
25418 SDValue SignExt = Curr;
25419
25420 // As SRAI is only available on i16/i32 types, we expand only up to i32
25421 // and handle i64 separately.
25422 if (InVT != MVT::v4i32) {
25423 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25424
25425 unsigned DestWidth = DestVT.getScalarSizeInBits();
25426 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25427 unsigned DestElts = DestVT.getVectorNumElements();
25428
25429 // Build a shuffle mask that takes each input element and places it in the
25430 // MSBs of the new element size.
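    // e.g. for a v4i8 -> v4i32 sign extension on SSE2, each source byte is
    // placed in the most significant byte of its i32 lane, and the VSRAI below
    // shifts the value (and its sign) back down.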
25431 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25432 for (unsigned i = 0; i != DestElts; ++i)
25433 Mask[i * Scale + (Scale - 1)] = i;
25434
25435 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25436 Curr = DAG.getBitcast(DestVT, Curr);
25437
25438 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25439 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25440 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25441 }
25442
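  // For v2i64, the upper 32 bits of each element are the sign of the lower 32
  // bits: compute them with a (0 > x) compare and interleave them with the
  // sign-extended low halves.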
25443 if (VT == MVT::v2i64) {
25444 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25445 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25446 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25447 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25448 SignExt = DAG.getBitcast(VT, SignExt);
25449 }
25450
25451 return SignExt;
25452}
25453
25454 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25455                                 SelectionDAG &DAG) {
25456 MVT VT = Op->getSimpleValueType(0);
25457 SDValue In = Op->getOperand(0);
25458 MVT InVT = In.getSimpleValueType();
25459 SDLoc dl(Op);
25460
25461 if (InVT.getVectorElementType() == MVT::i1)
25462 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25463
25464 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25465   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25466          "Expected same number of elements");
25467 assert((VT.getVectorElementType() == MVT::i16 ||
25468 VT.getVectorElementType() == MVT::i32 ||
25469 VT.getVectorElementType() == MVT::i64) &&
25470 "Unexpected element type");
25471 assert((InVT.getVectorElementType() == MVT::i8 ||
25472 InVT.getVectorElementType() == MVT::i16 ||
25473 InVT.getVectorElementType() == MVT::i32) &&
25474 "Unexpected element type");
25475
25476 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25477 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25478 return splitVectorIntUnary(Op, DAG, dl);
25479 }
25480
25481 if (Subtarget.hasInt256())
25482 return Op;
25483
25484 // Optimize vectors in AVX mode
25485 // Sign extend v8i16 to v8i32 and
25486 // v4i32 to v4i64
25487 //
25488 // Divide input vector into two parts
25489 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25490 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25491 // concat the vectors to original VT
25492 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25493 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25494
25495 unsigned NumElems = InVT.getVectorNumElements();
25496 SmallVector<int,8> ShufMask(NumElems, -1);
25497 for (unsigned i = 0; i != NumElems/2; ++i)
25498 ShufMask[i] = i + NumElems/2;
25499
25500 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25501 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25502
25503 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25504}
25505
25506/// Change a vector store into a pair of half-size vector stores.
25507 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25508   SDValue StoredVal = Store->getValue();
25509 assert((StoredVal.getValueType().is256BitVector() ||
25510 StoredVal.getValueType().is512BitVector()) &&
25511 "Expecting 256/512-bit op");
25512
25513 // Splitting volatile memory ops is not allowed unless the operation was not
25514 // legal to begin with. Assume the input store is legal (this transform is
25515 // only used for targets with AVX). Note: It is possible that we have an
25516 // illegal type like v2i128, and so we could allow splitting a volatile store
25517 // in that case if that is important.
25518 if (!Store->isSimple())
25519 return SDValue();
25520
25521 SDLoc DL(Store);
25522 SDValue Value0, Value1;
25523 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25524 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25525 SDValue Ptr0 = Store->getBasePtr();
25526 SDValue Ptr1 =
25527 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25528 SDValue Ch0 =
25529 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25530 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25531 SDValue Ch1 =
25532 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25533 Store->getPointerInfo().getWithOffset(HalfOffset),
25534 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25535 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25536}
25537
25538/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25539/// type.
25540 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25541                                     SelectionDAG &DAG) {
25542 SDValue StoredVal = Store->getValue();
25543 assert(StoreVT.is128BitVector() &&
25544 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25545 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25546
25547 // Splitting volatile memory ops is not allowed unless the operation was not
25548 // legal to begin with. We are assuming the input op is legal (this transform
25549 // is only used for targets with AVX).
25550 if (!Store->isSimple())
25551 return SDValue();
25552
25553 MVT StoreSVT = StoreVT.getScalarType();
25554 unsigned NumElems = StoreVT.getVectorNumElements();
25555 unsigned ScalarSize = StoreSVT.getStoreSize();
25556
25557 SDLoc DL(Store);
25558   SmallVector<SDValue, 4> Stores;
25559   for (unsigned i = 0; i != NumElems; ++i) {
25560 unsigned Offset = i * ScalarSize;
25561 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25562                                            TypeSize::getFixed(Offset), DL);
25563     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25564 DAG.getVectorIdxConstant(i, DL));
25565 SDValue Ch =
25566 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25567 Store->getPointerInfo().getWithOffset(Offset),
25568 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25569 Stores.push_back(Ch);
25570 }
25571 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25572}
25573
25574static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25575 SelectionDAG &DAG) {
25576 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25577 SDLoc dl(St);
25578 SDValue StoredVal = St->getValue();
25579
25580 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25581 if (StoredVal.getValueType().isVector() &&
25582 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25583 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25584 assert(NumElts <= 8 && "Unexpected VT");
25585 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25586 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25587 "Expected AVX512F without AVX512DQI");
25588
25589 // We must pad with zeros to ensure we store zeroes to any unused bits.
25590 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25591 DAG.getUNDEF(MVT::v16i1), StoredVal,
25592 DAG.getVectorIdxConstant(0, dl));
25593 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25594 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25595 // Make sure we store zeros in the extra bits.
25596 if (NumElts < 8)
25597 StoredVal = DAG.getZeroExtendInReg(
25598 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25599
25600 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25601 St->getPointerInfo(), St->getBaseAlign(),
25602 St->getMemOperand()->getFlags());
25603 }
25604
25605 if (St->isTruncatingStore())
25606 return SDValue();
25607
25608 // If this is a 256/512-bit store of concatenated ops, we are better off
25609 // splitting that store into two half-size stores. This avoids spurious use of
25610 // concatenated ops and each half can execute independently. Some cores would
25611 // split the op into halves anyway, so the concat is purely an extra op.
25612 MVT StoreVT = StoredVal.getSimpleValueType();
25613 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25614 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25615 return splitVectorStore(St, DAG);
25616 return SDValue();
25617 }
25618
25619 if (StoreVT.is32BitVector())
25620 return SDValue();
25621
25622 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25623 assert(StoreVT.is64BitVector() && "Unexpected VT");
25624 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25625              TargetLowering::TypeWidenVector &&
25626          "Unexpected type action!");
25627
25628 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25629 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25630 DAG.getUNDEF(StoreVT));
25631
25632 if (Subtarget.hasSSE2()) {
25633 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25634 // and store it.
25635 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25636 MVT CastVT = MVT::getVectorVT(StVT, 2);
25637 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25638 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25639 DAG.getVectorIdxConstant(0, dl));
25640
25641 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25642 St->getPointerInfo(), St->getBaseAlign(),
25643 St->getMemOperand()->getFlags());
25644 }
25645 assert(Subtarget.hasSSE1() && "Expected SSE");
25646 SDVTList Tys = DAG.getVTList(MVT::Other);
25647 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25648 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25649 St->getMemOperand());
25650}
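// A minimal standalone sketch (not part of this file), using a hypothetical
// helper name, of the v2i1/v4i1/v8i1 store path above: the i1 lanes end up in
// the low NumElts bits of an i8 and the unused high bits are forced to zero,
// which is what the INSERT_SUBVECTOR + bitcast + TRUNCATE + zero-extend-in-reg
// sequence produces.
#include <cassert>
#include <cstdint>
static uint8_t packMaskBitsForStore(const bool *Lanes, unsigned NumElts) {
  assert(NumElts <= 8 && "only v2i1/v4i1/v8i1 stores take this path");
  uint8_t Bits = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    Bits |= uint8_t(Lanes[I]) << I; // lane i lands in bit i
  return Bits;                      // bits [NumElts, 8) stay zero
}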
25651
25652// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25653// may emit an illegal shuffle but the expansion is still better than scalar
25654 // code. We generate sext/sext_invec for SEXTLOADs if available, otherwise
25655 // we'll emit a shuffle and an arithmetic shift.
25656// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25657// TODO: It is possible to support ZExt by zeroing the undef values during
25658// the shuffle phase or after the shuffle.
25659static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25660 SelectionDAG &DAG) {
25661 MVT RegVT = Op.getSimpleValueType();
25662 assert(RegVT.isVector() && "We only custom lower vector loads.");
25663 assert(RegVT.isInteger() &&
25664 "We only custom lower integer vector loads.");
25665
25666 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25667 SDLoc dl(Ld);
25668
25669 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25670 if (RegVT.getVectorElementType() == MVT::i1) {
25671 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25672 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25673 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25674 "Expected AVX512F without AVX512DQI");
25675
25676 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25677 Ld->getPointerInfo(), Ld->getBaseAlign(),
25678 Ld->getMemOperand()->getFlags());
25679
25680 // Replace chain users with the new chain.
25681 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25682
25683 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25684 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25685 DAG.getBitcast(MVT::v16i1, Val),
25686 DAG.getVectorIdxConstant(0, dl));
25687 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25688 }
25689
25690 return SDValue();
25691}
25692
25693/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25694/// each of which has no other use apart from the AND / OR.
25695static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25696 Opc = Op.getOpcode();
25697 if (Opc != ISD::OR && Opc != ISD::AND)
25698 return false;
25699 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25700 Op.getOperand(0).hasOneUse() &&
25701 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25702 Op.getOperand(1).hasOneUse());
25703}
25704
25705SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25706 SDValue Chain = Op.getOperand(0);
25707 SDValue Cond = Op.getOperand(1);
25708 SDValue Dest = Op.getOperand(2);
25709 SDLoc dl(Op);
25710
25711 // Bail out when we don't have native compare instructions.
25712 if (Cond.getOpcode() == ISD::SETCC &&
25713 Cond.getOperand(0).getValueType() != MVT::f128 &&
25714 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25715 SDValue LHS = Cond.getOperand(0);
25716 SDValue RHS = Cond.getOperand(1);
25717 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25718
25719 // Special case for
25720 // setcc([su]{add,sub,mul}o == 0)
25721 // setcc([su]{add,sub,mul}o != 1)
25722 if (ISD::isOverflowIntrOpRes(LHS) &&
25723 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25724 (isNullConstant(RHS) || isOneConstant(RHS))) {
25725 SDValue Value, Overflow;
25726 X86::CondCode X86Cond;
25727 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25728
25729 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25730 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25731
25732 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25733 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25734 Overflow, Op->getFlags());
25735 }
25736
25737 if (LHS.getSimpleValueType().isInteger()) {
25738 SDValue CCVal;
25739 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25740 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25741 EFLAGS, Op->getFlags());
25742 }
25743
25744 if (CC == ISD::SETOEQ) {
25745 // For FCMP_OEQ, we can emit
25746 // two branches instead of an explicit AND instruction with a
25747 // separate test. However, we only do this if this block doesn't
25748 // have a fall-through edge, because this requires an explicit
25749 // jmp when the condition is false.
25750 if (Op.getNode()->hasOneUse()) {
25751 SDNode *User = *Op.getNode()->user_begin();
25752 // Look for an unconditional branch following this conditional branch.
25753 // We need this because we need to reverse the successors in order
25754 // to implement FCMP_OEQ.
25755 if (User->getOpcode() == ISD::BR) {
25756 SDValue FalseBB = User->getOperand(1);
25757 SDNode *NewBR =
25758 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25759 assert(NewBR == User);
25760 (void)NewBR;
25761 Dest = FalseBB;
25762
25763 SDValue Cmp =
25764 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25765 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25766 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25767 CCVal, Cmp, Op->getFlags());
25768 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25769 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25770 Cmp, Op->getFlags());
25771 }
25772 }
25773 } else if (CC == ISD::SETUNE) {
25774 // For FCMP_UNE, we can emit
25775 // two branches instead of an explicit OR instruction with a
25776 // separate test.
25777 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25778 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25779 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25780 Cmp, Op->getFlags());
25781 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25782 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25783 Cmp, Op->getFlags());
25784 } else {
25785 X86::CondCode X86Cond =
25786 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25787 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25788 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25789 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25790 Cmp, Op->getFlags());
25791 }
25792 }
25793
25794 if (ISD::isOverflowIntrOpRes(Cond)) {
25795 SDValue Value, Overflow;
25796 X86::CondCode X86Cond;
25797 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25798
25799 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25800 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25801 Overflow, Op->getFlags());
25802 }
25803
25804 // Look past the truncate if the high bits are known zero.
25805 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25806 Cond = Cond.getOperand(0);
25807
25808 EVT CondVT = Cond.getValueType();
25809
25810 // Add an AND with 1 if we don't already have one.
25811 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25812 Cond =
25813 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25814
25815 SDValue LHS = Cond;
25816 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25817
25818 SDValue CCVal;
25819 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25820 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25821 Op->getFlags());
25822}
25823
25824// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25825// Calls to _alloca are needed to probe the stack when allocating more than 4k
25826// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25827// that the guard pages used by the OS virtual memory manager are allocated in
25828// correct sequence.
25829SDValue
25830X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25831 SelectionDAG &DAG) const {
25832 MachineFunction &MF = DAG.getMachineFunction();
25833 bool SplitStack = MF.shouldSplitStack();
25834 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25835 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25836 SplitStack || EmitStackProbeCall;
25837 SDLoc dl(Op);
25838
25839 // Get the inputs.
25840 SDNode *Node = Op.getNode();
25841 SDValue Chain = Op.getOperand(0);
25842 SDValue Size = Op.getOperand(1);
25843 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25844 EVT VT = Node->getValueType(0);
25845
25846 // Chain the dynamic stack allocation so that it doesn't modify the stack
25847 // pointer when other instructions are using the stack.
25848 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25849
25850 bool Is64Bit = Subtarget.is64Bit();
25851 MVT SPTy = Op.getValueType().getSimpleVT();
25852
25853 SDValue Result;
25854 if (!Lower) {
25855 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25856 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25857 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25858 " not tell us which reg is the stack pointer!");
25859
25860 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25861 const Align StackAlign = TFI.getStackAlign();
25862 if (hasInlineStackProbe(MF)) {
25863 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25864 {Chain, Size});
25865 Chain = Result.getValue(1);
25866 } else {
25867 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25868 Chain = SP.getValue(1);
25869 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25870 }
25871 if (Alignment && *Alignment > StackAlign)
25872 Result = DAG.getNode(
25873 ISD::AND, dl, VT, Result,
25874 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25875 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25876 } else if (SplitStack) {
25877 if (Is64Bit) {
25878 // The 64-bit implementation of segmented stacks needs to clobber both r10
25879 // and r11. This makes it impossible to use it along with nested parameters.
25880 const Function &F = MF.getFunction();
25881 for (const auto &A : F.args()) {
25882 if (A.hasNestAttr())
25883 report_fatal_error("Cannot use segmented stacks with functions that "
25884 "have nested arguments.");
25885 }
25886 }
25887
25888 Result =
25889 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25890 Chain = Result.getValue(1);
25891 } else {
25892 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25893 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25894 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25895
25896 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25897 Register SPReg = RegInfo->getStackRegister();
25898 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25899 Chain = SP.getValue(1);
25900
25901 if (Alignment) {
25902 SP = DAG.getNode(
25903 ISD::AND, dl, VT, SP.getValue(0),
25904 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25905 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25906 }
25907
25908 Result = SP;
25909 }
25910
25911 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25912
25913 SDValue Ops[2] = {Result, Chain};
25914 return DAG.getMergeValues(Ops, dl);
25915}
25916
25917SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25918 MachineFunction &MF = DAG.getMachineFunction();
25919 SDValue Ptr = Op.getOperand(1);
25920 EVT PtrVT = Ptr.getValueType();
25921
25922 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25923
25924 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25925 SDLoc DL(Op);
25926
25927 if (!Subtarget.is64Bit() ||
25928 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25929 // vastart just stores the address of the VarArgsFrameIndex slot into the
25930 // memory location argument.
25931 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25932 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25933 }
25934
25935 // __va_list_tag:
25936 // gp_offset (0 - 6 * 8)
25937 // fp_offset (48 - 48 + 8 * 16)
25938 // overflow_arg_area (point to parameters coming in memory).
25939 // reg_save_area
25940 SmallVector<SDValue, 8> MemOps;
25941 SDValue FIN = Op.getOperand(1);
25942 // Store gp_offset
25943 SDValue Store = DAG.getStore(
25944 Op.getOperand(0), DL,
25945 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25946 MachinePointerInfo(SV));
25947 MemOps.push_back(Store);
25948
25949 // Store fp_offset
25950 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25951 Store = DAG.getStore(
25952 Op.getOperand(0), DL,
25953 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25954 MachinePointerInfo(SV, 4));
25955 MemOps.push_back(Store);
25956
25957 // Store ptr to overflow_arg_area
25958 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25959 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25960 Store =
25961 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25962 MemOps.push_back(Store);
25963
25964 // Store ptr to reg_save_area.
25965 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25966 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25967 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25968 Store = DAG.getStore(
25969 Op.getOperand(0), DL, RSFIN, FIN,
25970 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25971 MemOps.push_back(Store);
25972 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25973}
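// A minimal standalone sketch (not part of this file) of the SysV x86-64
// __va_list_tag record that the four stores above populate; the struct and
// field names here are illustrative. Field offsets match the 0/4/8/16 byte
// offsets used for LP64; the X32 ABI uses 4-byte pointers, so the last two
// fields land at offsets 8 and 12 instead.
#include <cstdint>
struct VaListTagSketch {
  uint32_t gp_offset;      // next unused GP register slot (0..48)
  uint32_t fp_offset;      // next unused XMM register slot (48..176)
  void *overflow_arg_area; // arguments passed on the stack
  void *reg_save_area;     // register save area in the caller's frame
};
static_assert(sizeof(void *) != 8 || sizeof(VaListTagSketch) == 24,
              "LP64 layout is 24 bytes");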
25974
25975SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25976 assert(Subtarget.is64Bit() &&
25977 "LowerVAARG only handles 64-bit va_arg!");
25978 assert(Op.getNumOperands() == 4);
25979
25980 MachineFunction &MF = DAG.getMachineFunction();
25981 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25982 // The Win64 ABI uses char* instead of a structure.
25983 return DAG.expandVAArg(Op.getNode());
25984
25985 SDValue Chain = Op.getOperand(0);
25986 SDValue SrcPtr = Op.getOperand(1);
25987 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25988 unsigned Align = Op.getConstantOperandVal(3);
25989 SDLoc dl(Op);
25990
25991 EVT ArgVT = Op.getNode()->getValueType(0);
25992 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25993 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25994 uint8_t ArgMode;
25995
25996 // Decide which area this value should be read from.
25997 // TODO: Implement the AMD64 ABI in its entirety. This simple
25998 // selection mechanism works only for the basic types.
25999 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26000 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26001 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26002 } else {
26003 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26004 "Unhandled argument type in LowerVAARG");
26005 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26006 }
26007
26008 if (ArgMode == 2) {
26009 // Make sure using fp_offset makes sense.
26010 assert(!Subtarget.useSoftFloat() &&
26011 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26012 Subtarget.hasSSE1());
26013 }
26014
26015 // Insert VAARG node into the DAG
26016 // VAARG returns two values: Variable Argument Address, Chain
26017 SDValue InstOps[] = {Chain, SrcPtr,
26018 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26019 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26020 DAG.getTargetConstant(Align, dl, MVT::i32)};
26021 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26022 SDValue VAARG = DAG.getMemIntrinsicNode(
26023 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26024 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26025 /*Alignment=*/std::nullopt,
26026 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26027 Chain = VAARG.getValue(1);
26028
26029 // Load the next argument and return it
26030 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26031}
26032
26033static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26034 SelectionDAG &DAG) {
26035 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26036 // where a va_list is still an i8*.
26037 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26038 if (Subtarget.isCallingConvWin64(
26039 DAG.getMachineFunction().getFunction().getCallingConv()))
26040 // Probably a Win64 va_copy.
26041 return DAG.expandVACopy(Op.getNode());
26042
26043 SDValue Chain = Op.getOperand(0);
26044 SDValue DstPtr = Op.getOperand(1);
26045 SDValue SrcPtr = Op.getOperand(2);
26046 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26047 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26048 SDLoc DL(Op);
26049
26050 return DAG.getMemcpy(
26051 Chain, DL, DstPtr, SrcPtr,
26052 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26053 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26054 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26055 MachinePointerInfo(SrcSV));
26056}
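// A minimal standalone sketch (not part of this file), with a hypothetical
// helper name, of where the 24/16 byte memcpy size above comes from: two i32
// offsets plus two pointers of the target's pointer width (8 bytes for LP64,
// 4 for the X32 ILP32 ABI).
static constexpr unsigned vaListCopyBytes(bool IsLP64) {
  return 4 + 4 + 2 * (IsLP64 ? 8u : 4u);
}
static_assert(vaListCopyBytes(true) == 24 && vaListCopyBytes(false) == 16,
              "matches the sizes passed to DAG.getMemcpy above");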
26057
26058// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26059static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26060 switch (Opc) {
26061 case ISD::SHL:
26062 case X86ISD::VSHL:
26063 case X86ISD::VSHLI:
26064 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26065 case ISD::SRL:
26066 case X86ISD::VSRL:
26067 case X86ISD::VSRLI:
26068 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26069 case ISD::SRA:
26070 case X86ISD::VSRA:
26071 case X86ISD::VSRAI:
26072 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26073 }
26074 llvm_unreachable("Unknown target vector shift node");
26075}
26076
26077/// Handle vector element shifts where the shift amount is a constant.
26078/// Takes immediate version of shift as input.
26079static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26080 SDValue SrcOp, uint64_t ShiftAmt,
26081 SelectionDAG &DAG) {
26082 MVT ElementType = VT.getVectorElementType();
26083
26084 // Bitcast the source vector to the output type, this is mainly necessary for
26085 // vXi8/vXi64 shifts.
26086 if (VT != SrcOp.getSimpleValueType())
26087 SrcOp = DAG.getBitcast(VT, SrcOp);
26088
26089 // Fold this packed shift into its first operand if ShiftAmt is 0.
26090 if (ShiftAmt == 0)
26091 return SrcOp;
26092
26093 // Check for ShiftAmt >= element width
26094 if (ShiftAmt >= ElementType.getSizeInBits()) {
26095 if (Opc == X86ISD::VSRAI)
26096 ShiftAmt = ElementType.getSizeInBits() - 1;
26097 else
26098 return DAG.getConstant(0, dl, VT);
26099 }
26100
26101 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26102 && "Unknown target vector shift-by-constant node");
26103
26104 // Fold this packed vector shift into a build vector if SrcOp is a
26105 // vector of Constants or UNDEFs.
26106 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26107 unsigned ShiftOpc;
26108 switch (Opc) {
26109 default: llvm_unreachable("Unknown opcode!");
26110 case X86ISD::VSHLI:
26111 ShiftOpc = ISD::SHL;
26112 break;
26113 case X86ISD::VSRLI:
26114 ShiftOpc = ISD::SRL;
26115 break;
26116 case X86ISD::VSRAI:
26117 ShiftOpc = ISD::SRA;
26118 break;
26119 }
26120
26121 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26122 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26123 return C;
26124 }
26125
26126 return DAG.getNode(Opc, dl, VT, SrcOp,
26127 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26128}
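// A minimal standalone sketch (not part of this file) of the out-of-range
// handling in getTargetVShiftByConstNode, modelled on a single i32 lane with
// hypothetical names: logical shifts by an amount >= the element width fold
// the lane to zero, while arithmetic right shifts clamp the amount to
// EltBits - 1 so the result is a splat of the sign bit.
#include <cstdint>
enum class ShiftKind { LogicalLeft, LogicalRight, ArithmeticRight };
static int32_t shiftLaneModel(int32_t Lane, uint64_t Amt, ShiftKind Kind) {
  constexpr unsigned EltBits = 32;
  if (Amt >= EltBits) {
    if (Kind != ShiftKind::ArithmeticRight)
      return 0;        // VSHLI/VSRLI: the whole lane is shifted out
    Amt = EltBits - 1; // VSRAI: clamp, leaving only copies of the sign bit
  }
  switch (Kind) {
  case ShiftKind::LogicalLeft:
    return int32_t(uint32_t(Lane) << Amt);
  case ShiftKind::LogicalRight:
    return int32_t(uint32_t(Lane) >> Amt);
  case ShiftKind::ArithmeticRight:
    return int32_t(Lane >> Amt); // arithmetic shift of the signed lane
  }
  return 0;
}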
26129
26130/// Handle vector element shifts by a splat shift amount
26131static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26132 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26133 const X86Subtarget &Subtarget,
26134 SelectionDAG &DAG) {
26135 MVT AmtVT = ShAmt.getSimpleValueType();
26136 assert(AmtVT.isVector() && "Vector shift type mismatch");
26137 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26138 "Illegal vector splat index");
26139
26140 // Move the splat element to the bottom element.
26141 if (ShAmtIdx != 0) {
26142 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26143 Mask[0] = ShAmtIdx;
26144 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26145 }
26146
26147 // Peek through any zext node if we can get back to a 128-bit source.
26148 if (AmtVT.getScalarSizeInBits() == 64 &&
26149 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26150 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26151 ShAmt.getOperand(0).getValueType().isSimple() &&
26152 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26153 ShAmt = ShAmt.getOperand(0);
26154 AmtVT = ShAmt.getSimpleValueType();
26155 }
26156
26157 // See if we can mask off the upper elements using the existing source node.
26158 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26159 // do this for vXi64 types.
26160 bool IsMasked = false;
26161 if (AmtVT.getScalarSizeInBits() < 64) {
26162 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26163 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26164 // If the shift amount has come from a scalar, then zero-extend the scalar
26165 // before moving to the vector.
26166 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26167 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26168 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26169 AmtVT = MVT::v4i32;
26170 IsMasked = true;
26171 } else if (ShAmt.getOpcode() == ISD::AND) {
26172 // See if the shift amount is already masked (e.g. for rotation modulo),
26173 // then we can zero-extend it by setting all the other mask elements to
26174 // zero.
26175 SmallVector<SDValue> MaskElts(
26176 AmtVT.getVectorNumElements(),
26177 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26178 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26179 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26180 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26181 {ShAmt.getOperand(1), Mask}))) {
26182 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26183 IsMasked = true;
26184 }
26185 }
26186 }
26187
26188 // Extract if the shift amount vector is larger than 128-bits.
26189 if (AmtVT.getSizeInBits() > 128) {
26190 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26191 AmtVT = ShAmt.getSimpleValueType();
26192 }
26193
26194 // Zero-extend bottom element to v2i64 vector type, either by extension or
26195 // shuffle masking.
26196 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26197 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26198 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26199 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26200 } else if (Subtarget.hasSSE41()) {
26201 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26202 MVT::v2i64, ShAmt);
26203 } else {
26204 SDValue ByteShift = DAG.getTargetConstant(
26205 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26206 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26207 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26208 ByteShift);
26209 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26210 ByteShift);
26211 }
26212 }
26213
26214 // Change opcode to non-immediate version.
26215 Opc = getTargetVShiftUniformOpcode(Opc, true);
26216
26217 // The return type has to be a 128-bit type with the same element
26218 // type as the input type.
26219 MVT EltVT = VT.getVectorElementType();
26220 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26221
26222 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26223 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26224}
26225
26226/// Return Mask with the necessary casting or extending
26227/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26228static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26229 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26230 const SDLoc &dl) {
26231
26232 if (isAllOnesConstant(Mask))
26233 return DAG.getConstant(1, dl, MaskVT);
26234 if (X86::isZeroNode(Mask))
26235 return DAG.getConstant(0, dl, MaskVT);
26236
26237 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26238
26239 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26240 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26241 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26242 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
26243 SDValue Lo, Hi;
26244 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26245 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26246 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26247 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26248 } else {
26249 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26250 Mask.getSimpleValueType().getSizeInBits());
26251 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
26252 // are extracted by EXTRACT_SUBVECTOR.
26253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26254 DAG.getBitcast(BitcastVT, Mask),
26255 DAG.getVectorIdxConstant(0, dl));
26256 }
26257}
26258
26259/// Return (and \p Op, \p Mask) for compare instructions or
26260/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26261/// necessary casting or extending for \p Mask when lowering masking intrinsics
26262 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26263 SDValue PreservedSrc,
26264 const X86Subtarget &Subtarget,
26265 SelectionDAG &DAG) {
26266 MVT VT = Op.getSimpleValueType();
26267 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26268 unsigned OpcodeSelect = ISD::VSELECT;
26269 SDLoc dl(Op);
26270
26271 if (isAllOnesConstant(Mask))
26272 return Op;
26273
26274 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26275
26276 if (PreservedSrc.isUndef())
26277 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26278 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26279}
26280
26281/// Creates an SDNode for a predicated scalar operation.
26282/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26283/// The mask is coming as MVT::i8 and it should be transformed
26284/// to MVT::v1i1 while lowering masking intrinsics.
26285/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26286/// "X86select" instead of "vselect". We just can't create the "vselect" node
26287/// for a scalar instruction.
26288 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26289 SDValue PreservedSrc,
26290 const X86Subtarget &Subtarget,
26291 SelectionDAG &DAG) {
26292 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26293 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26294 return Op;
26295
26296 MVT VT = Op.getSimpleValueType();
26297 SDLoc dl(Op);
26298
26299 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26300 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26301 DAG.getBitcast(MVT::v8i1, Mask),
26302 DAG.getVectorIdxConstant(0, dl));
26303 if (Op.getOpcode() == X86ISD::FSETCCM ||
26304 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26305 Op.getOpcode() == X86ISD::VFPCLASSS)
26306 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26307
26308 if (PreservedSrc.isUndef())
26309 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26310
26311 if (MaskConst) {
26312 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26313 // Discard op and blend passthrough with scalar op src/dst.
26314 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26315 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26316 ShuffleMask[0] = VT.getVectorNumElements();
26317 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26318 ShuffleMask);
26319 }
26320
26321 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26322}
26323
26324 static int getSEHRegistrationNodeSize(const Function *Fn) {
26325 if (!Fn->hasPersonalityFn())
26326 report_fatal_error(
26327 "querying registration node size for function without personality");
26328 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26329 // WinEHStatePass for the full struct definition.
26330 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26331 case EHPersonality::MSVC_X86SEH: return 24;
26332 case EHPersonality::MSVC_CXX: return 16;
26333 default: break;
26334 }
26336 "can only recover FP for 32-bit MSVC EH personality functions");
26337}
26338
26339/// When the MSVC runtime transfers control to us, either to an outlined
26340/// function or when returning to a parent frame after catching an exception, we
26341/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26342/// Here's the math:
26343/// RegNodeBase = EntryEBP - RegNodeSize
26344/// ParentFP = RegNodeBase - ParentFrameOffset
26345/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26346/// subtracting the offset (negative on x86) takes us back to the parent FP.
26347 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26348 SDValue EntryEBP) {
26349 MachineFunction &MF = DAG.getMachineFunction();
26350 SDLoc dl;
26351
26352 // It's possible that the parent function no longer has a personality function
26353 // if the exceptional code was optimized away, in which case we just return
26354 // the incoming EBP.
26355 if (!Fn->hasPersonalityFn())
26356 return EntryEBP;
26357
26358 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26359 // registration, or the .set_setframe offset.
26360 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26361 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26362 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26363 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26364 SDValue ParentFrameOffset =
26365 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26366
26367 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26368 // prologue to RBP in the parent function.
26369 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26370 if (Subtarget.is64Bit())
26371 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26372
26373 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26374 // RegNodeBase = EntryEBP - RegNodeSize
26375 // ParentFP = RegNodeBase - ParentFrameOffset
26376 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26377 DAG.getConstant(RegNodeSize, dl, PtrVT));
26378 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26379}
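// A minimal standalone sketch (not part of this file), with hypothetical
// names, of the 32-bit arithmetic performed above. RegNodeSize is what
// getSEHRegistrationNodeSize returns (24 bytes for the MSVC SEH personality,
// 16 for the C++ EH personality) and ParentFrameOffset is the value that
// ISD::LOCAL_RECOVER resolves; it is typically negative on x86.
#include <cstdint>
static uint32_t recoverParentFP32(uint32_t EntryEBP, uint32_t RegNodeSize,
                                  int32_t ParentFrameOffset) {
  uint32_t RegNodeBase = EntryEBP - RegNodeSize;    // RegNodeBase = EntryEBP - RegNodeSize
  return RegNodeBase - uint32_t(ParentFrameOffset); // ParentFP = RegNodeBase - ParentFrameOffset
}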
26380
26381SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26382 SelectionDAG &DAG) const {
26383 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26384 auto isRoundModeCurDirection = [](SDValue Rnd) {
26385 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26386 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26387
26388 return false;
26389 };
26390 auto isRoundModeSAE = [](SDValue Rnd) {
26391 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26392 unsigned RC = C->getZExtValue();
26393 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26394 // Clear the NO_EXC bit and check remaining bits.
26395 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26396 // As a convenience we allow no other bits or explicitly
26397 // current direction.
26398 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26399 }
26400 }
26401
26402 return false;
26403 };
26404 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26405 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26406 RC = C->getZExtValue();
26407 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26408 // Clear the NO_EXC bit and check remaining bits.
26409 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26410 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26411 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26412 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26413 RC == X86::STATIC_ROUNDING::TO_ZERO;
26414 }
26415 }
26416
26417 return false;
26418 };
26419
26420 SDLoc dl(Op);
26421 unsigned IntNo = Op.getConstantOperandVal(0);
26422 MVT VT = Op.getSimpleValueType();
26423 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26424
26425 // Propagate flags from original node to transformed node(s).
26426 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26427
26428 if (IntrData) {
26429 switch(IntrData->Type) {
26430 case INTR_TYPE_1OP: {
26431 // We specify 2 possible opcodes for intrinsics with rounding modes.
26432 // First, we check if the intrinsic may have non-default rounding mode,
26433 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26434 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26435 if (IntrWithRoundingModeOpcode != 0) {
26436 SDValue Rnd = Op.getOperand(2);
26437 unsigned RC = 0;
26438 if (isRoundModeSAEToX(Rnd, RC))
26439 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26440 Op.getOperand(1),
26441 DAG.getTargetConstant(RC, dl, MVT::i32));
26442 if (!isRoundModeCurDirection(Rnd))
26443 return SDValue();
26444 }
26445 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26446 Op.getOperand(1));
26447 }
26448 case INTR_TYPE_1OP_SAE: {
26449 SDValue Sae = Op.getOperand(2);
26450
26451 unsigned Opc;
26452 if (isRoundModeCurDirection(Sae))
26453 Opc = IntrData->Opc0;
26454 else if (isRoundModeSAE(Sae))
26455 Opc = IntrData->Opc1;
26456 else
26457 return SDValue();
26458
26459 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26460 }
26461 case INTR_TYPE_2OP: {
26462 SDValue Src2 = Op.getOperand(2);
26463
26464 // We specify 2 possible opcodes for intrinsics with rounding modes.
26465 // First, we check if the intrinsic may have non-default rounding mode,
26466 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26467 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26468 if (IntrWithRoundingModeOpcode != 0) {
26469 SDValue Rnd = Op.getOperand(3);
26470 unsigned RC = 0;
26471 if (isRoundModeSAEToX(Rnd, RC))
26472 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26473 Op.getOperand(1), Src2,
26474 DAG.getTargetConstant(RC, dl, MVT::i32));
26475 if (!isRoundModeCurDirection(Rnd))
26476 return SDValue();
26477 }
26478
26479 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26480 Op.getOperand(1), Src2);
26481 }
26482 case INTR_TYPE_2OP_SAE: {
26483 SDValue Sae = Op.getOperand(3);
26484
26485 unsigned Opc;
26486 if (isRoundModeCurDirection(Sae))
26487 Opc = IntrData->Opc0;
26488 else if (isRoundModeSAE(Sae))
26489 Opc = IntrData->Opc1;
26490 else
26491 return SDValue();
26492
26493 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26494 Op.getOperand(2));
26495 }
26496 case INTR_TYPE_3OP:
26497 case INTR_TYPE_3OP_IMM8: {
26498 SDValue Src1 = Op.getOperand(1);
26499 SDValue Src2 = Op.getOperand(2);
26500 SDValue Src3 = Op.getOperand(3);
26501
26502 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26503 Src3.getValueType() != MVT::i8) {
26504 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26505 }
26506
26507 // We specify 2 possible opcodes for intrinsics with rounding modes.
26508 // First, we check if the intrinsic may have non-default rounding mode,
26509 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26510 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26511 if (IntrWithRoundingModeOpcode != 0) {
26512 SDValue Rnd = Op.getOperand(4);
26513 unsigned RC = 0;
26514 if (isRoundModeSAEToX(Rnd, RC))
26515 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26516 Src1, Src2, Src3,
26517 DAG.getTargetConstant(RC, dl, MVT::i32));
26518 if (!isRoundModeCurDirection(Rnd))
26519 return SDValue();
26520 }
26521
26522 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26523 {Src1, Src2, Src3});
26524 }
26525 case INTR_TYPE_4OP_IMM8: {
26526 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26527 SDValue Src4 = Op.getOperand(4);
26528 if (Src4.getValueType() != MVT::i8) {
26529 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26530 }
26531
26532 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26533 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26534 Src4);
26535 }
26536 case INTR_TYPE_1OP_MASK: {
26537 SDValue Src = Op.getOperand(1);
26538 SDValue PassThru = Op.getOperand(2);
26539 SDValue Mask = Op.getOperand(3);
26540 // We add rounding mode to the Node when
26541 // - RC Opcode is specified and
26542 // - RC is not "current direction".
26543 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26544 if (IntrWithRoundingModeOpcode != 0) {
26545 SDValue Rnd = Op.getOperand(4);
26546 unsigned RC = 0;
26547 if (isRoundModeSAEToX(Rnd, RC))
26548 return getVectorMaskingNode(
26549 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26550 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26551 Mask, PassThru, Subtarget, DAG);
26552 if (!isRoundModeCurDirection(Rnd))
26553 return SDValue();
26554 }
26555 return getVectorMaskingNode(
26556 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26557 Subtarget, DAG);
26558 }
26559 case INTR_TYPE_1OP_MASK_SAE: {
26560 SDValue Src = Op.getOperand(1);
26561 SDValue PassThru = Op.getOperand(2);
26562 SDValue Mask = Op.getOperand(3);
26563 SDValue Rnd = Op.getOperand(4);
26564
26565 unsigned Opc;
26566 if (isRoundModeCurDirection(Rnd))
26567 Opc = IntrData->Opc0;
26568 else if (isRoundModeSAE(Rnd))
26569 Opc = IntrData->Opc1;
26570 else
26571 return SDValue();
26572
26573 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26574 Subtarget, DAG);
26575 }
26576 case INTR_TYPE_SCALAR_MASK: {
26577 SDValue Src1 = Op.getOperand(1);
26578 SDValue Src2 = Op.getOperand(2);
26579 SDValue passThru = Op.getOperand(3);
26580 SDValue Mask = Op.getOperand(4);
26581 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26582 // There are 2 kinds of intrinsics in this group:
26583 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26584 // (2) With rounding mode and sae - 7 operands.
26585 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26586 if (Op.getNumOperands() == (5U + HasRounding)) {
26587 if (HasRounding) {
26588 SDValue Rnd = Op.getOperand(5);
26589 unsigned RC = 0;
26590 if (isRoundModeSAEToX(Rnd, RC))
26591 return getScalarMaskingNode(
26592 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26593 DAG.getTargetConstant(RC, dl, MVT::i32)),
26594 Mask, passThru, Subtarget, DAG);
26595 if (!isRoundModeCurDirection(Rnd))
26596 return SDValue();
26597 }
26598 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26599 Src2),
26600 Mask, passThru, Subtarget, DAG);
26601 }
26602
26603 assert(Op.getNumOperands() == (6U + HasRounding) &&
26604 "Unexpected intrinsic form");
26605 SDValue RoundingMode = Op.getOperand(5);
26606 unsigned Opc = IntrData->Opc0;
26607 if (HasRounding) {
26608 SDValue Sae = Op.getOperand(6);
26609 if (isRoundModeSAE(Sae))
26610 Opc = IntrWithRoundingModeOpcode;
26611 else if (!isRoundModeCurDirection(Sae))
26612 return SDValue();
26613 }
26614 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26615 Src2, RoundingMode),
26616 Mask, passThru, Subtarget, DAG);
26617 }
26618 case INTR_TYPE_SCALAR_MASK_RND: {
26619 SDValue Src1 = Op.getOperand(1);
26620 SDValue Src2 = Op.getOperand(2);
26621 SDValue passThru = Op.getOperand(3);
26622 SDValue Mask = Op.getOperand(4);
26623 SDValue Rnd = Op.getOperand(5);
26624
26625 SDValue NewOp;
26626 unsigned RC = 0;
26627 if (isRoundModeCurDirection(Rnd))
26628 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26629 else if (isRoundModeSAEToX(Rnd, RC))
26630 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26631 DAG.getTargetConstant(RC, dl, MVT::i32));
26632 else
26633 return SDValue();
26634
26635 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26636 }
26637 case INTR_TYPE_SCALAR_MASK_SAE: {
26638 SDValue Src1 = Op.getOperand(1);
26639 SDValue Src2 = Op.getOperand(2);
26640 SDValue passThru = Op.getOperand(3);
26641 SDValue Mask = Op.getOperand(4);
26642 SDValue Sae = Op.getOperand(5);
26643 unsigned Opc;
26644 if (isRoundModeCurDirection(Sae))
26645 Opc = IntrData->Opc0;
26646 else if (isRoundModeSAE(Sae))
26647 Opc = IntrData->Opc1;
26648 else
26649 return SDValue();
26650
26651 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26652 Mask, passThru, Subtarget, DAG);
26653 }
26654 case INTR_TYPE_2OP_MASK: {
26655 SDValue Src1 = Op.getOperand(1);
26656 SDValue Src2 = Op.getOperand(2);
26657 SDValue PassThru = Op.getOperand(3);
26658 SDValue Mask = Op.getOperand(4);
26659 SDValue NewOp;
26660 if (IntrData->Opc1 != 0) {
26661 SDValue Rnd = Op.getOperand(5);
26662 unsigned RC = 0;
26663 if (isRoundModeSAEToX(Rnd, RC))
26664 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26665 DAG.getTargetConstant(RC, dl, MVT::i32));
26666 else if (!isRoundModeCurDirection(Rnd))
26667 return SDValue();
26668 }
26669 if (!NewOp)
26670 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26671 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26672 }
26673 case INTR_TYPE_2OP_MASK_SAE: {
26674 SDValue Src1 = Op.getOperand(1);
26675 SDValue Src2 = Op.getOperand(2);
26676 SDValue PassThru = Op.getOperand(3);
26677 SDValue Mask = Op.getOperand(4);
26678
26679 unsigned Opc = IntrData->Opc0;
26680 if (IntrData->Opc1 != 0) {
26681 SDValue Sae = Op.getOperand(5);
26682 if (isRoundModeSAE(Sae))
26683 Opc = IntrData->Opc1;
26684 else if (!isRoundModeCurDirection(Sae))
26685 return SDValue();
26686 }
26687
26688 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26689 Mask, PassThru, Subtarget, DAG);
26690 }
26691 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26692 SDValue Src1 = Op.getOperand(1);
26693 SDValue Src2 = Op.getOperand(2);
26694 SDValue Src3 = Op.getOperand(3);
26695 SDValue PassThru = Op.getOperand(4);
26696 SDValue Mask = Op.getOperand(5);
26697 SDValue Sae = Op.getOperand(6);
26698 unsigned Opc;
26699 if (isRoundModeCurDirection(Sae))
26700 Opc = IntrData->Opc0;
26701 else if (isRoundModeSAE(Sae))
26702 Opc = IntrData->Opc1;
26703 else
26704 return SDValue();
26705
26706 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26707 Mask, PassThru, Subtarget, DAG);
26708 }
26709 case INTR_TYPE_3OP_MASK_SAE: {
26710 SDValue Src1 = Op.getOperand(1);
26711 SDValue Src2 = Op.getOperand(2);
26712 SDValue Src3 = Op.getOperand(3);
26713 SDValue PassThru = Op.getOperand(4);
26714 SDValue Mask = Op.getOperand(5);
26715
26716 unsigned Opc = IntrData->Opc0;
26717 if (IntrData->Opc1 != 0) {
26718 SDValue Sae = Op.getOperand(6);
26719 if (isRoundModeSAE(Sae))
26720 Opc = IntrData->Opc1;
26721 else if (!isRoundModeCurDirection(Sae))
26722 return SDValue();
26723 }
26724 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26725 Mask, PassThru, Subtarget, DAG);
26726 }
26727 case BLENDV: {
26728 SDValue Src1 = Op.getOperand(1);
26729 SDValue Src2 = Op.getOperand(2);
26730 SDValue Src3 = Op.getOperand(3);
26731
26732 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26733 Src3 = DAG.getBitcast(MaskVT, Src3);
26734
26735 // Reverse the operands to match VSELECT order.
26736 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26737 }
26738 case VPERM_2OP : {
26739 SDValue Src1 = Op.getOperand(1);
26740 SDValue Src2 = Op.getOperand(2);
26741
26742 // Swap Src1 and Src2 in the node creation
26743 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26744 }
26745 case CFMA_OP_MASKZ:
26746 case CFMA_OP_MASK: {
26747 SDValue Src1 = Op.getOperand(1);
26748 SDValue Src2 = Op.getOperand(2);
26749 SDValue Src3 = Op.getOperand(3);
26750 SDValue Mask = Op.getOperand(4);
26751 MVT VT = Op.getSimpleValueType();
26752
26753 SDValue PassThru = Src3;
26754 if (IntrData->Type == CFMA_OP_MASKZ)
26755 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26756
26757 // We add rounding mode to the Node when
26758 // - RC Opcode is specified and
26759 // - RC is not "current direction".
26760 SDValue NewOp;
26761 if (IntrData->Opc1 != 0) {
26762 SDValue Rnd = Op.getOperand(5);
26763 unsigned RC = 0;
26764 if (isRoundModeSAEToX(Rnd, RC))
26765 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26766 DAG.getTargetConstant(RC, dl, MVT::i32));
26767 else if (!isRoundModeCurDirection(Rnd))
26768 return SDValue();
26769 }
26770 if (!NewOp)
26771 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26772 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26773 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26774 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26775 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26776 }
26777 case IFMA_OP:
26778 // NOTE: We need to swizzle the operands to pass the multiply operands
26779 // first.
26780 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26781 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26782 case FPCLASSS: {
26783 SDValue Src1 = Op.getOperand(1);
26784 SDValue Imm = Op.getOperand(2);
26785 SDValue Mask = Op.getOperand(3);
26786 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26787 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26788 Subtarget, DAG);
26789 // Need to fill with zeros to ensure the bitcast will produce zeroes
26790 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26791 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26792 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26793 DAG.getVectorIdxConstant(0, dl));
26794 return DAG.getBitcast(MVT::i8, Ins);
26795 }
26796
26797 case CMP_MASK_CC: {
26798 MVT MaskVT = Op.getSimpleValueType();
26799 SDValue CC = Op.getOperand(3);
26800 SDValue Mask = Op.getOperand(4);
26801 // We specify 2 possible opcodes for intrinsics with rounding modes.
26802 // First, we check if the intrinsic may have non-default rounding mode,
26803 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26804 if (IntrData->Opc1 != 0) {
26805 SDValue Sae = Op.getOperand(5);
26806 if (isRoundModeSAE(Sae))
26807 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26808 Op.getOperand(2), CC, Mask, Sae);
26809 if (!isRoundModeCurDirection(Sae))
26810 return SDValue();
26811 }
26812 //default rounding mode
26813 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26814 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26815 }
26816 case CMP_MASK_SCALAR_CC: {
26817 SDValue Src1 = Op.getOperand(1);
26818 SDValue Src2 = Op.getOperand(2);
26819 SDValue CC = Op.getOperand(3);
26820 SDValue Mask = Op.getOperand(4);
26821
26822 SDValue Cmp;
26823 if (IntrData->Opc1 != 0) {
26824 SDValue Sae = Op.getOperand(5);
26825 if (isRoundModeSAE(Sae))
26826 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26827 else if (!isRoundModeCurDirection(Sae))
26828 return SDValue();
26829 }
26830 //default rounding mode
26831 if (!Cmp.getNode())
26832 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26833
26834 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26835 Subtarget, DAG);
26836 // Need to fill with zeros to ensure the bitcast will produce zeroes
26837 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26838 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26839 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26840 DAG.getVectorIdxConstant(0, dl));
26841 return DAG.getBitcast(MVT::i8, Ins);
26842 }
26843 case COMI: { // Comparison intrinsics
26844 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26845 SDValue LHS = Op.getOperand(1);
26846 SDValue RHS = Op.getOperand(2);
26847 // Some conditions require the operands to be swapped.
26848 if (CC == ISD::SETLT || CC == ISD::SETLE)
26849 std::swap(LHS, RHS);
26850
26851 // For AVX10.2, support EQ and NE.
26852 bool HasAVX10_2_COMX =
26853 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26854
26855 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26856 // For bf16 types we need to fall back.
26857 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26858
26859 auto ComiOpCode = IntrData->Opc0;
26860 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26861
26862 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26863 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26864
26865 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26866
26867 SDValue SetCC;
26868 switch (CC) {
26869 case ISD::SETEQ: {
26870 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26871 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26872 break;
26873 // (ZF = 1 and PF = 0)
26874 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26875 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26876 break;
26877 }
26878 case ISD::SETNE: {
26879 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26880 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26881 break;
26882 // (ZF = 0 or PF = 1)
26883 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26884 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26885 break;
26886 }
26887 case ISD::SETGT: // (CF = 0 and ZF = 0)
26888 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26889 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26890 break;
26891 }
26892 case ISD::SETGE: // CF = 0
26893 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26894 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26895 break;
26896 default:
26897 llvm_unreachable("Unexpected illegal condition!");
26898 }
26899 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26900 }
26901 case COMI_RM: { // Comparison intrinsics with Sae
26902 SDValue LHS = Op.getOperand(1);
26903 SDValue RHS = Op.getOperand(2);
26904 unsigned CondVal = Op.getConstantOperandVal(3);
26905 SDValue Sae = Op.getOperand(4);
26906
26907 SDValue FCmp;
26908 if (isRoundModeCurDirection(Sae))
26909 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26910 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26911 else if (isRoundModeSAE(Sae))
26912 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26913 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26914 else
26915 return SDValue();
26916 // Need to fill with zeros to ensure the bitcast will produce zeroes
26917 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26918 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26919 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26920 DAG.getVectorIdxConstant(0, dl));
26921 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26922 DAG.getBitcast(MVT::i16, Ins));
26923 }
26924 case VSHIFT: {
26925 SDValue SrcOp = Op.getOperand(1);
26926 SDValue ShAmt = Op.getOperand(2);
26927 assert(ShAmt.getValueType() == MVT::i32 &&
26928 "Unexpected VSHIFT amount type");
26929
26930 // Catch shift-by-constant.
26931 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26932 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26933 Op.getSimpleValueType(), SrcOp,
26934 CShAmt->getZExtValue(), DAG);
26935
26936 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26937 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26938 SrcOp, ShAmt, 0, Subtarget, DAG);
26939 }
26940 case COMPRESS_EXPAND_IN_REG: {
26941 SDValue Mask = Op.getOperand(3);
26942 SDValue DataToCompress = Op.getOperand(1);
26943 SDValue PassThru = Op.getOperand(2);
26944 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26945 return Op.getOperand(1);
26946
26947 // Avoid false dependency.
26948 if (PassThru.isUndef())
26949 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26950
26951 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26952 Mask);
26953 }
26954 case FIXUPIMM:
26955 case FIXUPIMM_MASKZ: {
26956 SDValue Src1 = Op.getOperand(1);
26957 SDValue Src2 = Op.getOperand(2);
26958 SDValue Src3 = Op.getOperand(3);
26959 SDValue Imm = Op.getOperand(4);
26960 SDValue Mask = Op.getOperand(5);
26961 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26962 ? Src1
26963 : getZeroVector(VT, Subtarget, DAG, dl);
26964
26965 unsigned Opc = IntrData->Opc0;
26966 if (IntrData->Opc1 != 0) {
26967 SDValue Sae = Op.getOperand(6);
26968 if (isRoundModeSAE(Sae))
26969 Opc = IntrData->Opc1;
26970 else if (!isRoundModeCurDirection(Sae))
26971 return SDValue();
26972 }
26973
26974 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26975
26976 if (VT.isVector())
26977 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26978
26979 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26980 }
26981 case ROUNDP: {
26982 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26983 // Clear the upper bits of the rounding immediate so that the legacy
26984 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26985 uint64_t Round = Op.getConstantOperandVal(2);
26986 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26987 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26988 Op.getOperand(1), RoundingMode);
26989 }
26990 case ROUNDS: {
26991 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26992 // Clear the upper bits of the rounding immediate so that the legacy
26993 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26994 uint64_t Round = Op.getConstantOperandVal(3);
26995 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26996 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26997 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26998 }
26999 case BEXTRI: {
27000 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27001
27002 uint64_t Imm = Op.getConstantOperandVal(2);
27003 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27004 Op.getValueType());
27005 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27006 Op.getOperand(1), Control);
27007 }
27008 // ADC/SBB
27009 case ADX: {
27010 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27011 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27012
27013 SDValue Res;
27014 // If the carry in is zero, then we should just use ADD/SUB instead of
27015 // ADC/SBB.
27016 if (isNullConstant(Op.getOperand(1))) {
27017 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27018 Op.getOperand(3));
27019 } else {
27020 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27021 DAG.getAllOnesConstant(dl, MVT::i8));
27022 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27023 Op.getOperand(3), GenCF.getValue(1));
27024 }
27025 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27026 SDValue Results[] = { SetCC, Res };
27027 return DAG.getMergeValues(Results, dl);
27028 }
27029 case CVTPD2PS_MASK:
27030 case CVTPD2DQ_MASK:
27031 case CVTQQ2PS_MASK:
27032 case TRUNCATE_TO_REG: {
27033 SDValue Src = Op.getOperand(1);
27034 SDValue PassThru = Op.getOperand(2);
27035 SDValue Mask = Op.getOperand(3);
27036
27037 if (isAllOnesConstant(Mask))
27038 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27039
27040 MVT SrcVT = Src.getSimpleValueType();
27041 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27042 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27043 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27044 {Src, PassThru, Mask});
27045 }
27046 case TRUNCATE2_TO_REG: {
27047 SDValue Src = Op.getOperand(1);
27048 SDValue Src2 = Op.getOperand(2);
27049 SDValue PassThru = Op.getOperand(3);
27050 SDValue Mask = Op.getOperand(4);
27051
27052 if (isAllOnesConstant(Mask))
27053 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27054
27055 MVT Src2VT = Src2.getSimpleValueType();
27056 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27057 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27058 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27059 {Src, Src2, PassThru, Mask});
27060 }
27061 case CVTPS2PH_MASK: {
27062 SDValue Src = Op.getOperand(1);
27063 SDValue Rnd = Op.getOperand(2);
27064 SDValue PassThru = Op.getOperand(3);
27065 SDValue Mask = Op.getOperand(4);
27066
27067 unsigned RC = 0;
27068 unsigned Opc = IntrData->Opc0;
27069 bool SAE = Src.getValueType().is512BitVector() &&
27070 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27071 if (SAE) {
27073 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27074 }
27075
27076 if (isAllOnesConstant(Mask))
27077 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27078
27079 if (SAE)
27081 else
27082 Opc = IntrData->Opc1;
27083 MVT SrcVT = Src.getSimpleValueType();
27084 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27085 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27086 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27087 }
27088 case CVTNEPS2BF16_MASK: {
27089 SDValue Src = Op.getOperand(1);
27090 SDValue PassThru = Op.getOperand(2);
27091 SDValue Mask = Op.getOperand(3);
27092
27093 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27094 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27095
27096 // Break false dependency.
27097 if (PassThru.isUndef())
27098 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27099
27100 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27101 Mask);
27102 }
27103 default:
27104 break;
27105 }
27106 }
27107
27108 switch (IntNo) {
27109 default: return SDValue(); // Don't custom lower most intrinsics.
27110
27111 // ptest and testp intrinsics. The intrinsic these come from are designed to
27112 // return an integer value, not just an instruction so lower it to the ptest
27113 // or testp pattern and a setcc for the result.
27114 case Intrinsic::x86_avx512_ktestc_b:
27115 case Intrinsic::x86_avx512_ktestc_w:
27116 case Intrinsic::x86_avx512_ktestc_d:
27117 case Intrinsic::x86_avx512_ktestc_q:
27118 case Intrinsic::x86_avx512_ktestz_b:
27119 case Intrinsic::x86_avx512_ktestz_w:
27120 case Intrinsic::x86_avx512_ktestz_d:
27121 case Intrinsic::x86_avx512_ktestz_q:
27122 case Intrinsic::x86_sse41_ptestz:
27123 case Intrinsic::x86_sse41_ptestc:
27124 case Intrinsic::x86_sse41_ptestnzc:
27125 case Intrinsic::x86_avx_ptestz_256:
27126 case Intrinsic::x86_avx_ptestc_256:
27127 case Intrinsic::x86_avx_ptestnzc_256:
27128 case Intrinsic::x86_avx_vtestz_ps:
27129 case Intrinsic::x86_avx_vtestc_ps:
27130 case Intrinsic::x86_avx_vtestnzc_ps:
27131 case Intrinsic::x86_avx_vtestz_pd:
27132 case Intrinsic::x86_avx_vtestc_pd:
27133 case Intrinsic::x86_avx_vtestnzc_pd:
27134 case Intrinsic::x86_avx_vtestz_ps_256:
27135 case Intrinsic::x86_avx_vtestc_ps_256:
27136 case Intrinsic::x86_avx_vtestnzc_ps_256:
27137 case Intrinsic::x86_avx_vtestz_pd_256:
27138 case Intrinsic::x86_avx_vtestc_pd_256:
27139 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27140 unsigned TestOpc = X86ISD::PTEST;
27141 X86::CondCode X86CC;
27142 switch (IntNo) {
27143 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27144 case Intrinsic::x86_avx512_ktestc_b:
27145 case Intrinsic::x86_avx512_ktestc_w:
27146 case Intrinsic::x86_avx512_ktestc_d:
27147 case Intrinsic::x86_avx512_ktestc_q:
27148 // CF = 1
27149 TestOpc = X86ISD::KTEST;
27150 X86CC = X86::COND_B;
27151 break;
27152 case Intrinsic::x86_avx512_ktestz_b:
27153 case Intrinsic::x86_avx512_ktestz_w:
27154 case Intrinsic::x86_avx512_ktestz_d:
27155 case Intrinsic::x86_avx512_ktestz_q:
27156 TestOpc = X86ISD::KTEST;
27157 X86CC = X86::COND_E;
27158 break;
27159 case Intrinsic::x86_avx_vtestz_ps:
27160 case Intrinsic::x86_avx_vtestz_pd:
27161 case Intrinsic::x86_avx_vtestz_ps_256:
27162 case Intrinsic::x86_avx_vtestz_pd_256:
27163 TestOpc = X86ISD::TESTP;
27164 [[fallthrough]];
27165 case Intrinsic::x86_sse41_ptestz:
27166 case Intrinsic::x86_avx_ptestz_256:
27167 // ZF = 1
27168 X86CC = X86::COND_E;
27169 break;
27170 case Intrinsic::x86_avx_vtestc_ps:
27171 case Intrinsic::x86_avx_vtestc_pd:
27172 case Intrinsic::x86_avx_vtestc_ps_256:
27173 case Intrinsic::x86_avx_vtestc_pd_256:
27174 TestOpc = X86ISD::TESTP;
27175 [[fallthrough]];
27176 case Intrinsic::x86_sse41_ptestc:
27177 case Intrinsic::x86_avx_ptestc_256:
27178 // CF = 1
27179 X86CC = X86::COND_B;
27180 break;
27181 case Intrinsic::x86_avx_vtestnzc_ps:
27182 case Intrinsic::x86_avx_vtestnzc_pd:
27183 case Intrinsic::x86_avx_vtestnzc_ps_256:
27184 case Intrinsic::x86_avx_vtestnzc_pd_256:
27185 TestOpc = X86ISD::TESTP;
27186 [[fallthrough]];
27187 case Intrinsic::x86_sse41_ptestnzc:
27188 case Intrinsic::x86_avx_ptestnzc_256:
27189 // ZF and CF = 0
27190 X86CC = X86::COND_A;
27191 break;
27192 }
27193
27194 SDValue LHS = Op.getOperand(1);
27195 SDValue RHS = Op.getOperand(2);
27196 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27197 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27198 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27199 }
27200
27201 case Intrinsic::x86_sse42_pcmpistria128:
27202 case Intrinsic::x86_sse42_pcmpestria128:
27203 case Intrinsic::x86_sse42_pcmpistric128:
27204 case Intrinsic::x86_sse42_pcmpestric128:
27205 case Intrinsic::x86_sse42_pcmpistrio128:
27206 case Intrinsic::x86_sse42_pcmpestrio128:
27207 case Intrinsic::x86_sse42_pcmpistris128:
27208 case Intrinsic::x86_sse42_pcmpestris128:
27209 case Intrinsic::x86_sse42_pcmpistriz128:
27210 case Intrinsic::x86_sse42_pcmpestriz128: {
27211 unsigned Opcode;
27212 X86::CondCode X86CC;
27213 switch (IntNo) {
27214 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27215 case Intrinsic::x86_sse42_pcmpistria128:
27216 Opcode = X86ISD::PCMPISTR;
27217 X86CC = X86::COND_A;
27218 break;
27219 case Intrinsic::x86_sse42_pcmpestria128:
27220 Opcode = X86ISD::PCMPESTR;
27221 X86CC = X86::COND_A;
27222 break;
27223 case Intrinsic::x86_sse42_pcmpistric128:
27224 Opcode = X86ISD::PCMPISTR;
27225 X86CC = X86::COND_B;
27226 break;
27227 case Intrinsic::x86_sse42_pcmpestric128:
27228 Opcode = X86ISD::PCMPESTR;
27229 X86CC = X86::COND_B;
27230 break;
27231 case Intrinsic::x86_sse42_pcmpistrio128:
27232 Opcode = X86ISD::PCMPISTR;
27233 X86CC = X86::COND_O;
27234 break;
27235 case Intrinsic::x86_sse42_pcmpestrio128:
27236 Opcode = X86ISD::PCMPESTR;
27237 X86CC = X86::COND_O;
27238 break;
27239 case Intrinsic::x86_sse42_pcmpistris128:
27240 Opcode = X86ISD::PCMPISTR;
27241 X86CC = X86::COND_S;
27242 break;
27243 case Intrinsic::x86_sse42_pcmpestris128:
27244 Opcode = X86ISD::PCMPESTR;
27245 X86CC = X86::COND_S;
27246 break;
27247 case Intrinsic::x86_sse42_pcmpistriz128:
27248 Opcode = X86ISD::PCMPISTR;
27249 X86CC = X86::COND_E;
27250 break;
27251 case Intrinsic::x86_sse42_pcmpestriz128:
27252 Opcode = X86ISD::PCMPESTR;
27253 X86CC = X86::COND_E;
27254 break;
27255 }
27256 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27257 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27258 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27259 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27260 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27261 }
27262
27263 case Intrinsic::x86_sse42_pcmpistri128:
27264 case Intrinsic::x86_sse42_pcmpestri128: {
27265 unsigned Opcode;
27266 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27267 Opcode = X86ISD::PCMPISTR;
27268 else
27269 Opcode = X86ISD::PCMPESTR;
27270
27271 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27272 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27273 return DAG.getNode(Opcode, dl, VTs, NewOps);
27274 }
27275
27276 case Intrinsic::x86_sse42_pcmpistrm128:
27277 case Intrinsic::x86_sse42_pcmpestrm128: {
27278 unsigned Opcode;
27279 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27280 Opcode = X86ISD::PCMPISTR;
27281 else
27282 Opcode = X86ISD::PCMPESTR;
27283
27284 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27285 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27286 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27287 }
27288
27289 case Intrinsic::eh_sjlj_lsda: {
27290 MachineFunction &MF = DAG.getMachineFunction();
27291 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27292 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27293 auto &Context = MF.getContext();
27294 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27295 Twine(MF.getFunctionNumber()));
27296 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27297 DAG.getMCSymbol(S, PtrVT));
27298 }
27299
27300 case Intrinsic::x86_seh_lsda: {
27301 // Compute the symbol for the LSDA. We know it'll get emitted later.
27302 MachineFunction &MF = DAG.getMachineFunction();
27303 SDValue Op1 = Op.getOperand(1);
27304 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27307
27308 // Generate a simple absolute symbol reference. This intrinsic is only
27309 // supported on 32-bit Windows, which isn't PIC.
27310 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27311 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27312 }
27313
27314 case Intrinsic::eh_recoverfp: {
27315 SDValue FnOp = Op.getOperand(1);
27316 SDValue IncomingFPOp = Op.getOperand(2);
27317 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27318 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27319 if (!Fn)
27321 "llvm.eh.recoverfp must take a function as the first argument");
27322 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27323 }
27324
27325 case Intrinsic::localaddress: {
27326 // Returns one of the stack, base, or frame pointer registers, depending on
27327 // which is used to reference local variables.
27328 MachineFunction &MF = DAG.getMachineFunction();
27329 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27330 Register Reg;
27331 if (RegInfo->hasBasePointer(MF))
27332 Reg = RegInfo->getBaseRegister();
27333 else { // Handles the SP or FP case.
27334 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27335 if (CantUseFP)
27336 Reg = RegInfo->getPtrSizedStackRegister(MF);
27337 else
27338 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27339 }
27340 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27341 }
27342 case Intrinsic::x86_avx512_vp2intersect_q_512:
27343 case Intrinsic::x86_avx512_vp2intersect_q_256:
27344 case Intrinsic::x86_avx512_vp2intersect_q_128:
27345 case Intrinsic::x86_avx512_vp2intersect_d_512:
27346 case Intrinsic::x86_avx512_vp2intersect_d_256:
27347 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27348 SDLoc DL(Op);
27349 MVT MaskVT = Op.getSimpleValueType();
27350 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27351 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27352 Op.getOperand(1), Op.getOperand(2));
27353 SDValue Result0 =
27354 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27355 SDValue Result1 =
27356 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27357 return DAG.getMergeValues({Result0, Result1}, DL);
27358 }
27359 case Intrinsic::x86_mmx_pslli_w:
27360 case Intrinsic::x86_mmx_pslli_d:
27361 case Intrinsic::x86_mmx_pslli_q:
27362 case Intrinsic::x86_mmx_psrli_w:
27363 case Intrinsic::x86_mmx_psrli_d:
27364 case Intrinsic::x86_mmx_psrli_q:
27365 case Intrinsic::x86_mmx_psrai_w:
27366 case Intrinsic::x86_mmx_psrai_d: {
27367 SDLoc DL(Op);
27368 SDValue ShAmt = Op.getOperand(2);
27369 // If the argument is a constant, convert it to a target constant.
27370 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27371 // Clamp out-of-bounds shift amounts since they would otherwise be masked
27372 // to 8 bits, which may make them no longer out of bounds.
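// For example, a shift amount of 256 (0x100) would wrap to 0 after the
// 8-bit mask and become an in-range no-op, whereas clamping it to 255
// keeps it out of bounds for every MMX element type.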
27373 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27374 if (ShiftAmount == 0)
27375 return Op.getOperand(1);
27376
27377 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27378 Op.getOperand(0), Op.getOperand(1),
27379 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27380 }
27381
27382 unsigned NewIntrinsic;
27383 switch (IntNo) {
27384 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27385 case Intrinsic::x86_mmx_pslli_w:
27386 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27387 break;
27388 case Intrinsic::x86_mmx_pslli_d:
27389 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27390 break;
27391 case Intrinsic::x86_mmx_pslli_q:
27392 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27393 break;
27394 case Intrinsic::x86_mmx_psrli_w:
27395 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27396 break;
27397 case Intrinsic::x86_mmx_psrli_d:
27398 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27399 break;
27400 case Intrinsic::x86_mmx_psrli_q:
27401 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27402 break;
27403 case Intrinsic::x86_mmx_psrai_w:
27404 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27405 break;
27406 case Intrinsic::x86_mmx_psrai_d:
27407 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27408 break;
27409 }
27410
27411 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
27412 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27413 // MMX register.
27414 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27415 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27416 DAG.getTargetConstant(NewIntrinsic, DL,
27418 Op.getOperand(1), ShAmt);
27419 }
27420 case Intrinsic::thread_pointer: {
27421 if (Subtarget.isTargetELF()) {
27422 SDLoc dl(Op);
27423 EVT PtrVT = Op.getValueType();
27424 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27425 Value *Ptr = Constant::getNullValue(PointerType::get(
27426 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27427 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27428 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27429 }
27431 "Target OS doesn't support __builtin_thread_pointer() yet.");
27432 }
27433 }
27434}
27435
27436 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27437 SDValue Src, SDValue Mask, SDValue Base,
27438 SDValue Index, SDValue ScaleOp, SDValue Chain,
27439 const X86Subtarget &Subtarget) {
27440 SDLoc dl(Op);
27441 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27442 // Scale must be constant.
27443 if (!C)
27444 return SDValue();
27445 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27446 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27447 TLI.getPointerTy(DAG.getDataLayout()));
27448 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27449 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27450 // If source is undef or we know it won't be used, use a zero vector
27451 // to break register dependency.
27452 // TODO: use undef instead and let BreakFalseDeps deal with it?
27453 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27454 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27455
27456 // Cast mask to an integer type.
27457 Mask = DAG.getBitcast(MaskVT, Mask);
27458
27459 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27460
27461 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27462 SDValue Res =
27464 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27465 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27466}
27467
27468 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27469 SDValue Src, SDValue Mask, SDValue Base,
27470 SDValue Index, SDValue ScaleOp, SDValue Chain,
27471 const X86Subtarget &Subtarget) {
27472 MVT VT = Op.getSimpleValueType();
27473 SDLoc dl(Op);
27474 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27475 // Scale must be constant.
27476 if (!C)
27477 return SDValue();
27478 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27479 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27480 TLI.getPointerTy(DAG.getDataLayout()));
27481 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27482 Op.getSimpleValueType().getVectorNumElements());
27483 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27484
27485 // We support two versions of the gather intrinsics. One with scalar mask and
27486 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27487 if (Mask.getValueType() != MaskVT)
27488 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27489
27490 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27491 // If source is undef or we know it won't be used, use a zero vector
27492 // to break register dependency.
27493 // TODO: use undef instead and let BreakFalseDeps deal with it?
27494 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27495 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27496
27497 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27498
27499 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27500 SDValue Res =
27502 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27503 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27504}
27505
27506 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27507 SDValue Src, SDValue Mask, SDValue Base,
27508 SDValue Index, SDValue ScaleOp, SDValue Chain,
27509 const X86Subtarget &Subtarget) {
27510 SDLoc dl(Op);
27511 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27512 // Scale must be constant.
27513 if (!C)
27514 return SDValue();
27515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27516 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27517 TLI.getPointerTy(DAG.getDataLayout()));
27518 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27519 Src.getSimpleValueType().getVectorNumElements());
27520 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27521
27522 // We support two versions of the scatter intrinsics. One with scalar mask and
27523 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27524 if (Mask.getValueType() != MaskVT)
27525 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27526
27527 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27528
27529 SDVTList VTs = DAG.getVTList(MVT::Other);
27530 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27531 SDValue Res =
27533 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27534 return Res;
27535}
27536
27537 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27538 SDValue Mask, SDValue Base, SDValue Index,
27539 SDValue ScaleOp, SDValue Chain,
27540 const X86Subtarget &Subtarget) {
27541 SDLoc dl(Op);
27542 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27543 // Scale must be constant.
27544 if (!C)
27545 return SDValue();
27546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27547 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27548 TLI.getPointerTy(DAG.getDataLayout()));
27549 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27550 SDValue Segment = DAG.getRegister(0, MVT::i32);
27551 MVT MaskVT =
27552 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27553 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27554 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27555 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27556 return SDValue(Res, 0);
27557}
27558
27559/// Handles the lowering of builtin intrinsics with chain that return their
27560/// value into registers EDX:EAX.
27561 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27562/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27563/// TargetOpcode.
27564 /// Returns a Glue value which can be used to add an extra copy-from-reg if
27565 /// the expanded intrinsic implicitly defines extra registers (i.e. not just
27566 /// EDX:EAX).
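/// For example, getReadTimeStampCounter below expands RDTSC/RDTSCP through
/// this helper with SrcReg == 0, while the RDPMC/RDPRU/XGETBV lowering passes
/// SrcReg == X86::ECX so that the selector value lands in ECX first.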
27567 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27568 SelectionDAG &DAG,
27569 unsigned TargetOpcode,
27570 unsigned SrcReg,
27571 const X86Subtarget &Subtarget,
27572 SmallVectorImpl<SDValue> &Results) {
27573 SDValue Chain = N->getOperand(0);
27574 SDValue Glue;
27575
27576 if (SrcReg) {
27577 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27578 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27579 Glue = Chain.getValue(1);
27580 }
27581
27582 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27583 SDValue N1Ops[] = {Chain, Glue};
27584 SDNode *N1 = DAG.getMachineNode(
27585 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27586 Chain = SDValue(N1, 0);
27587
27588 // The expanded instruction leaves its result in EDX:EAX; copy it out
27589? 
27589 SDValue LO, HI;
27590 if (Subtarget.is64Bit()) {
27591 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27592 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27593 LO.getValue(2));
27594 } else {
27595 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27596 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27597 LO.getValue(2));
27598 }
27599 Chain = HI.getValue(1);
27600 Glue = HI.getValue(2);
27601
27602 if (Subtarget.is64Bit()) {
27603 // Merge the two 32-bit values into a 64-bit one.
27604 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27605 DAG.getConstant(32, DL, MVT::i8));
27606 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27607 Results.push_back(Chain);
27608 return Glue;
27609 }
27610
27611 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27612 SDValue Ops[] = { LO, HI };
27613 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27614 Results.push_back(Pair);
27615 Results.push_back(Chain);
27616 return Glue;
27617}
27618
27619/// Handles the lowering of builtin intrinsics that read the time stamp counter
27620/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27621/// READCYCLECOUNTER nodes.
27622static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27623 SelectionDAG &DAG,
27624 const X86Subtarget &Subtarget,
27625 SmallVectorImpl<SDValue> &Results) {
27626 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27627 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27628 // and the EAX register is loaded with the low-order 32 bits.
27629 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27630 /* NoRegister */0, Subtarget,
27631 Results);
27632 if (Opcode != X86::RDTSCP)
27633 return;
27634
27635 SDValue Chain = Results[1];
27636 // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
27637 // into the ECX register. Add 'ecx' explicitly to the chain.
27638 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27639 Results[1] = ecx;
27640 Results.push_back(ecx.getValue(1));
27641}
27642
27643 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27644 SelectionDAG &DAG) {
27645 SmallVector<SDValue, 3> Results;
27646 SDLoc DL(Op);
27647 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27648 Results);
27649 return DAG.getMergeValues(Results, DL);
27650}
27651
27652 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27653 MachineFunction &MF = DAG.getMachineFunction();
27654 SDValue Chain = Op.getOperand(0);
27655 SDValue RegNode = Op.getOperand(2);
27656 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27657 if (!EHInfo)
27658 report_fatal_error("EH registrations only live in functions using WinEH");
27659
27660 // Cast the operand to an alloca, and remember the frame index.
27661 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27662 if (!FINode)
27663 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27664 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27665
27666 // Return the chain operand without making any DAG nodes.
27667 return Chain;
27668}
27669
27670 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27671 MachineFunction &MF = DAG.getMachineFunction();
27672 SDValue Chain = Op.getOperand(0);
27673 SDValue EHGuard = Op.getOperand(2);
27674 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27675 if (!EHInfo)
27676 report_fatal_error("EHGuard only live in functions using WinEH");
27677
27678 // Cast the operand to an alloca, and remember the frame index.
27679 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27680 if (!FINode)
27681 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27682 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27683
27684 // Return the chain operand without making any DAG nodes.
27685 return Chain;
27686}
27687
27688/// Emit Truncating Store with signed or unsigned saturation.
27689static SDValue
27690EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27691 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27692 SelectionDAG &DAG) {
27693 SDVTList VTs = DAG.getVTList(MVT::Other);
27694 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27695 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27696 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27697 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27698}
27699
27700/// Emit Masked Truncating Store with signed or unsigned saturation.
27701static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27702 const SDLoc &DL,
27703 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27704 MachineMemOperand *MMO, SelectionDAG &DAG) {
27705 SDVTList VTs = DAG.getVTList(MVT::Other);
27706 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27707 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27708 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27709}
27710
27711 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27712 const MachineFunction &MF) {
27713 if (!Subtarget.is64Bit())
27714 return false;
27715 // 64-bit targets support extended Swift async frame setup,
27716 // except for targets that use the windows 64 prologue.
27717 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27718}
27719
27720 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27721 SelectionDAG &DAG) {
27722 unsigned IntNo = Op.getConstantOperandVal(1);
27723 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27724 if (!IntrData) {
27725 switch (IntNo) {
27726
27727 case Intrinsic::swift_async_context_addr: {
27728 SDLoc dl(Op);
27729 auto &MF = DAG.getMachineFunction();
27730 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27731 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27733 X86FI->setHasSwiftAsyncContext(true);
27734 SDValue Chain = Op->getOperand(0);
27735 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27736 SDValue Result =
27737 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27738 DAG.getTargetConstant(8, dl, MVT::i32)),
27739 0);
27740 // Return { result, chain }.
27741 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27742 CopyRBP.getValue(1));
27743 } else {
27744 // No special extended frame, create or reuse an existing stack slot.
27745 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27746 if (!X86FI->getSwiftAsyncContextFrameIdx())
27747 X86FI->setSwiftAsyncContextFrameIdx(
27748 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27749 false));
27750 SDValue Result =
27751 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27752 PtrSize == 8 ? MVT::i64 : MVT::i32);
27753 // Return { result, chain }.
27754 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27755 Op->getOperand(0));
27756 }
27757 }
27758
27759 case llvm::Intrinsic::x86_seh_ehregnode:
27760 return MarkEHRegistrationNode(Op, DAG);
27761 case llvm::Intrinsic::x86_seh_ehguard:
27762 return MarkEHGuard(Op, DAG);
27763 case llvm::Intrinsic::x86_rdpkru: {
27764 SDLoc dl(Op);
27765 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27766 // Create a RDPKRU node and pass 0 to the ECX parameter.
27767 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27768 DAG.getConstant(0, dl, MVT::i32));
27769 }
27770 case llvm::Intrinsic::x86_wrpkru: {
27771 SDLoc dl(Op);
27772 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27773 // to the EDX and ECX parameters.
27774 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27775 Op.getOperand(0), Op.getOperand(2),
27776 DAG.getConstant(0, dl, MVT::i32),
27777 DAG.getConstant(0, dl, MVT::i32));
27778 }
27779 case llvm::Intrinsic::asan_check_memaccess: {
27780 // Mark this as adjustsStack because it will be lowered to a call.
27781 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27782 // Don't do anything here, we will expand these intrinsics out later.
27783 return Op;
27784 }
27785 case llvm::Intrinsic::x86_flags_read_u32:
27786 case llvm::Intrinsic::x86_flags_read_u64:
27787 case llvm::Intrinsic::x86_flags_write_u32:
27788 case llvm::Intrinsic::x86_flags_write_u64: {
27789 // We need a frame pointer because this will get lowered to a PUSH/POP
27790 // sequence.
27793 // Don't do anything here, we will expand these intrinsics out later
27794 // during FinalizeISel in EmitInstrWithCustomInserter.
27795 return Op;
27796 }
27797 case Intrinsic::x86_lwpins32:
27798 case Intrinsic::x86_lwpins64:
27799 case Intrinsic::x86_umwait:
27800 case Intrinsic::x86_tpause: {
27801 SDLoc dl(Op);
27802 SDValue Chain = Op->getOperand(0);
27803 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27804 unsigned Opcode;
27805
27806 switch (IntNo) {
27807 default: llvm_unreachable("Impossible intrinsic");
27808 case Intrinsic::x86_umwait:
27809 Opcode = X86ISD::UMWAIT;
27810 break;
27811 case Intrinsic::x86_tpause:
27812 Opcode = X86ISD::TPAUSE;
27813 break;
27814 case Intrinsic::x86_lwpins32:
27815 case Intrinsic::x86_lwpins64:
27816 Opcode = X86ISD::LWPINS;
27817 break;
27818 }
27819
27820 SDValue Operation =
27821 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27822 Op->getOperand(3), Op->getOperand(4));
27823 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27824 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27825 Operation.getValue(1));
27826 }
27827 case Intrinsic::x86_enqcmd:
27828 case Intrinsic::x86_enqcmds: {
27829 SDLoc dl(Op);
27830 SDValue Chain = Op.getOperand(0);
27831 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27832 unsigned Opcode;
27833 switch (IntNo) {
27834 default: llvm_unreachable("Impossible intrinsic!");
27835 case Intrinsic::x86_enqcmd:
27836 Opcode = X86ISD::ENQCMD;
27837 break;
27838 case Intrinsic::x86_enqcmds:
27839 Opcode = X86ISD::ENQCMDS;
27840 break;
27841 }
27842 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27843 Op.getOperand(3));
27844 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27845 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27846 Operation.getValue(1));
27847 }
27848 case Intrinsic::x86_aesenc128kl:
27849 case Intrinsic::x86_aesdec128kl:
27850 case Intrinsic::x86_aesenc256kl:
27851 case Intrinsic::x86_aesdec256kl: {
27852 SDLoc DL(Op);
27853 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27854 SDValue Chain = Op.getOperand(0);
27855 unsigned Opcode;
27856
27857 switch (IntNo) {
27858 default: llvm_unreachable("Impossible intrinsic");
27859 case Intrinsic::x86_aesenc128kl:
27860 Opcode = X86ISD::AESENC128KL;
27861 break;
27862 case Intrinsic::x86_aesdec128kl:
27863 Opcode = X86ISD::AESDEC128KL;
27864 break;
27865 case Intrinsic::x86_aesenc256kl:
27866 Opcode = X86ISD::AESENC256KL;
27867 break;
27868 case Intrinsic::x86_aesdec256kl:
27869 Opcode = X86ISD::AESDEC256KL;
27870 break;
27871 }
27872
27873 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27874 MachineMemOperand *MMO = MemIntr->getMemOperand();
27875 EVT MemVT = MemIntr->getMemoryVT();
27876 SDValue Operation = DAG.getMemIntrinsicNode(
27877 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27878 MMO);
27879 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27880
27881 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27882 {ZF, Operation.getValue(0), Operation.getValue(2)});
27883 }
27884 case Intrinsic::x86_aesencwide128kl:
27885 case Intrinsic::x86_aesdecwide128kl:
27886 case Intrinsic::x86_aesencwide256kl:
27887 case Intrinsic::x86_aesdecwide256kl: {
27888 SDLoc DL(Op);
27889 SDVTList VTs = DAG.getVTList(
27890 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27891 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27892 SDValue Chain = Op.getOperand(0);
27893 unsigned Opcode;
27894
27895 switch (IntNo) {
27896 default: llvm_unreachable("Impossible intrinsic");
27897 case Intrinsic::x86_aesencwide128kl:
27898 Opcode = X86ISD::AESENCWIDE128KL;
27899 break;
27900 case Intrinsic::x86_aesdecwide128kl:
27901 Opcode = X86ISD::AESDECWIDE128KL;
27902 break;
27903 case Intrinsic::x86_aesencwide256kl:
27904 Opcode = X86ISD::AESENCWIDE256KL;
27905 break;
27906 case Intrinsic::x86_aesdecwide256kl:
27907 Opcode = X86ISD::AESDECWIDE256KL;
27908 break;
27909 }
27910
27911 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27912 MachineMemOperand *MMO = MemIntr->getMemOperand();
27913 EVT MemVT = MemIntr->getMemoryVT();
27914 SDValue Operation = DAG.getMemIntrinsicNode(
27915 Opcode, DL, VTs,
27916 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27917 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27918 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27919 MemVT, MMO);
27920 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27921
27922 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27923 {ZF, Operation.getValue(1), Operation.getValue(2),
27924 Operation.getValue(3), Operation.getValue(4),
27925 Operation.getValue(5), Operation.getValue(6),
27926 Operation.getValue(7), Operation.getValue(8),
27927 Operation.getValue(9)});
27928 }
27929 case Intrinsic::x86_testui: {
27930 SDLoc dl(Op);
27931 SDValue Chain = Op.getOperand(0);
27932 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27933 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27934 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27935 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27936 Operation.getValue(1));
27937 }
27938 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27939 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27940 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27941 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27942 case Intrinsic::x86_t2rpntlvwz0_internal:
27943 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27944 case Intrinsic::x86_t2rpntlvwz1_internal:
27945 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27946 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27948 unsigned IntNo = Op.getConstantOperandVal(1);
27949 unsigned Opc = 0;
27950 switch (IntNo) {
27951 default:
27952 llvm_unreachable("Unexpected intrinsic!");
27953 case Intrinsic::x86_t2rpntlvwz0_internal:
27954 Opc = X86::PT2RPNTLVWZ0V;
27955 break;
27956 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27957 Opc = X86::PT2RPNTLVWZ0T1V;
27958 break;
27959 case Intrinsic::x86_t2rpntlvwz1_internal:
27960 Opc = X86::PT2RPNTLVWZ1V;
27961 break;
27962 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27963 Opc = X86::PT2RPNTLVWZ1T1V;
27964 break;
27965 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27966 Opc = X86::PT2RPNTLVWZ0RSV;
27967 break;
27968 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27969 Opc = X86::PT2RPNTLVWZ0RST1V;
27970 break;
27971 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27972 Opc = X86::PT2RPNTLVWZ1RSV;
27973 break;
27974 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27975 Opc = X86::PT2RPNTLVWZ1RST1V;
27976 break;
27977 }
27978
27979 SDLoc DL(Op);
27980 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27981
27982 SDValue Ops[] = {Op.getOperand(2), // Row
27983 Op.getOperand(3), // Col0
27984 Op.getOperand(4), // Col1
27985 Op.getOperand(5), // Base
27986 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27987 Op.getOperand(6), // Index
27988 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27989 DAG.getRegister(0, MVT::i16), // Segment
27990 Op.getOperand(0)}; // Chain
27991
27992 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27993 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27994 SDValue(Res, 0));
27995 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27996 SDValue(Res, 0));
27997 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27998 }
27999 case Intrinsic::x86_atomic_bts_rm:
28000 case Intrinsic::x86_atomic_btc_rm:
28001 case Intrinsic::x86_atomic_btr_rm: {
28002 SDLoc DL(Op);
28003 MVT VT = Op.getSimpleValueType();
28004 SDValue Chain = Op.getOperand(0);
28005 SDValue Op1 = Op.getOperand(2);
28006 SDValue Op2 = Op.getOperand(3);
28007 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28008 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28009 : X86ISD::LBTR_RM;
28010 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28011 SDValue Res =
28012 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28013 {Chain, Op1, Op2}, VT, MMO);
28014 Chain = Res.getValue(1);
28015 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28016 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28017 }
28018 case Intrinsic::x86_atomic_bts:
28019 case Intrinsic::x86_atomic_btc:
28020 case Intrinsic::x86_atomic_btr: {
28021 SDLoc DL(Op);
28022 MVT VT = Op.getSimpleValueType();
28023 SDValue Chain = Op.getOperand(0);
28024 SDValue Op1 = Op.getOperand(2);
28025 SDValue Op2 = Op.getOperand(3);
28026 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28027 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28028 : X86ISD::LBTR;
28029 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28030 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28031 SDValue Res =
28032 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28033 {Chain, Op1, Op2, Size}, VT, MMO);
28034 Chain = Res.getValue(1);
28035 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28036 unsigned Imm = Op2->getAsZExtVal();
28037 if (Imm)
28038 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28039 DAG.getShiftAmountConstant(Imm, VT, DL));
28040 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28041 }
28042 case Intrinsic::x86_cmpccxadd32:
28043 case Intrinsic::x86_cmpccxadd64: {
28044 SDLoc DL(Op);
28045 SDValue Chain = Op.getOperand(0);
28046 SDValue Addr = Op.getOperand(2);
28047 SDValue Src1 = Op.getOperand(3);
28048 SDValue Src2 = Op.getOperand(4);
28049 SDValue CC = Op.getOperand(5);
28050 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28051 SDValue Operation = DAG.getMemIntrinsicNode(
28052 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28053 MVT::i32, MMO);
28054 return Operation;
28055 }
28056 case Intrinsic::x86_aadd32:
28057 case Intrinsic::x86_aadd64:
28058 case Intrinsic::x86_aand32:
28059 case Intrinsic::x86_aand64:
28060 case Intrinsic::x86_aor32:
28061 case Intrinsic::x86_aor64:
28062 case Intrinsic::x86_axor32:
28063 case Intrinsic::x86_axor64: {
28064 SDLoc DL(Op);
28065 SDValue Chain = Op.getOperand(0);
28066 SDValue Op1 = Op.getOperand(2);
28067 SDValue Op2 = Op.getOperand(3);
28068 MVT VT = Op2.getSimpleValueType();
28069 unsigned Opc = 0;
28070 switch (IntNo) {
28071 default:
28072 llvm_unreachable("Unknown Intrinsic");
28073 case Intrinsic::x86_aadd32:
28074 case Intrinsic::x86_aadd64:
28075 Opc = X86ISD::AADD;
28076 break;
28077 case Intrinsic::x86_aand32:
28078 case Intrinsic::x86_aand64:
28079 Opc = X86ISD::AAND;
28080 break;
28081 case Intrinsic::x86_aor32:
28082 case Intrinsic::x86_aor64:
28083 Opc = X86ISD::AOR;
28084 break;
28085 case Intrinsic::x86_axor32:
28086 case Intrinsic::x86_axor64:
28087 Opc = X86ISD::AXOR;
28088 break;
28089 }
28090 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28091 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28092 {Chain, Op1, Op2}, VT, MMO);
28093 }
28094 case Intrinsic::x86_atomic_add_cc:
28095 case Intrinsic::x86_atomic_sub_cc:
28096 case Intrinsic::x86_atomic_or_cc:
28097 case Intrinsic::x86_atomic_and_cc:
28098 case Intrinsic::x86_atomic_xor_cc: {
28099 SDLoc DL(Op);
28100 SDValue Chain = Op.getOperand(0);
28101 SDValue Op1 = Op.getOperand(2);
28102 SDValue Op2 = Op.getOperand(3);
28103 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28104 MVT VT = Op2.getSimpleValueType();
28105 unsigned Opc = 0;
28106 switch (IntNo) {
28107 default:
28108 llvm_unreachable("Unknown Intrinsic");
28109 case Intrinsic::x86_atomic_add_cc:
28110 Opc = X86ISD::LADD;
28111 break;
28112 case Intrinsic::x86_atomic_sub_cc:
28113 Opc = X86ISD::LSUB;
28114 break;
28115 case Intrinsic::x86_atomic_or_cc:
28116 Opc = X86ISD::LOR;
28117 break;
28118 case Intrinsic::x86_atomic_and_cc:
28119 Opc = X86ISD::LAND;
28120 break;
28121 case Intrinsic::x86_atomic_xor_cc:
28122 Opc = X86ISD::LXOR;
28123 break;
28124 }
28125 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28126 SDValue LockArith =
28127 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28128 {Chain, Op1, Op2}, VT, MMO);
28129 Chain = LockArith.getValue(1);
28130 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28131 }
28132 }
28133 return SDValue();
28134 }
28135
28136 SDLoc dl(Op);
28137 switch(IntrData->Type) {
28138 default: llvm_unreachable("Unknown Intrinsic Type");
28139 case RDSEED:
28140 case RDRAND: {
28141 // Emit the node with the right value type.
28142 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28143 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28144
28145 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28146 // Otherwise return the value from Rand, which is always 0, cast to i32.
28147 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28148 DAG.getConstant(1, dl, Op->getValueType(1)),
28149 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28150 SDValue(Result.getNode(), 1)};
28151 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28152
28153 // Return { result, isValid, chain }.
28154 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28155 SDValue(Result.getNode(), 2));
28156 }
28157 case GATHER_AVX2: {
28158 SDValue Chain = Op.getOperand(0);
28159 SDValue Src = Op.getOperand(2);
28160 SDValue Base = Op.getOperand(3);
28161 SDValue Index = Op.getOperand(4);
28162 SDValue Mask = Op.getOperand(5);
28163 SDValue Scale = Op.getOperand(6);
28164 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28165 Scale, Chain, Subtarget);
28166 }
28167 case GATHER: {
28168 //gather(v1, mask, index, base, scale);
28169 SDValue Chain = Op.getOperand(0);
28170 SDValue Src = Op.getOperand(2);
28171 SDValue Base = Op.getOperand(3);
28172 SDValue Index = Op.getOperand(4);
28173 SDValue Mask = Op.getOperand(5);
28174 SDValue Scale = Op.getOperand(6);
28175 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28176 Chain, Subtarget);
28177 }
28178 case SCATTER: {
28179 //scatter(base, mask, index, v1, scale);
28180 SDValue Chain = Op.getOperand(0);
28181 SDValue Base = Op.getOperand(2);
28182 SDValue Mask = Op.getOperand(3);
28183 SDValue Index = Op.getOperand(4);
28184 SDValue Src = Op.getOperand(5);
28185 SDValue Scale = Op.getOperand(6);
28186 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28187 Scale, Chain, Subtarget);
28188 }
28189 case PREFETCH: {
28190 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28191 assert((HintVal == 2 || HintVal == 3) &&
28192 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28193 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28194 SDValue Chain = Op.getOperand(0);
28195 SDValue Mask = Op.getOperand(2);
28196 SDValue Index = Op.getOperand(3);
28197 SDValue Base = Op.getOperand(4);
28198 SDValue Scale = Op.getOperand(5);
28199 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28200 Subtarget);
28201 }
28202 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28203 case RDTSC: {
28204 SmallVector<SDValue, 2> Results;
28205 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28206 Results);
28207 return DAG.getMergeValues(Results, dl);
28208 }
28209 // Read Performance Monitoring Counters.
28210 case RDPMC:
28211 // Read Processor Register.
28212 case RDPRU:
28213 // GetExtended Control Register.
28214 case XGETBV: {
28215 SmallVector<SDValue, 2> Results;
28216
28217 // RDPMC uses ECX to select the index of the performance counter to read.
28218 // RDPRU uses ECX to select the processor register to read.
28219 // XGETBV uses ECX to select the index of the XCR register to return.
28220 // The result is stored into registers EDX:EAX.
28221 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28222 Subtarget, Results);
28223 return DAG.getMergeValues(Results, dl);
28224 }
28225 // XTEST intrinsics.
28226 case XTEST: {
28227 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28228 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28229
28230 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28231 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28232 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28233 Ret, SDValue(InTrans.getNode(), 1));
28234 }
28235 case TRUNCATE_TO_MEM_VI8:
28236 case TRUNCATE_TO_MEM_VI16:
28237 case TRUNCATE_TO_MEM_VI32: {
28238 SDValue Mask = Op.getOperand(4);
28239 SDValue DataToTruncate = Op.getOperand(3);
28240 SDValue Addr = Op.getOperand(2);
28241 SDValue Chain = Op.getOperand(0);
28242
28243 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28244 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28245
28246 EVT MemVT = MemIntr->getMemoryVT();
28247
28248 uint16_t TruncationOp = IntrData->Opc0;
28249 switch (TruncationOp) {
28250 case X86ISD::VTRUNC: {
28251 if (isAllOnesConstant(Mask)) // return just a truncate store
28252 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28253 MemIntr->getMemOperand());
28254
28255 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28256 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28257 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28258
28259 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28260 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28261 true /* truncating */);
28262 }
28263 case X86ISD::VTRUNCUS:
28264 case X86ISD::VTRUNCS: {
28265 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28266 if (isAllOnesConstant(Mask))
28267 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28268 MemIntr->getMemOperand(), DAG);
28269
28270 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28271 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28272
28273 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28274 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28275 }
28276 default:
28277 llvm_unreachable("Unsupported truncstore intrinsic");
28278 }
28279 }
28280 case INTR_TYPE_CAST_MMX:
28281 return SDValue(); // handled in combineINTRINSIC_*
28282 }
28283}
28284
28285SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28286 SelectionDAG &DAG) const {
28287 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28288 MFI.setReturnAddressIsTaken(true);
28289
28290 unsigned Depth = Op.getConstantOperandVal(0);
28291 SDLoc dl(Op);
28292 EVT PtrVT = Op.getValueType();
28293
28294 if (Depth > 0) {
28295 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28296 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28297 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28298 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28299 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28300 MachinePointerInfo());
28301 }
28302
28303 // Just load the return address.
28304 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28305 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28306 MachinePointerInfo());
28307}
28308
28309SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28310 SelectionDAG &DAG) const {
28311 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28312 return getReturnAddressFrameIndex(DAG);
28313}
28314
28315SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28316 MachineFunction &MF = DAG.getMachineFunction();
28317 MachineFrameInfo &MFI = MF.getFrameInfo();
28318 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28319 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28320 EVT VT = Op.getValueType();
28321
28322 MFI.setFrameAddressIsTaken(true);
28323
28324 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28325 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28326 // is not possible to crawl up the stack without looking at the unwind codes
28327 // simultaneously.
28328 int FrameAddrIndex = FuncInfo->getFAIndex();
28329 if (!FrameAddrIndex) {
28330 // Set up a frame object for the return address.
28331 unsigned SlotSize = RegInfo->getSlotSize();
28332 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28333 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28334 FuncInfo->setFAIndex(FrameAddrIndex);
28335 }
28336 return DAG.getFrameIndex(FrameAddrIndex, VT);
28337 }
28338
28339 Register FrameReg =
28340 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28341 SDLoc dl(Op); // FIXME probably not meaningful
28342 unsigned Depth = Op.getConstantOperandVal(0);
28343 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28344 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28345 "Invalid Frame Register!");
28346 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28347 while (Depth--)
28348 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28349 MachinePointerInfo());
28350 return FrameAddr;
28351}
28352
28353// FIXME? Maybe this could be a TableGen attribute on some registers and
28354// this table could be generated automatically from RegInfo.
28355 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28356 const MachineFunction &MF) const {
28357 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28358
28360 .Case("esp", X86::ESP)
28361 .Case("rsp", X86::RSP)
28362 .Case("ebp", X86::EBP)
28363 .Case("rbp", X86::RBP)
28364 .Case("r14", X86::R14)
28365 .Case("r15", X86::R15)
28366 .Default(0);
28367
28368 if (Reg == X86::EBP || Reg == X86::RBP) {
28369 if (!TFI.hasFP(MF))
28370 report_fatal_error("register " + StringRef(RegName) +
28371 " is allocatable: function has no frame pointer");
28372#ifndef NDEBUG
28373 else {
28374 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28375 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28376 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28377 "Invalid Frame Register!");
28378 }
28379#endif
28380 }
28381
28382 return Reg;
28383}
28384
28385SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28386 SelectionDAG &DAG) const {
28387 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28388 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28389}
28390
28391 Register X86TargetLowering::getExceptionPointerRegister(
28392 const Constant *PersonalityFn) const {
28393 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28394 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28395
28396 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28397}
28398
28399 Register X86TargetLowering::getExceptionSelectorRegister(
28400 const Constant *PersonalityFn) const {
28401 // Funclet personalities don't use selectors (the runtime does the selection).
28402 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28403 return X86::NoRegister;
28404 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28405}
28406
28407 bool X86TargetLowering::needsFixedCatchObjects() const {
28408 return Subtarget.isTargetWin64();
28409}
28410
28411SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28412 SDValue Chain = Op.getOperand(0);
28413 SDValue Offset = Op.getOperand(1);
28414 SDValue Handler = Op.getOperand(2);
28415 SDLoc dl (Op);
28416
28417 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28418 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28419 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28420 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28421 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28422 "Invalid Frame Register!");
28423 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28424 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28425
28426 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28427 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28428 dl));
28429 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28430 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28431 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28432
28433 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28434 DAG.getRegister(StoreAddrReg, PtrVT));
28435}
28436
28437SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28438 SelectionDAG &DAG) const {
28439 SDLoc DL(Op);
28440 // If the subtarget is not 64-bit, we may need the global base reg
28441 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
28442 // Therefore, ask for the GlobalBaseReg now, so that the pass
28443 // inserts the code for us in case we need it.
28444 // Otherwise, we would end up referencing a virtual register
28445 // that is not defined!
28446 if (!Subtarget.is64Bit()) {
28447 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28448 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28449 }
28450 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28451 DAG.getVTList(MVT::i32, MVT::Other),
28452 Op.getOperand(0), Op.getOperand(1));
28453}
28454
28455SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28456 SelectionDAG &DAG) const {
28457 SDLoc DL(Op);
28458 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28459 Op.getOperand(0), Op.getOperand(1));
28460}
28461
28462SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28463 SelectionDAG &DAG) const {
28464 SDLoc DL(Op);
28465 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28466 Op.getOperand(0));
28467}
28468
28469 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28470 return Op.getOperand(0);
28471}
28472
28473SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28474 SelectionDAG &DAG) const {
28475 SDValue Root = Op.getOperand(0);
28476 SDValue Trmp = Op.getOperand(1); // trampoline
28477 SDValue FPtr = Op.getOperand(2); // nested function
28478 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28479 SDLoc dl (Op);
28480
28481 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28482 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28483
28484 if (Subtarget.is64Bit()) {
28485 SDValue OutChains[6];
28486
28487 // Large code-model.
28488 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28489 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28490
28491 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28492 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28493
28494 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28495
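// Taken together, the stores below assemble (in outline) this 23-byte
// trampoline:
//   0:  49 bb <fptr:8>    movabsq $<nested fn>, %r11
//   10: 49 ba <nest:8>    movabsq $<nest value>, %r10
//   20: 49 ff e3          jmpq    *%r11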
28496 // Load the pointer to the nested function into R11.
28497 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28498 SDValue Addr = Trmp;
28499 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28500 Addr, MachinePointerInfo(TrmpAddr));
28501
28502 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28503 DAG.getConstant(2, dl, MVT::i64));
28504 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28505 MachinePointerInfo(TrmpAddr, 2), Align(2));
28506
28507 // Load the 'nest' parameter value into R10.
28508 // R10 is specified in X86CallingConv.td
28509 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28510 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28511 DAG.getConstant(10, dl, MVT::i64));
28512 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28513 Addr, MachinePointerInfo(TrmpAddr, 10));
28514
28515 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28516 DAG.getConstant(12, dl, MVT::i64));
28517 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28518 MachinePointerInfo(TrmpAddr, 12), Align(2));
28519
28520 // Jump to the nested function.
28521 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28522 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28523 DAG.getConstant(20, dl, MVT::i64));
28524 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28525 Addr, MachinePointerInfo(TrmpAddr, 20));
28526
28527 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28528 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28529 DAG.getConstant(22, dl, MVT::i64));
28530 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28531 Addr, MachinePointerInfo(TrmpAddr, 22));
28532
28533 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28534 } else {
28535 const Function *Func =
28536 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28537 CallingConv::ID CC = Func->getCallingConv();
28538 unsigned NestReg;
28539
28540 switch (CC) {
28541 default:
28542 llvm_unreachable("Unsupported calling convention");
28543 case CallingConv::C:
28544 case CallingConv::X86_StdCall: {
28545 // Pass 'nest' parameter in ECX.
28546 // Must be kept in sync with X86CallingConv.td
28547 NestReg = X86::ECX;
28548
28549 // Check that ECX wasn't needed by an 'inreg' parameter.
28550 FunctionType *FTy = Func->getFunctionType();
28551 const AttributeList &Attrs = Func->getAttributes();
28552
28553 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28554 unsigned InRegCount = 0;
28555 unsigned Idx = 0;
28556
28557 for (FunctionType::param_iterator I = FTy->param_begin(),
28558 E = FTy->param_end(); I != E; ++I, ++Idx)
28559 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28560 const DataLayout &DL = DAG.getDataLayout();
28561 // FIXME: should only count parameters that are lowered to integers.
28562 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28563 }
28564
28565 if (InRegCount > 2) {
28566 report_fatal_error("Nest register in use - reduce number of inreg"
28567 " parameters!");
28568 }
28569 }
28570 break;
28571 }
28572 case CallingConv::X86_FastCall:
28573 case CallingConv::X86_ThisCall:
28574 case CallingConv::Fast:
28575 case CallingConv::Tail:
28576 case CallingConv::SwiftTail:
28577 // Pass 'nest' parameter in EAX.
28578 // Must be kept in sync with X86CallingConv.td
28579 NestReg = X86::EAX;
28580 break;
28581 }
28582
28583 SDValue OutChains[4];
28584 SDValue Addr, Disp;
28585
28586 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28587 DAG.getConstant(10, dl, MVT::i32));
28588 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28589
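// The stores below assemble (in outline) this 10-byte trampoline, where
// NestReg is ECX or EAX depending on the calling convention:
//   0: b8+r <nest:4>   movl $<nest value>, %<NestReg>
//   5: e9 <disp:4>     jmp  <nested fn>    ; disp is relative to Trmp + 10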
28590 // This is storing the opcode for MOV32ri.
28591 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28592 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28593 OutChains[0] =
28594 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28595 Trmp, MachinePointerInfo(TrmpAddr));
28596
28597 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28598 DAG.getConstant(1, dl, MVT::i32));
28599 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28600 MachinePointerInfo(TrmpAddr, 1), Align(1));
28601
28602 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28603 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28604 DAG.getConstant(5, dl, MVT::i32));
28605 OutChains[2] =
28606 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28607 MachinePointerInfo(TrmpAddr, 5), Align(1));
28608
28609 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28610 DAG.getConstant(6, dl, MVT::i32));
28611 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28612 MachinePointerInfo(TrmpAddr, 6), Align(1));
28613
28614 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28615 }
28616}
28617
28618SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28619 SelectionDAG &DAG) const {
28620 /*
28621 The rounding mode is in bits 11:10 of FPSR, and has the following
28622 settings:
28623 00 Round to nearest
28624 01 Round to -inf
28625 10 Round to +inf
28626 11 Round to 0
28627
28628 GET_ROUNDING, on the other hand, expects the following:
28629 -1 Undefined
28630 0 Round to 0
28631 1 Round to nearest
28632 2 Round to +inf
28633 3 Round to -inf
28634
28635 To perform the conversion, we use a packed lookup table of the four 2-bit
28636 values that we can index by FPSR[11:10]
28637 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28638
28639 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28640 */
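// Worked example: if FPSR[11:10] == 0b10 (round to +inf), then
// (FPSR & 0xc00) >> 9 == 4 and (0x2d >> 4) & 3 == 2, which is exactly
// GET_ROUNDING's "round to +inf" encoding.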
28641
28642 MachineFunction &MF = DAG.getMachineFunction();
28643 MVT VT = Op.getSimpleValueType();
28644 SDLoc DL(Op);
28645
28646 // Save FP Control Word to stack slot
28647 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28648 SDValue StackSlot =
28649 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28650
28651 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28652
28653 SDValue Chain = Op.getOperand(0);
28654 SDValue Ops[] = {Chain, StackSlot};
28655 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28656 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28657 Align(2), MachineMemOperand::MOStore);
28658
28659 // Load FP Control Word from stack slot
28660 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28661 Chain = CWD.getValue(1);
28662
28663 // Mask and turn the control bits into a shift for the lookup table.
28664 SDValue Shift =
28665 DAG.getNode(ISD::SRL, DL, MVT::i16,
28666 DAG.getNode(ISD::AND, DL, MVT::i16,
28667 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28668 DAG.getConstant(9, DL, MVT::i8));
28669 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28670
28671 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28672 SDValue RetVal =
28673 DAG.getNode(ISD::AND, DL, MVT::i32,
28674 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28675 DAG.getConstant(3, DL, MVT::i32));
28676
28677 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28678
28679 return DAG.getMergeValues({RetVal, Chain}, DL);
28680}
28681
28682SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28683 SelectionDAG &DAG) const {
28684 MachineFunction &MF = DAG.getMachineFunction();
28685 SDLoc DL(Op);
28686 SDValue Chain = Op.getNode()->getOperand(0);
28687
28688 // FP control word may be set only from data in memory. So we need to allocate
28689 // stack space to save/load FP control word.
28690 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28691 SDValue StackSlot =
28692 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28693 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28694 MachineMemOperand *MMO =
28695 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28696
28697 // Store FP control word into memory.
28698 SDValue Ops[] = {Chain, StackSlot};
28699 Chain = DAG.getMemIntrinsicNode(
28700 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28701
28702 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28703 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28704 Chain = CWD.getValue(1);
28705 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28706 DAG.getConstant(0xf3ff, DL, MVT::i16));
28707
28708 // Calculate new rounding mode.
28709 SDValue NewRM = Op.getNode()->getOperand(1);
28710 SDValue RMBits;
28711 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28712 uint64_t RM = CVal->getZExtValue();
28713 int FieldVal = X86::getRoundingModeX86(RM);
28714
28715 if (FieldVal == X86::rmInvalid) {
28716 FieldVal = X86::rmToNearest;
28717 LLVMContext &C = MF.getFunction().getContext();
28718 C.diagnose(DiagnosticInfoUnsupported(
28719 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28720 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28721 }
28722 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28723 } else {
28724 // Need to convert argument into bits of control word:
28725 // 0 Round to 0 -> 11
28726 // 1 Round to nearest -> 00
28727 // 2 Round to +inf -> 10
28728 // 3 Round to -inf -> 01
28729 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
28730 // To make the conversion, put all these values into a value 0xc9 and shift
28731 // it left depending on the rounding mode:
28732 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28733 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28734 // ...
28735 // (0xc9 << (2 * NewRM + 4)) & 0xc00
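// Worked example: NewRM = 2 (round to +inf) gives a shift of 2*2+4 = 8, and
// (0xc9 << 8) & 0xc00 = 0x800, i.e. bits 11:10 = 10, the x87 encoding of
// round to +inf.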
28736 SDValue ShiftValue =
28737 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28738 DAG.getNode(ISD::ADD, DL, MVT::i32,
28739 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28740 DAG.getConstant(1, DL, MVT::i8)),
28741 DAG.getConstant(4, DL, MVT::i32)));
28742 SDValue Shifted =
28743 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28744 ShiftValue);
28745 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28746 DAG.getConstant(0xc00, DL, MVT::i16));
28747 }
28748
28749 // Update rounding mode bits and store the new FP Control Word into stack.
28750 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28751 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28752
28753 // Load FP control word from the slot.
28754 SDValue OpsLD[] = {Chain, StackSlot};
28755 MachineMemOperand *MMOL =
28756 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28757 Chain = DAG.getMemIntrinsicNode(
28758 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28759
28760 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28761 // same way but in bits 14:13.
28762 if (Subtarget.hasSSE1()) {
28763 // Store MXCSR into memory.
28764 Chain = DAG.getNode(
28765 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28766 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28767 StackSlot);
28768
28769 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28770 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28771 Chain = CWD.getValue(1);
28772 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28773 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28774
28775 // Shift X87 RM bits from 11:10 to 14:13.
28776 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28777 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28778 DAG.getConstant(3, DL, MVT::i8));
28779
28780 // Update rounding mode bits and store the new FP Control Word into stack.
28781 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28782 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28783
28784 // Load MXCSR from the slot.
28785 Chain = DAG.getNode(
28786 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28787 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28788 StackSlot);
28789 }
28790
28791 return Chain;
28792}
28793
28794const unsigned X87StateSize = 28;
28795const unsigned FPStateSize = 32;
28796[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28797
28798SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28799 SelectionDAG &DAG) const {
28800 MachineFunction &MF = DAG.getMachineFunction();
28801 SDLoc DL(Op);
28802 SDValue Chain = Op->getOperand(0);
28803 SDValue Ptr = Op->getOperand(1);
28804 auto *Node = cast<FPStateAccessSDNode>(Op);
28805 EVT MemVT = Node->getMemoryVT();
28806 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
28807 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28808
28809 // Get the x87 state, if the target has x87.
28810 if (Subtarget.hasX87()) {
28811 Chain =
28812 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28813 {Chain, Ptr}, MemVT, MMO);
28814
28815 // FNSTENV changes the exception mask, so load back the stored environment.
28816 MachineMemOperand::Flags NewFlags =
28817 MachineMemOperand::MOLoad |
28818 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28819 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28820 Chain =
28821 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28822 {Chain, Ptr}, MemVT, MMO);
28823 }
28824
28825 // If target supports SSE, get MXCSR as well.
28826 if (Subtarget.hasSSE1()) {
28827 // Get pointer to the MXCSR location in memory.
28828 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28829 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28830 DAG.getConstant(X87StateSize, DL, PtrVT));
28831 // Store MXCSR into memory.
28832 Chain = DAG.getNode(
28833 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28834 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28835 MXCSRAddr);
28836 }
28837
28838 return Chain;
28839}
28840
28841 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL,
28842 EVT MemVT, MachineMemOperand *MMO,
28843 SelectionDAG &DAG,
28844 const X86Subtarget &Subtarget) {
28845 // Set the x87 state, if the target has x87.
28846 if (Subtarget.hasX87())
28847 Chain =
28848 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28849 {Chain, Ptr}, MemVT, MMO);
28850 // If target supports SSE, set MXCSR as well.
28851 if (Subtarget.hasSSE1()) {
28852 // Get pointer to the MXCSR location in memory.
28853 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28854 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28855 DAG.getConstant(X87StateSize, DL, PtrVT));
28856 // Load MXCSR from memory.
28857 Chain = DAG.getNode(
28858 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28859 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28860 MXCSRAddr);
28861 }
28862 return Chain;
28863}
28864
28865SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28866 SelectionDAG &DAG) const {
28867 SDLoc DL(Op);
28868 SDValue Chain = Op->getOperand(0);
28869 SDValue Ptr = Op->getOperand(1);
28870 auto *Node = cast<FPStateAccessSDNode>(Op);
28871 EVT MemVT = Node->getMemoryVT();
28872 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
28873 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28874 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28875}
28876
28877SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28878 SelectionDAG &DAG) const {
28879 MachineFunction &MF = DAG.getMachineFunction();
28880 SDLoc DL(Op);
28881 SDValue Chain = Op.getNode()->getOperand(0);
28882
28883 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28884 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28885 SmallVector<Constant *, 8> FPEnvVals;
28886
28887 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28888 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28889 // for compatibility with glibc.
28890 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28891 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28892 Constant *Zero = ConstantInt::get(ItemTy, 0);
28893 for (unsigned I = 0; I < 6; ++I)
28894 FPEnvVals.push_back(Zero);
28895
28896 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28897 // all exception flags, and sets DAZ and FTZ to 0.
28898 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28899 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28900 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28901 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28902 MachinePointerInfo MPI =
28903 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
28904 MachineMemOperand *MMO = MF.getMachineMemOperand(
28905 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
28906
28907 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28908}
28909
28910// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28911uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28912 assert((Amt < 8) && "Shift/Rotation amount out of range");
28913 switch (Opcode) {
28914 case ISD::BITREVERSE:
28915 return 0x8040201008040201ULL;
28916 case ISD::SHL:
28917 return ((0x0102040810204080ULL >> (Amt)) &
28918 (0x0101010101010101ULL * (0xFF >> (Amt))));
28919 case ISD::SRL:
28920 return ((0x0102040810204080ULL << (Amt)) &
28921 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28922 case ISD::SRA:
28923 return (getGFNICtrlImm(ISD::SRL, Amt) |
28924 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28925 case ISD::ROTL:
28926 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28927 case ISD::ROTR:
28928 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28929 }
28930 llvm_unreachable("Unsupported GFNI opcode");
28931}
28932
28933// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28934SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28935 MVT VT, unsigned Amt = 0) {
28936 assert(VT.getVectorElementType() == MVT::i8 &&
28937 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28938 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28939 SmallVector<SDValue> MaskBits;
28940 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28941 uint64_t Bits = (Imm >> (I % 64)) & 255;
28942 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28943 }
28944 return DAG.getBuildVector(VT, DL, MaskBits);
28945}
28946
28947 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
28948//
28949// i8/i16 vector implemented using dword LZCNT vector instruction
28950// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28951 // split the vector, perform the operation on its Lo and Hi parts and
28952// concatenate the results.
28953 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28954 const X86Subtarget &Subtarget) {
28955 assert(Op.getOpcode() == ISD::CTLZ);
28956 SDLoc dl(Op);
28957 MVT VT = Op.getSimpleValueType();
28958 MVT EltVT = VT.getVectorElementType();
28959 unsigned NumElems = VT.getVectorNumElements();
28960
28961 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28962 "Unsupported element type");
28963
28964 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28965 if (NumElems > 16 ||
28966 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28967 return splitVectorIntUnary(Op, DAG, dl);
28968
28969 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28970 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28971 "Unsupported value type for operation");
28972
28973 // Use the natively supported vector instruction vplzcntd.
28974 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28975 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28976 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28977 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28978
28979 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28980}
28981
28982// Lower CTLZ using a PSHUFB lookup table implementation.
28983 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28984 const X86Subtarget &Subtarget,
28985 SelectionDAG &DAG) {
28986 MVT VT = Op.getSimpleValueType();
28987 int NumElts = VT.getVectorNumElements();
28988 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28989 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28990
28991 // Per-nibble leading zero PSHUFB lookup table.
28992 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28993 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28994 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28995 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28996
28997 SmallVector<SDValue, 64> LUTVec;
28998 for (int i = 0; i < NumBytes; ++i)
28999 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29000 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29001
29002 // Begin by bitcasting the input to byte vector, then split those bytes
29003 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29004 // If the hi input nibble is zero then we add both results together, otherwise
29005 // we just take the hi result (by masking the lo result to zero before the
29006 // add).
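// Worked example for the byte 0x1A (0b00011010): the hi nibble 0x1 maps to
// LUT[1] = 3 and the lo nibble 0xA maps to LUT[10] = 0; since the hi nibble is
// non-zero the lo result is masked to zero, giving 3 + 0 = 3 == ctlz(0x1A).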
29007 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29008 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29009
29010 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29011 SDValue Lo = Op0;
29012 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29013 SDValue HiZ;
29014 if (CurrVT.is512BitVector()) {
29015 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29016 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29017 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29018 } else {
29019 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29020 }
29021
29022 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29023 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29024 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29025 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29026
29027 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29028 // of the current vector width in the same way we did for the nibbles.
29029 // If the upper half of the input element is zero then add the halves'
29030 // leading zero counts together, otherwise just use the upper half's.
29031 // Double the width of the result until we are at target width.
29032 while (CurrVT != VT) {
29033 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29034 int CurrNumElts = CurrVT.getVectorNumElements();
29035 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29036 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29037 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29038
29039 // Check if the upper half of the input element is zero.
29040 if (CurrVT.is512BitVector()) {
29041 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29042 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29043 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29044 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29045 } else {
29046 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29047 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29048 }
29049 HiZ = DAG.getBitcast(NextVT, HiZ);
29050
29051 // Move the upper/lower halves to the lower bits as we'll be extending to
29052 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29053 // together.
29054 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29055 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29056 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29057 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29058 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29059 CurrVT = NextVT;
29060 }
29061
29062 return Res;
29063}
29064
29065 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29066 const X86Subtarget &Subtarget,
29067 SelectionDAG &DAG) {
29068 MVT VT = Op.getSimpleValueType();
29069
29070 if (Subtarget.hasCDI() &&
29071 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29072 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29073 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29074
29075 // Decompose 256-bit ops into smaller 128-bit ops.
29076 if (VT.is256BitVector() && !Subtarget.hasInt256())
29077 return splitVectorIntUnary(Op, DAG, DL);
29078
29079 // Decompose 512-bit ops into smaller 256-bit ops.
29080 if (VT.is512BitVector() && !Subtarget.hasBWI())
29081 return splitVectorIntUnary(Op, DAG, DL);
29082
29083 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29084 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29085}
29086
29087 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29088 SelectionDAG &DAG,
29089 const X86Subtarget &Subtarget) {
29090 MVT VT = Op.getSimpleValueType();
29091 SDValue Input = Op.getOperand(0);
29092
29093 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29094 "Expected vXi8 input for GFNI-based CTLZ lowering");
29095
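// Sketch of the approach: ctlz(x) == cttz(bitreverse(x)). After reversing the
// bits, (Reversed & -Reversed) isolates the lowest set bit, and the
// gf2p8affine transform below maps that single-bit value (or zero) to its
// trailing-zero count.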
29096 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29097
29098 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29099 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29100
29101 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29102 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29103 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29104
29105 SDValue LZCNT =
29106 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29107 DAG.getTargetConstant(8, DL, MVT::i8));
29108 return LZCNT;
29109}
29110
29111static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29112 SelectionDAG &DAG) {
29113 MVT VT = Op.getSimpleValueType();
29114 MVT OpVT = VT;
29115 unsigned NumBits = VT.getSizeInBits();
29116 SDLoc dl(Op);
29117 unsigned Opc = Op.getOpcode();
29118
29119 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29120 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29121
29122 if (VT.isVector())
29123 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29124
29125 Op = Op.getOperand(0);
29126 if (VT == MVT::i8) {
29127 // Zero extend to i32 since there is no i8 bsr.
29128 OpVT = MVT::i32;
29129 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29130 }
29131
29132 // Check if we can safely pass a result through BSR for zero sources.
29133 SDValue PassThru = DAG.getUNDEF(OpVT);
29134 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29135 !DAG.isKnownNeverZero(Op))
29136 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29137
29138 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29139 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29140 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29141
29142 // Skip CMOV if we're using a pass through value.
29143 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29144 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29145 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29146 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29147 Op.getValue(1)};
29148 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29149 }
29150
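// For a non-zero source, BSR returns the index of the highest set bit, so for
// i32: ctlz(x) = 31 - bsr(x) = bsr(x) ^ 31. For example x = 0x00010000 gives
// bsr = 16 and 16 ^ 31 = 15 leading zeros.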
29151 // Finally xor with NumBits-1.
29152 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29153 DAG.getConstant(NumBits - 1, dl, OpVT));
29154
29155 if (VT == MVT::i8)
29156 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29157 return Op;
29158}
29159
29160static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29161 SelectionDAG &DAG) {
29162 MVT VT = Op.getSimpleValueType();
29163 unsigned NumBits = VT.getScalarSizeInBits();
29164 SDValue N0 = Op.getOperand(0);
29165 SDLoc dl(Op);
29166 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29167
29168 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29169 "Only scalar CTTZ requires custom lowering");
29170
29171 // Check if we can safely pass a result through BSF for zero sources.
29172 SDValue PassThru = DAG.getUNDEF(VT);
29173 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29174 PassThru = DAG.getConstant(NumBits, dl, VT);
29175
29176 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29177 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29178 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29179
29180 // Skip CMOV if src is never zero or we're using a pass through value.
29181 if (NonZeroSrc || !PassThru.isUndef())
29182 return Op;
29183
29184 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29185 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29186 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29187 Op.getValue(1)};
29188 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29189}
29190
29191 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29192 const X86Subtarget &Subtarget) {
29193 MVT VT = Op.getSimpleValueType();
29194 SDLoc DL(Op);
29195
29196 if (VT == MVT::i16 || VT == MVT::i32)
29197 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29198
29199 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29200 return splitVectorIntBinary(Op, DAG, DL);
29201
29202 assert(Op.getSimpleValueType().is256BitVector() &&
29203 Op.getSimpleValueType().isInteger() &&
29204 "Only handle AVX 256-bit vector integer operation");
29205 return splitVectorIntBinary(Op, DAG, DL);
29206}
29207
29208 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29209 const X86Subtarget &Subtarget) {
29210 MVT VT = Op.getSimpleValueType();
29211 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29212 unsigned Opcode = Op.getOpcode();
29213 SDLoc DL(Op);
29214
29215 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29216 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29217 assert(Op.getSimpleValueType().isInteger() &&
29218 "Only handle AVX vector integer operation");
29219 return splitVectorIntBinary(Op, DAG, DL);
29220 }
29221
29222 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29224 EVT SetCCResultType =
29225 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29226
29227 unsigned BitWidth = VT.getScalarSizeInBits();
29228 if (Opcode == ISD::USUBSAT) {
29229 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29230 // Handle a special-case with a bit-hack instead of cmp+select:
29231 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29232 // If the target can use VPTERNLOG, DAGToDAG will match this as
29233 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29234 // "broadcast" constant load.
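// Worked example for i8 with SMIN = 0x80: if X u>= 0x80 then X s>> 7 is
// all-ones and X ^ 0x80 == X - 0x80; if X u< 0x80 then X s>> 7 is zero and the
// AND yields 0 -- exactly usubsat(X, 0x80).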
29235 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29236 if (C && C->getAPIntValue().isSignMask()) {
29237 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29238 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29239 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29240 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29241 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29242 }
29243 }
29244 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29245 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29246 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29247 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29248 // TODO: Move this to DAGCombiner?
29249 if (SetCCResultType == VT &&
29250 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29251 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29252 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29253 }
29254 }
29255
29256 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29257 (!VT.isVector() || VT == MVT::v2i64)) {
29258 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29259 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29260 SDValue Zero = DAG.getConstant(0, DL, VT);
29261 SDValue Result =
29262 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29263 DAG.getVTList(VT, SetCCResultType), X, Y);
29264 SDValue SumDiff = Result.getValue(0);
29265 SDValue Overflow = Result.getValue(1);
29266 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29267 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29268 SDValue SumNeg =
29269 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29270 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29271 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29272 }
29273
29274 // Use default expansion.
29275 return SDValue();
29276}
29277
29278static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29279 SelectionDAG &DAG) {
29280 MVT VT = Op.getSimpleValueType();
29281 SDLoc DL(Op);
29282
29283 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29284 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29285 // 8-bit integer abs to NEG and CMOV.
29286 SDValue N0 = Op.getOperand(0);
29287 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29288 DAG.getConstant(0, DL, VT), N0);
29289 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29290 SDValue(Neg.getNode(), 1)};
29291 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29292 }
29293
29294 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
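// BLENDV selects each element based on the sign bit of the mask operand (X
// itself here), so negative elements take 0-X and the rest keep X.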
29295 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29296 SDValue Src = Op.getOperand(0);
29297 SDValue Neg = DAG.getNegative(Src, DL, VT);
29298 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29299 }
29300
29301 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29302 assert(VT.isInteger() &&
29303 "Only handle AVX 256-bit vector integer operation");
29304 return splitVectorIntUnary(Op, DAG, DL);
29305 }
29306
29307 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29308 return splitVectorIntUnary(Op, DAG, DL);
29309
29310 // Default to expand.
29311 return SDValue();
29312}
29313
29314static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29315 SelectionDAG &DAG) {
29316 MVT VT = Op.getSimpleValueType();
29317 SDLoc DL(Op);
29318
29319 // For AVX1 cases, split to use legal ops.
29320 if (VT.is256BitVector() && !Subtarget.hasInt256())
29321 return splitVectorIntBinary(Op, DAG, DL);
29322
29323 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29324 return splitVectorIntBinary(Op, DAG, DL);
29325
29326 // Default to expand.
29327 return SDValue();
29328}
29329
29330static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29331 SelectionDAG &DAG) {
29332 MVT VT = Op.getSimpleValueType();
29333 SDLoc DL(Op);
29334
29335 // For AVX1 cases, split to use legal ops.
29336 if (VT.is256BitVector() && !Subtarget.hasInt256())
29337 return splitVectorIntBinary(Op, DAG, DL);
29338
29339 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29340 return splitVectorIntBinary(Op, DAG, DL);
29341
29342 // Default to expand.
29343 return SDValue();
29344}
29345
29346 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29347 SelectionDAG &DAG) {
29348 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29349 EVT VT = Op.getValueType();
29350 SDValue X = Op.getOperand(0);
29351 SDValue Y = Op.getOperand(1);
29352 SDLoc DL(Op);
29353 bool IsMaxOp =
29354 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29355 bool IsNum =
29356 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29357 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29358 unsigned Opc = 0;
29359 if (VT.isVector())
29360 Opc = X86ISD::VMINMAX;
29361 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29362 Opc = X86ISD::VMINMAXS;
29363
29364 if (Opc) {
29365 SDValue Imm =
29366 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29367 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29368 }
29369 }
29370
29371 uint64_t SizeInBits = VT.getScalarSizeInBits();
29372 APInt PreferredZero = APInt::getZero(SizeInBits);
29373 APInt OppositeZero = PreferredZero;
29374 EVT IVT = VT.changeTypeToInteger();
29375 X86ISD::NodeType MinMaxOp;
29376 if (IsMaxOp) {
29377 MinMaxOp = X86ISD::FMAX;
29378 OppositeZero.setSignBit();
29379 } else {
29380 PreferredZero.setSignBit();
29381 MinMaxOp = X86ISD::FMIN;
29382 }
29383 EVT SetCCType =
29384 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29385
29386 // The tables below show the expected result of Max in cases of NaN and
29387 // signed zeros.
29388 //
29389 // Y Y
29390 // Num xNaN +0 -0
29391 // --------------- ---------------
29392 // Num | Max | Y | +0 | +0 | +0 |
29393 // X --------------- X ---------------
29394 // xNaN | X | X/Y | -0 | +0 | -0 |
29395 // --------------- ---------------
29396 //
29397 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29398 // reordering.
29399 //
29400 // We check if any of operands is NaN and return NaN. Then we check if any of
29401 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29402 // to ensure the correct zero is returned.
29403 auto MatchesZero = [](SDValue Op, APInt Zero) {
29404 Op = peekThroughBitcasts(Op);
29405 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29406 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29407 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29408 return CstOp->getAPIntValue() == Zero;
29409 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29410 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29411 for (const SDValue &OpVal : Op->op_values()) {
29412 if (OpVal.isUndef())
29413 continue;
29414 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29415 if (!CstOp)
29416 return false;
29417 if (!CstOp->getValueAPF().isZero())
29418 continue;
29419 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29420 return false;
29421 }
29422 return true;
29423 }
29424 return false;
29425 };
29426
29427 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29428 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29429 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29430 Op->getFlags().hasNoSignedZeros() ||
29431 DAG.isKnownNeverZeroFloat(X) ||
29432 DAG.isKnownNeverZeroFloat(Y);
29433 SDValue NewX, NewY;
29434 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29435 MatchesZero(X, OppositeZero)) {
29436 // Operands are already in right order or order does not matter.
29437 NewX = X;
29438 NewY = Y;
29439 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29440 NewX = Y;
29441 NewY = X;
29442 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29443 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29444 if (IsXNeverNaN)
29445 std::swap(X, Y);
29446 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29447 // to an xmm register.
29448 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29449 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29450 // Bits of classes:
29451 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29452 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29453 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29454 DL, MVT::i32);
29455 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29456 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29457 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29458 DAG.getVectorIdxConstant(0, DL));
29459 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29460 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29461 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29462 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29463 } else {
29464 SDValue IsXSigned;
29465 if (Subtarget.is64Bit() || VT != MVT::f64) {
29466 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29467 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29468 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29469 } else {
29470 assert(VT == MVT::f64);
29471 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29472 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29473 DAG.getVectorIdxConstant(0, DL));
29474 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29475 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29476 DAG.getVectorIdxConstant(1, DL));
29477 Hi = DAG.getBitcast(MVT::i32, Hi);
29478 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29479 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29480 *DAG.getContext(), MVT::i32);
29481 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29482 }
29483 if (MinMaxOp == X86ISD::FMAX) {
29484 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29485 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29486 } else {
29487 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29488 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29489 }
29490 }
29491
29492 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29493 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29494
29495 // If we did not need to order the operands for signed-zero handling, we do
29496 // need to handle NaN, and we know that one of the operands is not NaN, then:
29497 // - For minimum/maximum, put it in the first operand,
29498 // - For minimumnum/maximumnum, put it in the second operand,
29499 // and we will not need to post handle NaN after max/min.
29500 if (IgnoreSignedZero && !IgnoreNaN &&
29501 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29502 std::swap(NewX, NewY);
29503
29504 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29505
29506 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29507 return MinMax;
29508
29509 if (DAG.isKnownNeverNaN(NewX))
29510 NewX = NewY;
29511
29512 SDValue IsNaN =
29513 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29514
29515 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29516}
29517
29518static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29519 SelectionDAG &DAG) {
29520 MVT VT = Op.getSimpleValueType();
29521 SDLoc dl(Op);
29522
29523 // For AVX1 cases, split to use legal ops.
29524 if (VT.is256BitVector() && !Subtarget.hasInt256())
29525 return splitVectorIntBinary(Op, DAG, dl);
29526
29527 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29528 return splitVectorIntBinary(Op, DAG, dl);
29529
29530 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29531 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29532
29533 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29534 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29535 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29536
29537 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29538 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29539 if (VT.bitsGE(MVT::i32)) {
29540 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29541 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29542 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29543 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29544 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29545 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29546 DAG.getTargetConstant(CC, dl, MVT::i8),
29547 Diff1.getValue(1));
29548 }
29549
29550 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29551 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29552 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29553 MVT WideVT = MVT::getIntegerVT(WideBits);
29554 if (TLI.isTypeLegal(WideVT)) {
29555 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29556 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29557 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29558 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29559 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29560 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29561 DAG.getTargetConstant(CC, dl, MVT::i8),
29562 Diff1.getValue(1));
29563 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29564 }
29565 }
29566
29567 // Default to expand.
29568 return SDValue();
29569}
29570
29571static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29572 SelectionDAG &DAG) {
29573 SDLoc dl(Op);
29574 MVT VT = Op.getSimpleValueType();
29575
29576 // Decompose 256-bit ops into 128-bit ops.
29577 if (VT.is256BitVector() && !Subtarget.hasInt256())
29578 return splitVectorIntBinary(Op, DAG, dl);
29579
29580 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29581 return splitVectorIntBinary(Op, DAG, dl);
29582
29583 SDValue A = Op.getOperand(0);
29584 SDValue B = Op.getOperand(1);
29585
29586 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29587 // vector pairs, multiply and truncate.
29588 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29589 unsigned NumElts = VT.getVectorNumElements();
29590 unsigned NumLanes = VT.getSizeInBits() / 128;
29591 unsigned NumEltsPerLane = NumElts / NumLanes;
29592
29593 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29594 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29595 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29596 return DAG.getNode(
29597 ISD::TRUNCATE, dl, VT,
29598 DAG.getNode(ISD::MUL, dl, ExVT,
29599 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29600 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29601 }
29602
29603 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29604
29605 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29606 // Don't do this if we only need to unpack one half.
29607 if (Subtarget.hasSSSE3()) {
29608 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29609 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29610 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29611 if (BIsBuildVector) {
29612 for (auto [Idx, Val] : enumerate(B->ops())) {
29613 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29614 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29615 else
29616 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29617 }
29618 }
29619 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29620 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29621 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29622 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29623 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29624 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29625 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29626 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29627 DAG.getTargetConstant(8, dl, MVT::i8));
29628 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29629 }
29630 }
29631
29632 // Extract the lo/hi parts to any extend to i16.
29633 // We're going to mask off the low byte of each result element of the
29634 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29635 // element.
29636 SDValue Undef = DAG.getUNDEF(VT);
29637 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29638 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29639
29640 SDValue BLo, BHi;
29641 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29642 // If the RHS is a constant, manually unpackl/unpackh.
29643 SmallVector<SDValue, 16> LoOps, HiOps;
29644 for (unsigned i = 0; i != NumElts; i += 16) {
29645 for (unsigned j = 0; j != 8; ++j) {
29646 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29647 MVT::i16));
29648 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29649 MVT::i16));
29650 }
29651 }
29652
29653 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29654 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29655 } else {
29656 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29657 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29658 }
29659
29660 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29661 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29662 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29663 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29664 }
29665
29666 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29667 if (VT == MVT::v4i32) {
29668 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29669 "Should not custom lower when pmulld is available!");
29670
29671 // Extract the odd parts.
29672 static const int UnpackMask[] = {1, 1, 3, 3};
29673 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29674 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29675
29676 // Multiply the even parts.
29677 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29678 DAG.getBitcast(MVT::v2i64, A),
29679 DAG.getBitcast(MVT::v2i64, B));
29680 // Now multiply odd parts.
29681 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29682 DAG.getBitcast(MVT::v2i64, Aodds),
29683 DAG.getBitcast(MVT::v2i64, Bodds));
29684
29685 Evens = DAG.getBitcast(VT, Evens);
29686 Odds = DAG.getBitcast(VT, Odds);
29687
29688 // Merge the two vectors back together with a shuffle. This expands into 2
29689 // shuffles.
29690 static const int ShufMask[] = { 0, 4, 2, 6 };
29691 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29692 }
29693
29694 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29695 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29696 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29697
29698 // Ahi = psrlqi(a, 32);
29699 // Bhi = psrlqi(b, 32);
29700 //
29701 // AloBlo = pmuludq(a, b);
29702 // AloBhi = pmuludq(a, Bhi);
29703 // AhiBlo = pmuludq(Ahi, b);
29704 //
29705 // Hi = psllqi(AloBhi + AhiBlo, 32);
29706 // return AloBlo + Hi;
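// This is the usual 32x32 partial-product decomposition: with A = Ahi*2^32 +
// Alo and B = Bhi*2^32 + Blo, A*B mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo)
// << 32); the Ahi*Bhi term is shifted entirely out of the low 64 bits.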
29707 KnownBits AKnown = DAG.computeKnownBits(A);
29708 KnownBits BKnown = DAG.computeKnownBits(B);
29709
29710 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29711 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29712 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29713
29714 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29715 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29716 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29717
29718 SDValue Zero = DAG.getConstant(0, dl, VT);
29719
29720 // Only multiply lo/hi halves that aren't known to be zero.
29721 SDValue AloBlo = Zero;
29722 if (!ALoIsZero && !BLoIsZero)
29723 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29724
29725 SDValue AloBhi = Zero;
29726 if (!ALoIsZero && !BHiIsZero) {
29727 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29728 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29729 }
29730
29731 SDValue AhiBlo = Zero;
29732 if (!AHiIsZero && !BLoIsZero) {
29733 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29734 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29735 }
29736
29737 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29738 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29739
29740 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29741}
29742
29743 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29744 MVT VT, bool IsSigned,
29745 const X86Subtarget &Subtarget,
29746 SelectionDAG &DAG,
29747 SDValue *Low = nullptr) {
29748 unsigned NumElts = VT.getVectorNumElements();
29749
29750 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29751 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29752 // lane results back together.
29753
29754 // We'll take different approaches for signed and unsigned.
29755 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29756 // and use pmullw to calculate the full 16-bit product.
29757 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29758 // shift them left into the upper byte of each word. This allows us to use
29759 // pmulhw to calculate the full 16-bit product. This trick means we don't
29760 // need to sign extend the bytes to use pmullw.
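// Concretely: unpacking with Zero in the low byte makes each word hold
// a*2^8 as a signed value, so pmulhw((a << 8), (b << 8)) computes
// (a*2^8 * b*2^8) >> 16 = a*b, the exact 16-bit signed product.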
29761
29762 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29763 SDValue Zero = DAG.getConstant(0, dl, VT);
29764
29765 SDValue ALo, AHi;
29766 if (IsSigned) {
29767 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29768 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29769 } else {
29770 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29771 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29772 }
29773
29774 SDValue BLo, BHi;
29775 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29776 // If the RHS is a constant, manually unpackl/unpackh and extend.
29777 SmallVector<SDValue, 16> LoOps, HiOps;
29778 for (unsigned i = 0; i != NumElts; i += 16) {
29779 for (unsigned j = 0; j != 8; ++j) {
29780 SDValue LoOp = B.getOperand(i + j);
29781 SDValue HiOp = B.getOperand(i + j + 8);
29782
29783 if (IsSigned) {
29784 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29785 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29786 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29787 DAG.getConstant(8, dl, MVT::i16));
29788 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29789 DAG.getConstant(8, dl, MVT::i16));
29790 } else {
29791 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29792 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29793 }
29794
29795 LoOps.push_back(LoOp);
29796 HiOps.push_back(HiOp);
29797 }
29798 }
29799
29800 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29801 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29802 } else if (IsSigned) {
29803 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29804 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29805 } else {
29806 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29807 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29808 }
29809
29810 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29811 // pack back to vXi8.
29812 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29813 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29814 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29815
29816 if (Low)
29817 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29818
29819 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29820}
29821
29822static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29823 SelectionDAG &DAG) {
29824 SDLoc dl(Op);
29825 MVT VT = Op.getSimpleValueType();
29826 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29827 unsigned NumElts = VT.getVectorNumElements();
29828 SDValue A = Op.getOperand(0);
29829 SDValue B = Op.getOperand(1);
29830
29831 // Decompose 256-bit ops into 128-bit ops.
29832 if (VT.is256BitVector() && !Subtarget.hasInt256())
29833 return splitVectorIntBinary(Op, DAG, dl);
29834
29835 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29836 return splitVectorIntBinary(Op, DAG, dl);
29837
29838 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29839 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29840 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29841 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29842
29843 // PMULxD operations multiply each even value (starting at 0) of LHS with
29844 // the related value of RHS and produce a widened result.
29845 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29846 // => <2 x i64> <ae|cg>
29847 //
29848 // In other words, to have all the results, we need to perform two PMULxD:
29849 // 1. one with the even values.
29850 // 2. one with the odd values.
29851 // To achieve #2, we need to place the odd values at an even position.
29852 //
29853 // Place the odd value at an even position (basically, shift all values 1
29854 // step to the left):
29855 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29856 9, -1, 11, -1, 13, -1, 15, -1};
29857 // <a|b|c|d> => <b|undef|d|undef>
29858 SDValue Odd0 =
29859 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29860 // <e|f|g|h> => <f|undef|h|undef>
29861 SDValue Odd1 =
29862 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29863
29864 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29865 // ints.
29866 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29867 unsigned Opcode =
29868 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29869 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29870 // => <2 x i64> <ae|cg>
29871 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29872 DAG.getBitcast(MulVT, A),
29873 DAG.getBitcast(MulVT, B)));
29874 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29875 // => <2 x i64> <bf|dh>
29876 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29877 DAG.getBitcast(MulVT, Odd0),
29878 DAG.getBitcast(MulVT, Odd1)));
29879
29880 // Shuffle it back into the right order.
29881 SmallVector<int, 16> ShufMask(NumElts);
29882 for (int i = 0; i != (int)NumElts; ++i)
29883 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
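// For v4i32 this mask is {1, 5, 3, 7}: it picks the high 32-bit half of each
// 64-bit product (ae, bf, cg, dh), restoring the original element order.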
29884
29885 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29886
29887 // If we have a signed multiply but no PMULDQ fix up the result of an
29888 // unsigned multiply.
29889 if (IsSigned && !Subtarget.hasSSE41()) {
29890 SDValue Zero = DAG.getConstant(0, dl, VT);
29891 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29892 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29893 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29894 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29895
29896 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29897 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29898 }
29899
29900 return Res;
29901 }
29902
29903 // Only i8 vectors should need custom lowering after this.
29904 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29905 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29906 "Unsupported vector type");
29907
29908 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29909 // logical shift down the upper half and pack back to i8.
29910
29911 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29912 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29913
29914 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29915 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29916 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29917 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29918 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29919 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29920 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29921 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29922 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29923 }
29924
29925 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29926}
29927
29928// Custom lowering for SMULO/UMULO.
29929static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29930 SelectionDAG &DAG) {
29931 MVT VT = Op.getSimpleValueType();
29932
29933 // Scalars defer to LowerXALUO.
29934 if (!VT.isVector())
29935 return LowerXALUO(Op, DAG);
29936
29937 SDLoc dl(Op);
29938 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29939 SDValue A = Op.getOperand(0);
29940 SDValue B = Op.getOperand(1);
29941 EVT OvfVT = Op->getValueType(1);
29942
29943 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29944 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29945 // Extract the LHS Lo/Hi vectors
29946 SDValue LHSLo, LHSHi;
29947 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29948
29949 // Extract the RHS Lo/Hi vectors
29950 SDValue RHSLo, RHSHi;
29951 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29952
29953 EVT LoOvfVT, HiOvfVT;
29954 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29955 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29956 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29957
29958 // Issue the split operations.
29959 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29960 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29961
29962 // Join the separate data results and the overflow results.
29963 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29964 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29965 Hi.getValue(1));
29966
29967 return DAG.getMergeValues({Res, Ovf}, dl);
29968 }
29969
29970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29971 EVT SetccVT =
29972 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29973
29974 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29975 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29976 unsigned NumElts = VT.getVectorNumElements();
29977 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29978 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29979 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29980 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29981 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29982
29983 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29984
29985 SDValue Ovf;
29986 if (IsSigned) {
29987 SDValue High, LowSign;
29988 if (OvfVT.getVectorElementType() == MVT::i1 &&
29989 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29990 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29991 // Shift the high down filling with sign bits.
29992 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29993 // Fill all 16 bits with the sign bit from the low.
29994 LowSign =
29995 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29996 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29997 15, DAG);
29998 SetccVT = OvfVT;
29999 if (!Subtarget.hasBWI()) {
30000 // We can't do a vXi16 compare so sign extend to v16i32.
30001 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30002 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30003 }
30004 } else {
30005 // Otherwise do the compare at vXi8.
30006 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30007 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30008 LowSign =
30009 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30010 }
30011
30012 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30013 } else {
30014 SDValue High =
30015 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30016 if (OvfVT.getVectorElementType() == MVT::i1 &&
30017 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30018 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30019 SetccVT = OvfVT;
30020 if (!Subtarget.hasBWI()) {
30021 // We can't do a vXi16 compare so sign extend to v16i32.
30022 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30023 }
30024 } else {
30025 // Otherwise do the compare at vXi8.
30026 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30027 }
30028
30029 Ovf =
30030 DAG.getSetCC(dl, SetccVT, High,
30031 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30032 }
30033
30034 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30035
30036 return DAG.getMergeValues({Low, Ovf}, dl);
30037 }
30038
30039 SDValue Low;
30040 SDValue High =
30041 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30042
30043 SDValue Ovf;
30044 if (IsSigned) {
30045 // SMULO overflows if the high bits don't match the sign of the low.
30046 SDValue LowSign =
30047 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30048 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30049 } else {
30050 // UMULO overflows if the high bits are non-zero.
30051 Ovf =
30052 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30053 }
30054
30055 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30056
30057 return DAG.getMergeValues({Low, Ovf}, dl);
30058}
30059
30060SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30061 assert(Subtarget.isTargetWin64() && "Unexpected target");
30062 EVT VT = Op.getValueType();
30063 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30064 "Unexpected return type for lowering");
30065
30066 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30067    SmallVector<SDValue> Result;
30068    if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30069 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30070 }
30071
30072 RTLIB::Libcall LC;
30073 bool isSigned;
30074 switch (Op->getOpcode()) {
30075 // clang-format off
30076 default: llvm_unreachable("Unexpected request for libcall!");
30077 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30078 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30079 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30080 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30081 // clang-format on
30082 }
30083
30084 SDLoc dl(Op);
30085 SDValue InChain = DAG.getEntryNode();
30086
30087  TargetLowering::ArgListTy Args;
30088 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30089 EVT ArgVT = Op->getOperand(i).getValueType();
30090 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30091 "Unexpected argument type for lowering");
30092 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30093 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30094    MachinePointerInfo MPI =
30095        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30096 InChain =
30097 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30098 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30099 }
30100
30101  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30102                                         getPointerTy(DAG.getDataLayout()));
30103
30104 TargetLowering::CallLoweringInfo CLI(DAG);
30105 CLI.setDebugLoc(dl)
30106 .setChain(InChain)
30107 .setLibCallee(
30108            getLibcallCallingConv(LC),
30109            static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30110 std::move(Args))
30111 .setInRegister()
30112 .setSExtResult(isSigned)
30113 .setZExtResult(!isSigned);
30114
30115 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30116 return DAG.getBitcast(VT, CallInfo.first);
30117}
30118
30119SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30120 SelectionDAG &DAG,
30121 SDValue &Chain) const {
30122 assert(Subtarget.isTargetWin64() && "Unexpected target");
30123 EVT VT = Op.getValueType();
30124 bool IsStrict = Op->isStrictFPOpcode();
30125
30126 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30127 EVT ArgVT = Arg.getValueType();
30128
30129 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30130 "Unexpected return type for lowering");
30131
30132 RTLIB::Libcall LC;
30133 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30134 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30135 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30136 else
30137 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30138 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30139
30140 SDLoc dl(Op);
30141 MakeLibCallOptions CallOptions;
30142 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30143
30144  SDValue Result;
30145 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30146 // expected VT (i128).
30147 std::tie(Result, Chain) =
30148 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30149 Result = DAG.getBitcast(VT, Result);
30150 return Result;
30151}
30152
30153SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30154 SelectionDAG &DAG) const {
30155 assert(Subtarget.isTargetWin64() && "Unexpected target");
30156 EVT VT = Op.getValueType();
30157 bool IsStrict = Op->isStrictFPOpcode();
30158
30159 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30160 EVT ArgVT = Arg.getValueType();
30161
30162 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30163 "Unexpected argument type for lowering");
30164
30165 RTLIB::Libcall LC;
30166 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30167 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30168 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30169 else
30170 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30171 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30172
30173 SDLoc dl(Op);
30174 MakeLibCallOptions CallOptions;
30175 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30176
30177 // Pass the i128 argument as an indirect argument on the stack.
30178 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30179 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30180  MachinePointerInfo MPI =
30181      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30182 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30183
30184  SDValue Result;
30185 std::tie(Result, Chain) =
30186 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30187 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30188}
30189
30190// Return true if the required (according to Opcode) shift-imm form is natively
30191// supported by the Subtarget
30192static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30193 unsigned Opcode) {
30194 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30195 "Unexpected shift opcode");
30196
30197 if (!VT.isSimple())
30198 return false;
30199
30200 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30201 return false;
30202
30203 if (VT.getScalarSizeInBits() < 16)
30204 return false;
30205
30206 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30207 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30208 return true;
30209
30210 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30211 (VT.is256BitVector() && Subtarget.hasInt256());
30212
30213 bool AShift = LShift && (Subtarget.hasAVX512() ||
30214 (VT != MVT::v2i64 && VT != MVT::v4i64));
30215 return (Opcode == ISD::SRA) ? AShift : LShift;
30216}
30217
30218// The shift amount is a variable, but it is the same for all vector lanes.
30219// These instructions are defined together with shift-immediate.
30220static
30221bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30222 unsigned Opcode) {
30223 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30224}
30225
30226// Return true if the required (according to Opcode) variable-shift form is
30227// natively supported by the Subtarget
30228static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30229 unsigned Opcode) {
30230 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30231 "Unexpected shift opcode");
30232
30233 if (!VT.isSimple())
30234 return false;
30235
30236 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30237 return false;
30238
30239 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30240 return false;
30241
30242 // vXi16 supported only on AVX-512, BWI
30243 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30244 return false;
30245
30246 if (Subtarget.hasAVX512() &&
30247 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30248 return true;
30249
30250 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30251 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30252 return (Opcode == ISD::SRA) ? AShift : LShift;
30253}
30254
30255static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30256 const X86Subtarget &Subtarget) {
30257 MVT VT = Op.getSimpleValueType();
30258 SDLoc dl(Op);
30259 SDValue R = Op.getOperand(0);
30260 SDValue Amt = Op.getOperand(1);
30261 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30262 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30263
30264 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30265 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30266 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30267 SDValue Ex = DAG.getBitcast(ExVT, R);
30268
30269 // ashr(R, 63) === cmp_slt(R, 0)
30270 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30271 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30272 "Unsupported PCMPGT op");
30273 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30274 }
30275
30276 if (ShiftAmt >= 32) {
30277 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
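      // For example, a v2i64 ashr by 40: viewed as v4i32 [lo0,hi0,lo1,hi1],
      // Upper splats each i32's sign bit and Lower computes hi >>s (40 - 32);
      // the shuffle then pairs Lower's hi-derived element (new low i32) with
      // Upper's sign-splat (new high i32) to rebuild each shifted i64.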
30278 SDValue Upper =
30279 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30280      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30281                                             ShiftAmt - 32, DAG);
30282 if (VT == MVT::v2i64)
30283 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30284 if (VT == MVT::v4i64)
30285 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30286 {9, 1, 11, 3, 13, 5, 15, 7});
30287 } else {
30288 // SRA upper i32, SRL whole i64 and select lower i32.
30289      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30290                                             ShiftAmt, DAG);
30291 SDValue Lower =
30292 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30293 Lower = DAG.getBitcast(ExVT, Lower);
30294 if (VT == MVT::v2i64)
30295 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30296 if (VT == MVT::v4i64)
30297 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30298 {8, 1, 10, 3, 12, 5, 14, 7});
30299 }
30300 return DAG.getBitcast(VT, Ex);
30301 };
30302
30303 // Optimize shl/srl/sra with constant shift amount.
30304 APInt APIntShiftAmt;
30305 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30306 return SDValue();
30307
30308 // If the shift amount is out of range, return undef.
30309 if (APIntShiftAmt.uge(EltSizeInBits))
30310 return DAG.getUNDEF(VT);
30311
30312 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30313
30314 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30315 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30316
30317 // i64 SRA needs to be performed as partial shifts.
30318 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30319 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30320 Op.getOpcode() == ISD::SRA)
30321 return ArithmeticShiftRight64(ShiftAmt);
30322
30323  // If we're logical shifting an all-signbits value then we can just perform
30324  // it as a mask.
30325 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30326 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30327 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30328 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30329 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30330 }
30331
30332 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30333 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30334 unsigned NumElts = VT.getVectorNumElements();
30335 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30336
30337 // Simple i8 add case
30338 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30339 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30340 // must be 0). (add undef, undef) however can be any value. To make this
30341 // safe, we must freeze R to ensure that register allocation uses the same
30342 // register for an undefined value. This ensures that the result will
30343 // still be even and preserves the original semantics.
30344 R = DAG.getFreeze(R);
30345 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30346 }
30347
30348 // ashr(R, 7) === cmp_slt(R, 0)
30349 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30350 SDValue Zeros = DAG.getConstant(0, dl, VT);
30351 if (VT.is512BitVector()) {
30352 assert(VT == MVT::v64i8 && "Unexpected element type!");
30353 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30354 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30355 }
30356 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30357 }
30358
30359 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30360 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30361 return SDValue();
30362
30363 if (Subtarget.hasGFNI()) {
30364 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30365 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30366 DAG.getTargetConstant(0, dl, MVT::i8));
30367 }
30368
30369 if (Op.getOpcode() == ISD::SHL) {
30370 // Make a large shift.
30371 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30372 ShiftAmt, DAG);
30373 SHL = DAG.getBitcast(VT, SHL);
30374 // Zero out the rightmost bits.
30375 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30376 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30377 }
30378 if (Op.getOpcode() == ISD::SRL) {
30379 // Make a large shift.
30380 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30381 ShiftAmt, DAG);
30382 SRL = DAG.getBitcast(VT, SRL);
30383 // Zero out the leftmost bits.
30384 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30385 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30386 }
30387 if (Op.getOpcode() == ISD::SRA) {
30388 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
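      // For example (i8): R = 0x90 (-112), Amt = 4: lshr -> 0x09, Mask = 128 >> 4
      // = 0x08, xor -> 0x01, sub -> 0x01 - 0x08 = 0xF9 (-7), matching -112 >>s 4.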
30389 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30390
30391 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30392 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30393 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30394 return Res;
30395 }
30396 llvm_unreachable("Unknown shift opcode.");
30397 }
30398
30399 return SDValue();
30400}
30401
30402static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30403 const X86Subtarget &Subtarget) {
30404 MVT VT = Op.getSimpleValueType();
30405 SDLoc dl(Op);
30406 SDValue R = Op.getOperand(0);
30407 SDValue Amt = Op.getOperand(1);
30408 unsigned Opcode = Op.getOpcode();
30409 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30410
30411 int BaseShAmtIdx = -1;
30412 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30413 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30414 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30415 Subtarget, DAG);
30416
30417 // vXi8 shifts - shift as v8i16 + mask result.
30418 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30419 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30420 VT == MVT::v64i8) &&
30421 !Subtarget.hasXOP()) {
30422 unsigned NumElts = VT.getVectorNumElements();
30423 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30424 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30425 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30426 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30427
30428 // Create the mask using vXi16 shifts. For shift-rights we need to move
30429 // the upper byte down before splatting the vXi8 mask.
30430 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30431 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30432 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30433 if (Opcode != ISD::SHL)
30434 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30435 8, DAG);
30436 BitMask = DAG.getBitcast(VT, BitMask);
30437 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30438 SmallVector<int, 64>(NumElts, 0));
30439
30440 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30441 DAG.getBitcast(ExtVT, R), BaseShAmt,
30442 BaseShAmtIdx, Subtarget, DAG);
30443 Res = DAG.getBitcast(VT, Res);
30444 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30445
30446 if (Opcode == ISD::SRA) {
30447 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30448 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30449 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30450 SignMask =
30451 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30452 BaseShAmtIdx, Subtarget, DAG);
30453 SignMask = DAG.getBitcast(VT, SignMask);
30454 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30455 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30456 }
30457 return Res;
30458 }
30459 }
30460 }
30461
30462 return SDValue();
30463}
30464
30465// Convert a shift/rotate left amount to a multiplication scale factor.
30466static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30467                                           const X86Subtarget &Subtarget,
30468 SelectionDAG &DAG) {
30469 MVT VT = Amt.getSimpleValueType();
30470 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30471 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30472 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30473 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30474 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30475 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30476 return SDValue();
30477
30478 MVT SVT = VT.getVectorElementType();
30479 unsigned SVTBits = SVT.getSizeInBits();
30480 unsigned NumElems = VT.getVectorNumElements();
30481
30482 APInt UndefElts;
30483 SmallVector<APInt> EltBits;
30484 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30485 APInt One(SVTBits, 1);
30486 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30487 for (unsigned I = 0; I != NumElems; ++I) {
30488 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30489 continue;
30490 uint64_t ShAmt = EltBits[I].getZExtValue();
30491 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30492 }
30493 return DAG.getBuildVector(VT, dl, Elts);
30494 }
30495
30496 // If the target doesn't support variable shifts, use either FP conversion
30497 // or integer multiplication to avoid shifting each element individually.
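  // For example, in the v4i32 path below with Amt = 5: (5 << 23) + 0x3f800000 is
  // the IEEE-754 encoding of 32.0f, so the FP_TO_SINT yields 1 << 5 per lane.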
30498 if (VT == MVT::v4i32) {
30499 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30500 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30501 DAG.getConstant(0x3f800000U, dl, VT));
30502 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30503 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30504 }
30505
30506 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30507 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30508 SDValue Z = DAG.getConstant(0, dl, VT);
30509 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30510 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30511 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30512 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30513 if (Subtarget.hasSSE41())
30514 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30515 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30516 }
30517
30518 return SDValue();
30519}
30520
30521static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30522 SelectionDAG &DAG) {
30523 MVT VT = Op.getSimpleValueType();
30524 SDLoc dl(Op);
30525 SDValue R = Op.getOperand(0);
30526 SDValue Amt = Op.getOperand(1);
30527 unsigned NumElts = VT.getVectorNumElements();
30528 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30529 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30530
30531 unsigned Opc = Op.getOpcode();
30532 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30533 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30534
30535 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30536 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30537
30538 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30539 return V;
30540
30541 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30542 return V;
30543
30544 if (supportedVectorVarShift(VT, Subtarget, Opc))
30545 return Op;
30546
30547 // i64 vector arithmetic shift can be emulated with the transform:
30548 // M = lshr(SIGN_MASK, Amt)
30549 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30550 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30551 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30552 Opc == ISD::SRA) {
30553 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30554 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30555 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30556 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30557 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30558 return R;
30559 }
30560
30561 // XOP has 128-bit variable logical/arithmetic shifts.
30562 // +ve/-ve Amt = shift left/right.
30563 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30564 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30565 if (Opc == ISD::SRL || Opc == ISD::SRA)
30566 Amt = DAG.getNegative(Amt, dl, VT);
30567 if (Opc == ISD::SHL || Opc == ISD::SRL)
30568 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30569 if (Opc == ISD::SRA)
30570 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30571 }
30572
30573 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30574 // shifts per-lane and then shuffle the partial results back together.
30575 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30576 // Splat the shift amounts so the scalar shifts above will catch it.
30577 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30578 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30579 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30580 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30581 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30582 }
30583
30584  // Build a map from each in-range constant amount to a mask of the elements
30585  // where it occurs.
30585  SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30586  if (ConstantAmt) {
30587 for (unsigned I = 0; I != NumElts; ++I) {
30588 SDValue A = Amt.getOperand(I);
30589 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30590 continue;
30591 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30592 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30593 if (!Inserted) {
30594 It->second.setBit(I);
30595 continue;
30596 }
30597 It->second = APInt::getOneBitSet(NumElts, I);
30598 }
30599 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30600 }
30601
30602 // If possible, lower this shift as a sequence of two shifts by
30603 // constant plus a BLENDing shuffle instead of scalarizing it.
30604 // Example:
30605 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30606 //
30607 // Could be rewritten as:
30608 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30609 //
30610 // The advantage is that the two shifts from the example would be
30611 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30612 if (UniqueCstAmt.size() == 2 &&
30613 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30614 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30615 unsigned AmtA = UniqueCstAmt.begin()->first;
30616 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30617 const APInt &MaskA = UniqueCstAmt.begin()->second;
30618 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30619 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30620 for (unsigned I = 0; I != NumElts; ++I) {
30621 if (MaskA[I])
30622 ShuffleMask[I] = I;
30623 if (MaskB[I])
30624 ShuffleMask[I] = I + NumElts;
30625 }
30626
30627 // Only perform this blend if we can perform it without loading a mask.
30628 if ((VT != MVT::v16i16 ||
30629 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30630 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30631 canWidenShuffleElements(ShuffleMask))) {
30632 SDValue Shift1 =
30633 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30634 SDValue Shift2 =
30635 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30636 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30637 }
30638 }
30639
30640 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30641 // using vYiM vector operations where X*N == Y*M and M > N.
30642 if (ConstantAmt &&
30643 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30644 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30645 !Subtarget.hasXOP()) {
30646 MVT NarrowScalarVT = VT.getScalarType();
30647 // We can do this extra fast if each pair of narrow elements is shifted by
30648 // the same amount by doing this SWAR style: use a shift to move the valid
30649 // bits to the right position, mask out any bits which crossed from one
30650 // element to the other.
30651 // This optimized lowering is only valid if the elements in a pair can
30652 // be treated identically.
30653 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30654 SmallVector<SDValue, 32> TmpAmtWideElts;
30655 int WideEltSizeInBits = EltSizeInBits;
30656 while (WideEltSizeInBits < 32) {
30657 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30658 // unprofitable.
30659 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30660 break;
30661 }
30662 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30663 bool SameShifts = true;
30664 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30665 unsigned DstI = SrcI / 2;
30666 // Both elements are undef? Make a note and keep going.
30667 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30668 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30669 continue;
30670 }
30671 // Even element is undef? We will shift it by the same shift amount as
30672 // the odd element.
30673 if (AmtWideElts[SrcI].isUndef()) {
30674 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30675 continue;
30676 }
30677 // Odd element is undef? We will shift it by the same shift amount as
30678 // the even element.
30679 if (AmtWideElts[SrcI + 1].isUndef()) {
30680 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30681 continue;
30682 }
30683 // Both elements are equal.
30684 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30685 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30686 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30687 continue;
30688 }
30689 // One of the provisional wide elements will not have the same shift
30690 // amount. Let's bail.
30691 SameShifts = false;
30692 break;
30693 }
30694 if (!SameShifts) {
30695 break;
30696 }
30697 WideEltSizeInBits *= 2;
30698 std::swap(TmpAmtWideElts, AmtWideElts);
30699 }
30700 APInt APIntShiftAmt;
30701 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30702 bool Profitable = WidenShift;
30703 // AVX512BW brings support for vpsllvw.
30704 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30705 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30706 Profitable = false;
30707 }
30708 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30709 // fairly cheaply in other ways.
30710 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30711 Profitable = false;
30712 }
30713 // Leave it up to GFNI if we have it around.
30714 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30715 // is probably a win to use other strategies in some cases.
30716 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30717 Profitable = false;
30718 }
30719
30720 // AVX1 does not have vpand which makes our masking impractical. It does
30721 // have vandps but that is an FP instruction and crossing FP<->int typically
30722 // has some cost.
30723 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30724 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30725 Profitable = false;
30726 }
30727 unsigned WideNumElts = AmtWideElts.size();
30728 // We are only dealing with identical pairs.
30729 if (Profitable && WideNumElts != NumElts) {
30730 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30731 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30732 // Cast the operand to vXiM.
30733 SDValue RWide = DAG.getBitcast(WideVT, R);
30734 // Create our new vector of shift amounts.
30735 SDValue AmtWide = DAG.getBuildVector(
30736 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30737 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30738 // Perform the actual shift.
30739 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30740 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30741 // Now we need to construct a mask which will "drop" bits that get
30742 // shifted past the LSB/MSB. For a logical shift left, it will look
30743 // like:
30744 // FullMask = (1 << EltSizeInBits) - 1
30745 // Mask = FullMask << Amt
30746 //
30747 // This masking ensures that bits cannot migrate from one narrow lane to
30748 // another. The construction of this mask will be constant folded.
30749 // The mask for a logical right shift is nearly identical, the only
30750 // difference is that the all ones mask is shifted right instead of left.
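      // For example, a v16i8 srl by 3 done as v8i16: each i16 lane is shifted
      // right by 3, and the mask (0xFF >> 3 = 0x1F per byte) clears the three
      // bits that leaked in from the neighbouring byte.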
30751 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30752 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30753 Mask = DAG.getBitcast(WideVT, Mask);
30754 // Finally, we mask the shifted vector with the SWAR mask.
30755 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30756 Masked = DAG.getBitcast(VT, Masked);
30757 if (Opc != ISD::SRA) {
30758 // Logical shifts are complete at this point.
30759 return Masked;
30760 }
30761 // At this point, we have done a *logical* shift right. We now need to
30762 // sign extend the result so that we get behavior equivalent to an
30763 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30764 // are `EltSizeInBits-AmtWide` bits wide.
30765 //
30766 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30767 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30768 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30769 // can use the following trick to accomplish this:
30770 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30771 // (Masked ^ SignBitMask) - SignBitMask
30772 //
30773 // When the sign bit is already clear, this will compute:
30774 // Masked + SignBitMask - SignBitMask
30775 //
30776 // This is equal to Masked which is what we want: the sign bit was clear
30777 // so sign extending should be a no-op.
30778 //
30779 // When the sign bit is set, this will compute:
30780 // Masked - SignBitmask - SignBitMask
30781 //
30782 // This is equal to Masked - 2*SignBitMask which will correctly sign
30783 // extend our result.
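      // For example, with EltSizeInBits = 8 and AmtWide = 3: Masked holds a
      // 5-bit value and SignBitMask = 0x10; for Masked = 0x1F, 0x1F ^ 0x10 = 0x0F
      // and 0x0F - 0x10 = 0xFF (-1), i.e. the 5-bit value 0b11111 correctly
      // sign extends to -1.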
30784 SDValue SplatHighBit =
30785 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30786 // This does not induce recursion, all operands are constants.
30787 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30788 SDValue FlippedSignBit =
30789 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30790 SDValue Subtraction =
30791 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30792 return Subtraction;
30793 }
30794 }
30795
30796 // If possible, lower this packed shift into a vector multiply instead of
30797 // expanding it into a sequence of scalar shifts.
30798 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30799 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30800 Subtarget.canExtendTo512BW())))
30801 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30802 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30803
30804 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30805 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
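  // For example, srl x, 3 on vXi16 becomes mulhu(x, 1 << (16 - 3)): (x * 0x2000)
  // >> 16 == x >> 3. Amt == 0 would need a scale of 1 << 16, which doesn't fit in
  // i16, hence the select on Amt == 0 below.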
30806 if (Opc == ISD::SRL && ConstantAmt &&
30807 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30808 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30809 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30810 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30811 SDValue Zero = DAG.getConstant(0, dl, VT);
30812 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30813 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30814 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30815 }
30816 }
30817
30818 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30819 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30820 // TODO: Special case handling for shift by 0/1, really we can afford either
30821 // of these cases in pre-SSE41/XOP/AVX512 but not both.
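  // For example, sra x, 5 becomes mulhs(x, 1 << (16 - 5)). Amt == 0 (scale 1 << 16)
  // and Amt == 1 (scale 0x8000, a negative i16) don't work with MULHS, so both are
  // handled with explicit selects below.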
30822 if (Opc == ISD::SRA && ConstantAmt &&
30823 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30824 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30825 !Subtarget.hasAVX512()) ||
30826 DAG.isKnownNeverZero(Amt))) {
30827 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30828 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30829 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30830 SDValue Amt0 =
30831 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30832 SDValue Amt1 =
30833 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30834 SDValue Sra1 =
30835 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30836 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30837 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30838 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30839 }
30840 }
30841
30842 // v4i32 Non Uniform Shifts.
30843 // If the shift amount is constant we can shift each lane using the SSE2
30844 // immediate shifts, else we need to zero-extend each lane to the lower i64
30845 // and shift using the SSE2 variable shifts.
30846 // The separate results can then be blended together.
30847 if (VT == MVT::v4i32) {
30848 SDValue Amt0, Amt1, Amt2, Amt3;
30849 if (ConstantAmt) {
30850 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30851 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30852 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30853 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30854 } else {
30855 // The SSE2 shifts use the lower i64 as the same shift amount for
30856 // all lanes and the upper i64 is ignored. On AVX we're better off
30857 // just zero-extending, but for SSE just duplicating the top 16-bits is
30858 // cheaper and has the same effect for out of range values.
30859 if (Subtarget.hasAVX()) {
30860 SDValue Z = DAG.getConstant(0, dl, VT);
30861 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30862 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30863 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30864 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30865 } else {
30866 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30867 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30868 {4, 5, 6, 7, -1, -1, -1, -1});
30869 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30870 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30871 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30872 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30873 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30874 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30875 }
30876 }
30877
30878 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30879 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30880 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30881 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30882 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30883
30884 // Merge the shifted lane results optimally with/without PBLENDW.
30885 // TODO - ideally shuffle combining would handle this.
30886 if (Subtarget.hasSSE41()) {
30887 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30888 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30889 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30890 }
30891 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30892 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30893 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30894 }
30895
30896 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30897 // look up the pre-computed shift values.
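  // For example, if a lane of R splats 0x01 and Opc == ISD::SHL, the per-lane LUT
  // is {1,2,4,8,16,32,64,128, 0,...} and PSHUFB selects LUT[Amt[i]] per byte within
  // each 128-bit lane (out-of-range amounts index the zero/undef upper half).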
30898 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30899 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30900 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30901 unsigned NumLanes = VT.getSizeInBits() / 128u;
30902    unsigned NumEltsPerLane = NumElts / NumLanes;
30903    SmallVector<APInt, 64> LUT;
30904 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30905 unsigned LoElt = Lane * NumEltsPerLane;
30906 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30907 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30908 if (!KnownLane.isConstant())
30909 break;
30910 const APInt &LaneSplat = KnownLane.getConstant();
30911 for (unsigned I = 0; I != 8; ++I) {
30912 if (Opc == ISD::SHL)
30913 LUT.push_back(LaneSplat.shl(I));
30914 else if (Opc == ISD::SRL)
30915 LUT.push_back(LaneSplat.lshr(I));
30916 else if (Opc == ISD::SRA)
30917 LUT.push_back(LaneSplat.ashr(I));
30918 }
30919 LUT.append(8, APInt::getZero(8));
30920 }
30921 if (LUT.size() == NumElts) {
30922 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30923 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30924 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30925 }
30926 }
30927
30928 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30929 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30930 // make the existing SSE solution better.
30931  // NOTE: We honor the preferred vector width before promoting to 512-bits.
30932 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30933 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30934 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30935 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30936 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30937 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30938 "Unexpected vector type");
30939 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30940 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30941 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30942 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30943 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30944 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30945 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30946 }
30947
30948 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30949 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
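  // For example, srl x, 3 per i8 lane: extend to i16, multiply by 1 << (8 - 3) = 32,
  // then take the high byte: (x * 32) >> 8 == x >> 3.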
30950 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30951 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30952 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30953 !Subtarget.hasXOP()) {
30954 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30955 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30956
30957 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30958 // isn't legal).
30959 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30960 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30961 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30962 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30963    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30964           "Constant build vector expected");
30965
30966 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30967 bool IsSigned = Opc == ISD::SRA;
30968 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30969 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30970 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30971 return DAG.getZExtOrTrunc(R, dl, VT);
30972 }
30973
30974 SmallVector<SDValue, 16> LoAmt, HiAmt;
30975 for (unsigned i = 0; i != NumElts; i += 16) {
30976 for (int j = 0; j != 8; ++j) {
30977 LoAmt.push_back(Amt.getOperand(i + j));
30978 HiAmt.push_back(Amt.getOperand(i + j + 8));
30979 }
30980 }
30981
30982 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30983 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30984
30985 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30986 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30987 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30988 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30989 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30990 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30991 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30992 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30993 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30994 }
30995
30996 if (VT == MVT::v16i8 ||
30997 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30998 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30999 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31000
31001 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31002 if (VT.is512BitVector()) {
31003 // On AVX512BW targets we make use of the fact that VSELECT lowers
31004 // to a masked blend which selects bytes based just on the sign bit
31005 // extracted to a mask.
31006 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31007 V0 = DAG.getBitcast(VT, V0);
31008 V1 = DAG.getBitcast(VT, V1);
31009 Sel = DAG.getBitcast(VT, Sel);
31010 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31011 ISD::SETGT);
31012 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31013 } else if (Subtarget.hasSSE41()) {
31014 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31015 // on the sign bit.
31016 V0 = DAG.getBitcast(VT, V0);
31017 V1 = DAG.getBitcast(VT, V1);
31018 Sel = DAG.getBitcast(VT, Sel);
31019 return DAG.getBitcast(SelVT,
31020 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31021 }
31022 // On pre-SSE41 targets we test for the sign bit by comparing to
31023 // zero - a negative value will set all bits of the lanes to true
31024 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31025 SDValue Z = DAG.getConstant(0, dl, SelVT);
31026 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31027 return DAG.getSelect(dl, SelVT, C, V0, V1);
31028 };
31029
31030 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31031 // We can safely do this using i16 shifts as we're only interested in
31032 // the 3 lower bits of each byte.
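    // For example, Amt = 5 (0b101): after << 5 bit 2 sits in the byte's sign bit,
    // so the shift-by-4 below is taken; after the first a += a bit 1 (clear) is
    // tested and skipped; after the second a += a bit 0 selects the final
    // shift-by-1, giving 4 + 1 == 5 in total.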
31033 Amt = DAG.getBitcast(ExtVT, Amt);
31034 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31035 Amt = DAG.getBitcast(VT, Amt);
31036
31037 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31038 // r = VSELECT(r, shift(r, 4), a);
31039 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31040 R = SignBitSelect(VT, Amt, M, R);
31041
31042 // a += a
31043 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31044
31045 // r = VSELECT(r, shift(r, 2), a);
31046 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31047 R = SignBitSelect(VT, Amt, M, R);
31048
31049 // a += a
31050 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31051
31052 // return VSELECT(r, shift(r, 1), a);
31053 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31054 R = SignBitSelect(VT, Amt, M, R);
31055 return R;
31056 }
31057
31058 if (Opc == ISD::SRA) {
31059 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31060 // so we can correctly sign extend. We don't care what happens to the
31061 // lower byte.
31062 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31063 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31064 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31065 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31066 ALo = DAG.getBitcast(ExtVT, ALo);
31067 AHi = DAG.getBitcast(ExtVT, AHi);
31068 RLo = DAG.getBitcast(ExtVT, RLo);
31069 RHi = DAG.getBitcast(ExtVT, RHi);
31070
31071 // r = VSELECT(r, shift(r, 4), a);
31072 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31073 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31074 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31075 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31076
31077 // a += a
31078 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31079 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31080
31081 // r = VSELECT(r, shift(r, 2), a);
31082 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31083 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31084 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31085 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31086
31087 // a += a
31088 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31089 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31090
31091 // r = VSELECT(r, shift(r, 1), a);
31092 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31093 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31094 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31095 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31096
31097 // Logical shift the result back to the lower byte, leaving a zero upper
31098 // byte meaning that we can safely pack with PACKUSWB.
31099 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31100 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31101 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31102 }
31103 }
31104
31105 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31106 MVT ExtVT = MVT::v8i32;
31107 SDValue Z = DAG.getConstant(0, dl, VT);
31108 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31109 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31110 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31111 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31112 ALo = DAG.getBitcast(ExtVT, ALo);
31113 AHi = DAG.getBitcast(ExtVT, AHi);
31114 RLo = DAG.getBitcast(ExtVT, RLo);
31115 RHi = DAG.getBitcast(ExtVT, RHi);
31116 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31117 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31118 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31119 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31120 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31121 }
31122
31123 if (VT == MVT::v8i16) {
31124 // If we have a constant shift amount, the non-SSE41 path is best as
31125    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31126 bool UseSSE41 = Subtarget.hasSSE41() &&
31127                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31128
31129 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31130 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31131 // the sign bit.
31132 if (UseSSE41) {
31133 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31134 V0 = DAG.getBitcast(ExtVT, V0);
31135 V1 = DAG.getBitcast(ExtVT, V1);
31136 Sel = DAG.getBitcast(ExtVT, Sel);
31137 return DAG.getBitcast(
31138 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31139 }
31140 // On pre-SSE41 targets we splat the sign bit - a negative value will
31141 // set all bits of the lanes to true and VSELECT uses that in
31142 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31143 SDValue C =
31144 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31145 return DAG.getSelect(dl, VT, C, V0, V1);
31146 };
31147
31148 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31149 if (UseSSE41) {
31150 // On SSE41 targets we need to replicate the shift mask in both
31151 // bytes for PBLENDVB.
31152 Amt = DAG.getNode(
31153 ISD::OR, dl, VT,
31154 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31155 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31156 } else {
31157 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31158 }
31159
31160 // r = VSELECT(r, shift(r, 8), a);
31161 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31162 R = SignBitSelect(Amt, M, R);
31163
31164 // a += a
31165 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31166
31167 // r = VSELECT(r, shift(r, 4), a);
31168 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31169 R = SignBitSelect(Amt, M, R);
31170
31171 // a += a
31172 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31173
31174 // r = VSELECT(r, shift(r, 2), a);
31175 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31176 R = SignBitSelect(Amt, M, R);
31177
31178 // a += a
31179 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31180
31181 // return VSELECT(r, shift(r, 1), a);
31182 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31183 R = SignBitSelect(Amt, M, R);
31184 return R;
31185 }
31186
31187 // Decompose 256-bit shifts into 128-bit shifts.
31188 if (VT.is256BitVector())
31189 return splitVectorIntBinary(Op, DAG, dl);
31190
31191 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31192 return splitVectorIntBinary(Op, DAG, dl);
31193
31194 return SDValue();
31195}
31196
31197static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31198 SelectionDAG &DAG) {
31199 MVT VT = Op.getSimpleValueType();
31200 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31201 "Unexpected funnel shift opcode!");
31202
31203 SDLoc DL(Op);
31204 SDValue Op0 = Op.getOperand(0);
31205 SDValue Op1 = Op.getOperand(1);
31206 SDValue Amt = Op.getOperand(2);
31207 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31208 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31209
31210 if (VT.isVector()) {
31211 APInt APIntShiftAmt;
31212 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31213 unsigned NumElts = VT.getVectorNumElements();
31214
31215 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31216
31217 if (IsCstSplat) {
31218 if (IsFSHR)
31219 std::swap(Op0, Op1);
31220 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31221 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31222 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31223 {Op0, Op1, Imm}, DAG, Subtarget);
31224 }
31225 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31226 {Op0, Op1, Amt}, DAG, Subtarget);
31227 }
31228 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31229 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31230 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31231 "Unexpected funnel shift type!");
31232
31233    // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31234    // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
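    // For example, for vXi8: unpack(y,x) forms i16 lanes holding x:y (x in the
    // upper byte); fshl shifts the pair left by z & 7 and keeps the upper byte
    // (the >> bw), while fshr shifts the pair right and keeps the lower byte.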
31235 if (IsCstSplat) {
31236 // TODO: Can't use generic expansion as UNDEF amt elements can be
31237 // converted to other values when folded to shift amounts, losing the
31238 // splat.
31239 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31240 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31241 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31242 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31243 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31244
31245 if (EltSizeInBits == 8 &&
31246 (Subtarget.hasXOP() ||
31247 (useVPTERNLOG(Subtarget, VT) &&
31248 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31249 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31250 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31251 // the original vector width to handle cases where we split.
31252 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31253 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31254 SDValue ShX =
31255 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31256 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31257 SDValue ShY =
31258 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31259 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31260 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31261 DAG.getConstant(MaskX, DL, VT));
31262 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31263 DAG.getConstant(MaskY, DL, VT));
31264 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31265 }
31266
31267 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31268 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31269 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31270 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31271 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31272 }
31273
31274 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31275 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31276 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31277
31278 // Constant vXi16 funnel shifts can be efficiently handled by default.
31279 if (IsCst && EltSizeInBits == 16)
31280 return SDValue();
31281
31282 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31283 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31284 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31285
31286 // Split 256-bit integers on XOP/pre-AVX2 targets.
31287 // Split 512-bit integers on non 512-bit BWI targets.
31288 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31289 !Subtarget.hasAVX2())) ||
31290 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31291 EltSizeInBits < 32)) {
31292 // Pre-mask the amount modulo using the wider vector.
31293 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31294 return splitVectorOp(Op, DAG, DL);
31295 }
31296
31297 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31298 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31299 int ScalarAmtIdx = -1;
31300 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31301 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31302 if (EltSizeInBits == 16)
31303 return SDValue();
31304
31305 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31306 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31307 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31308 ScalarAmtIdx, Subtarget, DAG);
31309 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31310 ScalarAmtIdx, Subtarget, DAG);
31311 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31312 }
31313 }
31314
31315 MVT WideSVT = MVT::getIntegerVT(
31316 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31317 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31318
31319 // If per-element shifts are legal, fallback to generic expansion.
31320 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31321 return SDValue();
31322
31323 // Attempt to fold as:
31324 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31325 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31326 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31327 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31328 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31329 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31330 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31331 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31332 EltSizeInBits, DAG);
31333 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31334 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31335 if (!IsFSHR)
31336 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31337 EltSizeInBits, DAG);
31338 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31339 }
31340
31341 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31342 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31343 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31344 SDValue Z = DAG.getConstant(0, DL, VT);
31345 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31346 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31347 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31348 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31349 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31350 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31351 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31352 }
31353
31354 // Fallback to generic expansion.
31355 return SDValue();
31356 }
31357 assert(
31358 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31359 "Unexpected funnel shift type!");
31360
31361 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31362 bool OptForSize = DAG.shouldOptForSize();
31363 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31364
31365 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31366 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31367 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31368 !isa<ConstantSDNode>(Amt)) {
31369 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31370 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31371 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31372 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31373 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31374 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31375 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31376 if (IsFSHR) {
31377 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31378 } else {
31379 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31380 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31381 }
31382 return DAG.getZExtOrTrunc(Res, DL, VT);
31383 }
31384
31385 if (VT == MVT::i8 || ExpandFunnel)
31386 return SDValue();
31387
31388 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31389 if (VT == MVT::i16) {
31390 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31391 DAG.getConstant(15, DL, Amt.getValueType()));
31392 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31393 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31394 }
31395
31396 return Op;
31397}
31398
31399static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31400 SelectionDAG &DAG) {
31401 MVT VT = Op.getSimpleValueType();
31402 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31403
31404 SDLoc DL(Op);
31405 SDValue R = Op.getOperand(0);
31406 SDValue Amt = Op.getOperand(1);
31407 unsigned Opcode = Op.getOpcode();
31408 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31409 int NumElts = VT.getVectorNumElements();
31410 bool IsROTL = Opcode == ISD::ROTL;
31411
31412 // Check for constant splat rotation amount.
31413 APInt CstSplatValue;
31414 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31415
31416 // Check for splat rotate by zero.
31417 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31418 return R;
31419
31420 // AVX512 implicitly uses modulo rotation amounts.
31421 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31422 // Attempt to rotate by immediate.
31423 if (IsCstSplat) {
31424 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31425 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31426 return DAG.getNode(RotOpc, DL, VT, R,
31427 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31428 }
31429
31430 // Else, fall-back on VPROLV/VPRORV.
31431 return Op;
31432 }
31433
31434 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31435 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31436 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31437 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31438 }
31439
31440 SDValue Z = DAG.getConstant(0, DL, VT);
31441
31442 if (!IsROTL) {
31443 // If the ISD::ROTR amount is constant, we're always better converting to
31444 // ISD::ROTL.
31445 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31446 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31447
31448 // XOP targets always prefer ISD::ROTL.
31449 if (Subtarget.hasXOP())
31450 return DAG.getNode(ISD::ROTL, DL, VT, R,
31451 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31452 }
31453
31454 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31455 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31456 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31457 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31458 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31459 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31460 DAG.getTargetConstant(0, DL, MVT::i8));
31461 }
31462
31463 // Split 256-bit integers on XOP/pre-AVX2 targets.
31464 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31465 return splitVectorIntBinary(Op, DAG, DL);
31466
31467 // XOP has 128-bit vector variable + immediate rotates.
31468 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31469 // XOP implicitly uses modulo rotation amounts.
31470 if (Subtarget.hasXOP()) {
31471 assert(IsROTL && "Only ROTL expected");
31472 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31473
31474 // Attempt to rotate by immediate.
31475 if (IsCstSplat) {
31476 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31477 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31478 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31479 }
31480
31481 // Use general rotate by variable (per-element).
31482 return Op;
31483 }
31484
31485 // Rotate by a uniform constant - expand back to shifts.
31486 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31487 // to other values when folded to shift amounts, losing the splat.
31488 if (IsCstSplat) {
31489 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31490 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31491 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31492 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31493 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31494 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31495 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31496 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31497 }
31498
31499 // Split 512-bit integers on non 512-bit BWI targets.
31500 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31501 return splitVectorIntBinary(Op, DAG, DL);
31502
31503 assert(
31504 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31505 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31506 Subtarget.hasAVX2()) ||
31507 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31508 "Only vXi32/vXi16/vXi8 vector rotates supported");
31509
31510 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31511 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31512
31513 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31514 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31515
31516 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31517 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31518 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
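// Worked example of the unpack fold for a single byte: with x = 0xB1 the
// doubled i16 lane is 0xB1B1; (0xB1B1 << 3) >> 8 yields 0x8D = rotl(0xB1, 3),
// and the low byte of 0xB1B1 >> 3 is 0x36 = rotr(0xB1, 3).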
31519 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31520 int BaseRotAmtIdx = -1;
31521 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31522 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31523 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31524 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31525 }
31526 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31527 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31528 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31529 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31530 BaseRotAmtIdx, Subtarget, DAG);
31531 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31532 BaseRotAmtIdx, Subtarget, DAG);
31533 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31534 }
31535 }
31536
31537 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31538 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31539
31540 // Attempt to fold as unpack(x,x) << zext(y):
31541 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31542 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31543 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31544 if (!(ConstantAmt && EltSizeInBits != 8) &&
31545 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31546 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31547 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31548 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31549 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31550 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31551 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31552 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31553 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31554 }
31555
31556 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31557 // the amount bit.
31558 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31559 if (EltSizeInBits == 8) {
31560 MVT WideVT =
31561 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31562
31563 // Attempt to fold as:
31564 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31565 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31566 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31567 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31568 // If we're rotating by constant, just use default promotion.
31569 if (ConstantAmt)
31570 return SDValue();
31571 // See if we can perform this by widening to vXi16 or vXi32.
31572 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31573 R = DAG.getNode(
31574 ISD::OR, DL, WideVT, R,
31575 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31576 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31577 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31578 if (IsROTL)
31579 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31580 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31581 }
31582
31583 // We don't need ModuloAmt here as we just peek at individual bits.
31584 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31585 if (Subtarget.hasSSE41()) {
31586 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31587 // on the sign bit.
31588 V0 = DAG.getBitcast(VT, V0);
31589 V1 = DAG.getBitcast(VT, V1);
31590 Sel = DAG.getBitcast(VT, Sel);
31591 return DAG.getBitcast(SelVT,
31592 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31593 }
31594 // On pre-SSE41 targets we test for the sign bit by comparing to
31595 // zero - a negative value will set all bits of the lanes to true
31596 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31597 SDValue Z = DAG.getConstant(0, DL, SelVT);
31598 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31599 return DAG.getSelect(DL, SelVT, C, V0, V1);
31600 };
31601
31602 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31603 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31604 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31605 IsROTL = true;
31606 }
31607
31608 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31609 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31610
31611 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31612 // We can safely do this using i16 shifts as we're only interested in
31613 // the 3 lower bits of each byte.
31614 Amt = DAG.getBitcast(ExtVT, Amt);
31615 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31616 Amt = DAG.getBitcast(VT, Amt);
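// After the shift-by-5, bit 2 of each original amount now sits in its byte's
// sign bit, so the first blend below picks rot(r, 4) exactly where that bit
// was set; each subsequent 'a += a' moves the next lower amount bit into the
// sign bit for the rot-2 and rot-1 stages.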
31617
31618 // r = VSELECT(r, rot(r, 4), a);
31619 SDValue M;
31620 M = DAG.getNode(
31621 ISD::OR, DL, VT,
31622 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31623 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31624 R = SignBitSelect(VT, Amt, M, R);
31625
31626 // a += a
31627 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31628
31629 // r = VSELECT(r, rot(r, 2), a);
31630 M = DAG.getNode(
31631 ISD::OR, DL, VT,
31632 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31633 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31634 R = SignBitSelect(VT, Amt, M, R);
31635
31636 // a += a
31637 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31638
31639 // return VSELECT(r, rot(r, 1), a);
31640 M = DAG.getNode(
31641 ISD::OR, DL, VT,
31642 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31643 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31644 return SignBitSelect(VT, Amt, M, R);
31645 }
31646
31647 bool IsSplatAmt = DAG.isSplatValue(Amt);
31648 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31649 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31650
31651 // Fallback for splats + all supported variable shifts.
31652 // Fallback for non-constant AVX2 vXi16 as well.
31653 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31654 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31655 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31656 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31657 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31658 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31659 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31660 }
31661
31662 // Everything below assumes ISD::ROTL.
31663 if (!IsROTL) {
31664 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31665 IsROTL = true;
31666 }
31667
31668 // ISD::ROT* uses modulo rotate amounts.
31669 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31670
31671 assert(IsROTL && "Only ROTL supported");
31672
31673 // As with shifts, attempt to convert the rotation amount to a multiplication
31674 // factor, fallback to general expansion.
31675 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31676 if (!Scale)
31677 return SDValue();
31678
31679 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31680 if (EltSizeInBits == 16) {
31681 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31682 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31683 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31684 }
31685
31686 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31687 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31688 // that can then be OR'd with the lower 32-bits.
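// Worked example: for rotl(0x80000001, 1) the scale is 2 and the 64-bit
// product is 0x0000000100000002; the low half 0x00000002 is (x << 1), the
// high half 0x00000001 is (x >> 31), and their OR 0x00000003 is the rotate.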
31689 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31690 static const int OddMask[] = {1, 1, 3, 3};
31691 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31692 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31693
31694 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31695 DAG.getBitcast(MVT::v2i64, R),
31696 DAG.getBitcast(MVT::v2i64, Scale));
31697 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31698 DAG.getBitcast(MVT::v2i64, R13),
31699 DAG.getBitcast(MVT::v2i64, Scale13));
31700 Res02 = DAG.getBitcast(VT, Res02);
31701 Res13 = DAG.getBitcast(VT, Res13);
31702
31703 return DAG.getNode(ISD::OR, DL, VT,
31704 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31705 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31706}
31707
31708/// Returns true if the operand type is exactly twice the native width, and
31709/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31710/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31711/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31712bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31713 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31714
31715 if (OpWidth == 64)
31716 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31717 if (OpWidth == 128)
31718 return Subtarget.canUseCMPXCHG16B();
31719
31720 return false;
31721}
31722
31723TargetLoweringBase::AtomicExpansionKind
31724X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31725 Type *MemType = SI->getValueOperand()->getType();
31726
31727 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31728 !Subtarget.useSoftFloat()) {
31729 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31730 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31731 return AtomicExpansionKind::None;
31732
31733 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31734 Subtarget.hasAVX())
31735 return AtomicExpansionKind::None;
31736 }
31737
31738 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31739 : AtomicExpansionKind::None;
31740}
31741
31742// Note: this turns large loads into lock cmpxchg8b/16b.
31743TargetLoweringBase::AtomicExpansionKind
31744X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31745 Type *MemType = LI->getType();
31746
31747 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31748 !Subtarget.useSoftFloat()) {
31749 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31750 // can use movq to do the load. If we have X87 we can load into an 80-bit
31751 // X87 register and store it to a stack temporary.
31752 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31753 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31754 return AtomicExpansionKind::None;
31755
31756 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31757 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31758 Subtarget.hasAVX())
31759 return AtomicExpansionKind::None;
31760 }
31761
31762 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31763 : AtomicExpansionKind::None;
31764}
31765
31766enum BitTestKind : unsigned {
31767 UndefBit,
31768 ConstantBit,
31769 NotConstantBit,
31770 ShiftBit,
31771 NotShiftBit
31772};
31773
31774static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31775 using namespace llvm::PatternMatch;
31776 BitTestKind BTK = UndefBit;
31777 if (auto *C = dyn_cast<ConstantInt>(V)) {
31778 // Check if V is a power of 2 or NOT power of 2.
31779 if (isPowerOf2_64(C->getZExtValue()))
31780 BTK = ConstantBit;
31781 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31782 BTK = NotConstantBit;
31783 return {V, BTK};
31784 }
31785
31786 // Check if V is some power of 2 pattern known to be non-zero
31787 if (auto *I = dyn_cast<Instruction>(V)) {
31788 bool Not = false;
31789 // Check if we have a NOT
31790 Value *PeekI;
31791 if (match(I, m_Not(m_Value(PeekI))) ||
31792 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31793 Not = true;
31794 I = dyn_cast<Instruction>(PeekI);
31795
31796 // If I is constant, it will fold and we can evaluate later. If it's an
31797 // argument or something of that nature, we can't analyze.
31798 if (I == nullptr)
31799 return {nullptr, UndefBit};
31800 }
31801 // We can only use 1 << X without more sophisticated analysis. C << X where
31802 // C is a power of 2 but not 1 can result in zero which cannot be translated
31803 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31804 if (I->getOpcode() == Instruction::Shl) {
31805 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31806 // -X` and some other provable power of 2 patterns that we can use CTZ on
31807 // may be profitable.
31808 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31809 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31810 // be provably a non-zero power of 2.
31811 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31812 // transformable to bittest.
31813 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31814 if (!ShiftVal)
31815 return {nullptr, UndefBit};
31816 if (ShiftVal->equalsInt(1))
31817 BTK = Not ? NotShiftBit : ShiftBit;
31818
31819 if (BTK == UndefBit)
31820 return {nullptr, UndefBit};
31821
31822 Value *BitV = I->getOperand(1);
31823
31824 // Read past a shiftmask instruction to find the count.
31825 Value *AndOp;
31826 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31827 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31828 BitV = AndOp;
31829
31830 return {BitV, BTK};
31831 }
31832 }
31833 return {nullptr, UndefBit};
31834}
31835
31836TargetLoweringBase::AtomicExpansionKind
31837X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31838 using namespace llvm::PatternMatch;
31839 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31840 // prefix to a normal instruction for these operations.
31841 if (AI->use_empty())
31842 return AtomicExpansionKind::None;
31843
31844 if (AI->getOperation() == AtomicRMWInst::Xor) {
31845 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31846 // preferable to both `cmpxchg` and `btc`.
31847 if (match(AI->getOperand(1), m_SignMask()))
31848 return AtomicExpansionKind::None;
31849 }
31850
31851 // If the atomicrmw's result is used by a single bit AND, we may use
31852 // bts/btr/btc instructions for these operations.
31853 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31854 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31855 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31856 // detect it.
31857 Instruction *I = AI->user_back();
31858 auto BitChange = FindSingleBitChange(AI->getValOperand());
31859 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31860 I->getOpcode() != Instruction::And ||
31861 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31862 AI->getParent() != I->getParent())
31863 return AtomicExpansionKind::CmpXChg;
31864
31865 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31866
31867 // This is a redundant AND, it should get cleaned up elsewhere.
31868 if (AI == I->getOperand(OtherIdx))
31869 return AtomicExpansionKind::CmpXChg;
31870
31871 // The following instruction must be an AND with a single bit.
31872 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31873 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31874 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31875 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31876 return AtomicExpansionKind::CmpXChg;
31877 }
31878 if (AI->getOperation() == AtomicRMWInst::And) {
31879 return ~C1->getValue() == C2->getValue()
31880 ? AtomicExpansionKind::BitTestIntrinsic
31881 : AtomicExpansionKind::CmpXChg;
31882 }
31883 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31884 : AtomicExpansionKind::CmpXChg;
31885 }
31886
31887 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31888
31889 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31890 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31891 return AtomicExpansionKind::CmpXChg;
31892
31893 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31894
31895 // If shift amounts are not the same we can't use BitTestIntrinsic.
31896 if (BitChange.first != BitTested.first)
31897 return AtomicExpansionKind::CmpXChg;
31898
31899 // For an atomic AND, the RMW value must mask off exactly one bit and the AND
31900 // must test that same bit, i.e. the one bit left unset in the mask.
31901 if (AI->getOperation() == AtomicRMWInst::And)
31902 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31903 ? AtomicExpansionKind::BitTestIntrinsic
31904 : AtomicExpansionKind::CmpXChg;
31905
31906 // For an atomic XOR/OR, the RMW and the AND must set and test the same bit.
31907 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31908 ? AtomicExpansionKind::BitTestIntrinsic
31909 : AtomicExpansionKind::CmpXChg;
31910}
31911
31912void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31913 IRBuilder<> Builder(AI);
31914 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31915 Intrinsic::ID IID_C;
31916 Intrinsic::ID IID_I;
31917 switch (AI->getOperation()) {
31918 default:
31919 llvm_unreachable("Unknown atomic operation");
31920 case AtomicRMWInst::Or:
31921 IID_C = Intrinsic::x86_atomic_bts;
31922 IID_I = Intrinsic::x86_atomic_bts_rm;
31923 break;
31924 case AtomicRMWInst::Xor:
31925 IID_C = Intrinsic::x86_atomic_btc;
31926 IID_I = Intrinsic::x86_atomic_btc_rm;
31927 break;
31928 case AtomicRMWInst::And:
31929 IID_C = Intrinsic::x86_atomic_btr;
31930 IID_I = Intrinsic::x86_atomic_btr_rm;
31931 break;
31932 }
31933 Instruction *I = AI->user_back();
31934 LLVMContext &Ctx = AI->getContext();
31935 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31936 PointerType::getUnqual(Ctx));
31937 Value *Result = nullptr;
31938 auto BitTested = FindSingleBitChange(AI->getValOperand());
31939 assert(BitTested.first != nullptr);
31940
31941 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31942 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31943
31944 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31945 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31946 {Addr, Builder.getInt8(Imm)});
31947 } else {
31948 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31949
31950 Value *SI = BitTested.first;
31951 assert(SI != nullptr);
31952
31953 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31954 // to mask it.
31955 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31956 Value *BitPos =
31957 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31958 // Todo(1): In many cases it may be provable that SI is less than
31959 // ShiftBits, in which case this mask is unnecessary.
31960 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31961 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31962 // favor of just a raw BT{S|R|C}.
31963
31964 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31965 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31966
31967 // If the result is only used for zero/non-zero status then we don't need to
31968 // shift the value back. Otherwise do so.
31969 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31970 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31971 if (ICmp->isEquality()) {
31972 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31973 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31974 if (C0 || C1) {
31975 assert(C0 == nullptr || C1 == nullptr);
31976 if ((C0 ? C0 : C1)->isZero())
31977 continue;
31978 }
31979 }
31980 }
31981 Result = Builder.CreateShl(Result, BitPos);
31982 break;
31983 }
31984 }
31985
31986 I->replaceAllUsesWith(Result);
31987 I->eraseFromParent();
31988 AI->eraseFromParent();
31989}
31990
31991static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31992 using namespace llvm::PatternMatch;
31993 if (!AI->hasOneUse())
31994 return false;
31995
31996 Value *Op = AI->getOperand(1);
31997 CmpPredicate Pred;
31998 Instruction *I = AI->user_back();
31999 AtomicRMWInst::BinOp Opc = AI->getOperation();
32000 if (Opc == AtomicRMWInst::Add) {
32001 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32002 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32003 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32004 if (match(I->user_back(),
32005 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32006 return true;
32007 if (match(I->user_back(),
32008 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32009 return true;
32010 }
32011 return false;
32012 }
32013 if (Opc == AtomicRMWInst::Sub) {
32014 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32015 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32016 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32017 if (match(I->user_back(),
32018 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32019 return true;
32020 if (match(I->user_back(),
32021 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32022 return true;
32023 }
32024 return false;
32025 }
32026 if ((Opc == AtomicRMWInst::Or &&
32027 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32028 (Opc == AtomicRMWInst::And &&
32029 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32030 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32031 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32032 Pred == CmpInst::ICMP_SLT;
32033 if (match(I->user_back(),
32034 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32035 return true;
32036 return false;
32037 }
32038 if (Opc == AtomicRMWInst::Xor) {
32039 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32040 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32041 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32042 if (match(I->user_back(),
32043 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32044 return true;
32045 if (match(I->user_back(),
32046 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32047 return true;
32048 }
32049 return false;
32050 }
32051
32052 return false;
32053}
32054
32055void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32056 AtomicRMWInst *AI) const {
32057 IRBuilder<> Builder(AI);
32058 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32059 Instruction *TempI = nullptr;
32060 LLVMContext &Ctx = AI->getContext();
32061 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32062 if (!ICI) {
32063 TempI = AI->user_back();
32064 assert(TempI->hasOneUse() && "Must have one use");
32065 ICI = cast<ICmpInst>(TempI->user_back());
32066 }
32067 X86::CondCode CC;
32068 ICmpInst::Predicate Pred = ICI->getPredicate();
32069 switch (Pred) {
32070 default:
32071 llvm_unreachable("Not supported Pred");
32072 case CmpInst::ICMP_EQ:
32073 CC = X86::COND_E;
32074 break;
32075 case CmpInst::ICMP_NE:
32076 CC = X86::COND_NE;
32077 break;
32078 case CmpInst::ICMP_SLT:
32079 CC = X86::COND_S;
32080 break;
32081 case CmpInst::ICMP_SGT:
32082 CC = X86::COND_NS;
32083 break;
32084 }
32085 Intrinsic::ID IID;
32086 switch (AI->getOperation()) {
32087 default:
32088 llvm_unreachable("Unknown atomic operation");
32089 case AtomicRMWInst::Add:
32090 IID = Intrinsic::x86_atomic_add_cc;
32091 break;
32092 case AtomicRMWInst::Sub:
32093 IID = Intrinsic::x86_atomic_sub_cc;
32094 break;
32095 case AtomicRMWInst::Or:
32096 IID = Intrinsic::x86_atomic_or_cc;
32097 break;
32098 case AtomicRMWInst::And:
32099 IID = Intrinsic::x86_atomic_and_cc;
32100 break;
32101 case AtomicRMWInst::Xor:
32102 IID = Intrinsic::x86_atomic_xor_cc;
32103 break;
32104 }
32105 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32106 PointerType::getUnqual(Ctx));
32107 Value *Call = Builder.CreateIntrinsic(
32108 IID, AI->getType(),
32109 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32110 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32111 ICI->replaceAllUsesWith(Result);
32112 ICI->eraseFromParent();
32113 if (TempI)
32114 TempI->eraseFromParent();
32115 AI->eraseFromParent();
32116}
32117
32118TargetLoweringBase::AtomicExpansionKind
32119X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32120 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32121 Type *MemType = AI->getType();
32122
32123 // If the operand is too big, we must see if cmpxchg8/16b is available
32124 // and default to library calls otherwise.
32125 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32126 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32127 : AtomicExpansionKind::None;
32128 }
32129
32130 AtomicRMWInst::BinOp Op = AI->getOperation();
32131 switch (Op) {
32132 case AtomicRMWInst::Xchg:
32133 return AtomicExpansionKind::None;
32134 case AtomicRMWInst::Add:
32135 case AtomicRMWInst::Sub:
32136 if (shouldExpandCmpArithRMWInIR(AI))
32137 return AtomicExpansionKind::CmpArithIntrinsic;
32138 // It's better to use xadd, xsub or xchg for these in other cases.
32139 return AtomicExpansionKind::None;
32140 case AtomicRMWInst::Or:
32141 case AtomicRMWInst::And:
32142 case AtomicRMWInst::Xor:
32143 if (shouldExpandCmpArithRMWInIR(AI))
32144 return AtomicExpansionKind::CmpArithIntrinsic;
32145 return shouldExpandLogicAtomicRMWInIR(AI);
32146 case AtomicRMWInst::Nand:
32147 case AtomicRMWInst::Max:
32148 case AtomicRMWInst::Min:
32159 default:
32160 // These always require a non-trivial set of data operations on x86. We must
32161 // use a cmpxchg loop.
32162 return AtomicExpansionKind::CmpXChg;
32163 }
32164}
32165
32166LoadInst *
32167X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32168 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32169 Type *MemType = AI->getType();
32170 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32171 // there is no benefit in turning such RMWs into loads, and it is actually
32172 // harmful as it introduces an mfence.
32173 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32174 return nullptr;
32175
32176 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32177 // lowering available in lowerAtomicArith.
32178 // TODO: push more cases through this path.
32179 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32180 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32181 AI->use_empty())
32182 return nullptr;
32183
32184 IRBuilder<> Builder(AI);
32185 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32186 auto SSID = AI->getSyncScopeID();
32187 // We must restrict the ordering to avoid generating loads with Release or
32188 // ReleaseAcquire orderings.
32190
32191 // Before the load we need a fence. Here is an example lifted from
32192 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32193 // is required:
32194 // Thread 0:
32195 // x.store(1, relaxed);
32196 // r1 = y.fetch_add(0, release);
32197 // Thread 1:
32198 // y.fetch_add(42, acquire);
32199 // r2 = x.load(relaxed);
32200 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32201 // lowered to just a load without a fence. A mfence flushes the store buffer,
32202 // making the optimization clearly correct.
32203 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32204 // otherwise, we might be able to be more aggressive on relaxed idempotent
32205 // rmw. In practice, they do not look useful, so we don't try to be
32206 // especially clever.
32207
32208 // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32209 // lowering for SSID == SyncScope::SingleThread and for targets where avoidMFence || !hasMFence.
32210 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32211
32212 // Finally we can emit the atomic load.
32213 LoadInst *Loaded = Builder.CreateAlignedLoad(
32214 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32215 Loaded->setAtomic(Order, SSID);
32216 AI->replaceAllUsesWith(Loaded);
32217 AI->eraseFromParent();
32218 return Loaded;
32219}
32220
32221/// Emit a locked operation on a stack location which does not change any
32222/// memory location, but does involve a lock prefix. Location is chosen to be
32223/// a) very likely accessed only by a single thread to minimize cache traffic,
32224/// and b) definitely dereferenceable. Returns the new Chain result.
32225static SDValue emitLockedStackOp(SelectionDAG &DAG,
32226 const X86Subtarget &Subtarget, SDValue Chain,
32227 const SDLoc &DL) {
32228 // Implementation notes:
32229 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32230 // operations issued by the current processor. As such, the location
32231 // referenced is not relevant for the ordering properties of the instruction.
32232 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32233 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32234 // 2) Using an immediate operand appears to be the best encoding choice
32235 // here since it doesn't require an extra register.
32236 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32237 // is small enough it might just be measurement noise.)
32238 // 4) When choosing offsets, there are several contributing factors:
32239 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32240 // line aligned stack object to improve this case.)
32241 // b) To minimize our chances of introducing a false dependence, we prefer
32242 // to offset the stack usage from TOS slightly.
32243 // c) To minimize concerns about cross thread stack usage - in particular,
32244 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32245 // captures state in the TOS frame and accesses it from many threads -
32246 // we want to use an offset such that the offset is in a distinct cache
32247 // line from the TOS frame.
32248 //
32249 // For a general discussion of the tradeoffs and benchmark results, see:
32250 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
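// The node built below is an idempotent, locked OR of an immediate zero into
// the chosen stack slot; on a 64-bit target with a red zone this corresponds
// roughly to emitting `lock orl $0, -64(%rsp)`.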
32251
32252 auto &MF = DAG.getMachineFunction();
32253 auto &TFL = *Subtarget.getFrameLowering();
32254 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32255
32256 if (Subtarget.is64Bit()) {
32257 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32258 SDValue Ops[] = {
32259 DAG.getRegister(X86::RSP, MVT::i64), // Base
32260 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32261 DAG.getRegister(0, MVT::i64), // Index
32262 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32263 DAG.getRegister(0, MVT::i16), // Segment.
32264 Zero,
32265 Chain};
32266 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32267 MVT::Other, Ops);
32268 return SDValue(Res, 1);
32269 }
32270
32271 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32272 SDValue Ops[] = {
32273 DAG.getRegister(X86::ESP, MVT::i32), // Base
32274 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32275 DAG.getRegister(0, MVT::i32), // Index
32276 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32277 DAG.getRegister(0, MVT::i16), // Segment.
32278 Zero,
32279 Chain
32280 };
32281 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32282 MVT::Other, Ops);
32283 return SDValue(Res, 1);
32284}
32285
32286static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32287 SelectionDAG &DAG) {
32288 SDLoc dl(Op);
32289 AtomicOrdering FenceOrdering =
32290 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32291 SyncScope::ID FenceSSID =
32292 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32293
32294 // The only fence that needs an instruction is a sequentially-consistent
32295 // cross-thread fence.
32296 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32297 FenceSSID == SyncScope::System) {
32298 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32299 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32300
32301 SDValue Chain = Op.getOperand(0);
32302 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32303 }
32304
32305 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32306 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32307}
32308
32309static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32310 SelectionDAG &DAG) {
32311 MVT T = Op.getSimpleValueType();
32312 SDLoc DL(Op);
32313 unsigned Reg = 0;
32314 unsigned size = 0;
32315 switch(T.SimpleTy) {
32316 default: llvm_unreachable("Invalid value type!");
32317 case MVT::i8: Reg = X86::AL; size = 1; break;
32318 case MVT::i16: Reg = X86::AX; size = 2; break;
32319 case MVT::i32: Reg = X86::EAX; size = 4; break;
32320 case MVT::i64:
32321 assert(Subtarget.is64Bit() && "Node not type legal!");
32322 Reg = X86::RAX; size = 8;
32323 break;
32324 }
32325 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32326 Op.getOperand(2), SDValue());
32327 SDValue Ops[] = { cpIn.getValue(0),
32328 Op.getOperand(1),
32329 Op.getOperand(3),
32330 DAG.getTargetConstant(size, DL, MVT::i8),
32331 cpIn.getValue(1) };
32332 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32333 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32334 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32335 Ops, T, MMO);
32336
32337 SDValue cpOut =
32338 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32339 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32340 MVT::i32, cpOut.getValue(2));
32341 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32342
32343 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32344 cpOut, Success, EFLAGS.getValue(1));
32345}
32346
32347// Create MOVMSKB, taking into account whether we need to split for AVX1.
32348static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32349 const X86Subtarget &Subtarget) {
32350 MVT InVT = V.getSimpleValueType();
32351
32352 if (InVT == MVT::v64i8) {
32353 SDValue Lo, Hi;
32354 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32355 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32356 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32357 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32358 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32359 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32360 DAG.getConstant(32, DL, MVT::i8));
32361 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32362 }
32363 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32364 SDValue Lo, Hi;
32365 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32366 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32367 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32368 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32369 DAG.getConstant(16, DL, MVT::i8));
32370 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32371 }
32372
32373 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32374}
32375
32376static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32377 SelectionDAG &DAG) {
32378 SDValue Src = Op.getOperand(0);
32379 MVT SrcVT = Src.getSimpleValueType();
32380 MVT DstVT = Op.getSimpleValueType();
32381
32382 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32383 // half to v32i1 and concatenating the result.
32384 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32385 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32386 assert(Subtarget.hasBWI() && "Expected BWI target");
32387 SDLoc dl(Op);
32388 SDValue Lo, Hi;
32389 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32390 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32391 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32392 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32393 }
32394
32395 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32396 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32397 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32398 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32399 SDLoc DL(Op);
32400 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32401 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32402 return DAG.getZExtOrTrunc(V, DL, DstVT);
32403 }
32404
32405 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32406 SrcVT == MVT::i64) && "Unexpected VT!");
32407
32408 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32409 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32410 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32411 // This conversion needs to be expanded.
32412 return SDValue();
32413
32414 SDLoc dl(Op);
32415 if (SrcVT.isVector()) {
32416 // Widen the input vector in the case of MVT::v2i32.
32417 // Example: from MVT::v2i32 to MVT::v4i32.
32418 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32419 SrcVT.getVectorNumElements() * 2);
32420 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32421 DAG.getUNDEF(SrcVT));
32422 } else {
32423 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32424 "Unexpected source type in LowerBITCAST");
32425 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32426 }
32427
32428 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32429 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32430
32431 if (DstVT == MVT::x86mmx)
32432 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32433
32434 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32435 DAG.getVectorIdxConstant(0, dl));
32436}
32437
32438/// Compute the horizontal sum of bytes in V for the elements of VT.
32439///
32440/// Requires V to be a byte vector and VT to be an integer vector type with
32441/// wider elements than V's type. The width of the elements of VT determines
32442/// how many bytes of V are summed horizontally to produce each element of the
32443/// result.
32444static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32445 const X86Subtarget &Subtarget,
32446 SelectionDAG &DAG) {
32447 SDLoc DL(V);
32448 MVT ByteVecVT = V.getSimpleValueType();
32449 MVT EltVT = VT.getVectorElementType();
32450 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32451 "Expected value to have byte element type.");
32452 assert(EltVT != MVT::i8 &&
32453 "Horizontal byte sum only makes sense for wider elements!");
32454 unsigned VecSize = VT.getSizeInBits();
32455 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32456
32457 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32458 // i64 chunks, thus it directly computes the pop count for v2i64 and v4i64.
32459 if (EltVT == MVT::i64) {
32460 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32461 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32462 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32463 return DAG.getBitcast(VT, V);
32464 }
32465
32466 if (EltVT == MVT::i32) {
32467 // We unpack the low half and high half into i32s interleaved with zeros so
32468 // that we can use PSADBW to horizontally sum them. The most useful part of
32469 // this is that it lines up the results of two PSADBW instructions to be
32470 // two v2i64 vectors which concatenated are the 4 population counts. We can
32471 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32472 SDValue Zeros = DAG.getConstant(0, DL, VT);
32473 SDValue V32 = DAG.getBitcast(VT, V);
32474 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32475 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32476
32477 // Do the horizontal sums into two v2i64s.
32478 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32479 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32480 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32481 DAG.getBitcast(ByteVecVT, Low), Zeros);
32482 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32483 DAG.getBitcast(ByteVecVT, High), Zeros);
32484
32485 // Merge them together.
32486 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32487 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32488 DAG.getBitcast(ShortVecVT, Low),
32489 DAG.getBitcast(ShortVecVT, High));
32490
32491 return DAG.getBitcast(VT, V);
32492 }
32493
32494 // The only element type left is i16.
32495 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32496
32497 // To obtain pop count for each i16 element starting from the pop count for
32498 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32499 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32500 // directly supported.
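// Worked example: if an i16 lane holds the two byte counts 0x0203 (3 and 2),
// the i16 shift-left gives 0x0300, the per-i8 add gives 0x0503, and the final
// i16 shift-right by 8 leaves 0x0005, the combined pop count of 5.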
32501 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32502 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32503 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32504 DAG.getBitcast(ByteVecVT, V));
32505 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32506}
32507
32508static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32509 const X86Subtarget &Subtarget,
32510 SelectionDAG &DAG) {
32511 MVT VT = Op.getSimpleValueType();
32512 MVT EltVT = VT.getVectorElementType();
32513 int NumElts = VT.getVectorNumElements();
32514 (void)EltVT;
32515 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32516
32517 // Implement a lookup table in register by using an algorithm based on:
32518 // http://wm.ite.pl/articles/sse-popcount.html
32519 //
32520 // The general idea is that every lower byte nibble in the input vector is an
32521 // index into an in-register pre-computed pop count table. We then split up the
32522 // input vector into two new ones: (1) a vector with only the shifted-right
32523 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32524 // masked out higher ones) for each byte. PSHUFB is used separately with both
32525 // to index the in-register table. Next, both are added and the result is an
32526 // i8 vector where each element contains the pop count for its input byte.
32527 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32528 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32529 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32530 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
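// Worked example: for the byte 0xB7 the lookups give LUT[0x7] = 3 for the low
// nibble and LUT[0xB] = 3 for the high nibble, summing to popcount(0xB7) = 6.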
32531
32532 SmallVector<SDValue, 64> LUTVec;
32533 for (int i = 0; i < NumElts; ++i)
32534 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32535 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32536 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32537
32538 // High nibbles
32539 SDValue FourV = DAG.getConstant(4, DL, VT);
32540 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32541
32542 // Low nibbles
32543 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32544
32545 // The input vector is used as the shuffle mask that indexes elements into the
32546 // LUT. After counting low and high nibbles, add the two vectors to obtain the
32547 // final pop count per i8 element.
32548 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32549 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32550 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32551}
32552
32553// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32554// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32555static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32556 const X86Subtarget &Subtarget,
32557 SelectionDAG &DAG) {
32558 MVT VT = Op.getSimpleValueType();
32559 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32560 "Unknown CTPOP type to handle");
32561 SDValue Op0 = Op.getOperand(0);
32562
32563 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32564 if (Subtarget.hasVPOPCNTDQ()) {
32565 unsigned NumElems = VT.getVectorNumElements();
32566 assert((VT.getVectorElementType() == MVT::i8 ||
32567 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32568 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32569 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32570 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32571 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32572 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32573 }
32574 }
32575
32576 // Decompose 256-bit ops into smaller 128-bit ops.
32577 if (VT.is256BitVector() && !Subtarget.hasInt256())
32578 return splitVectorIntUnary(Op, DAG, DL);
32579
32580 // Decompose 512-bit ops into smaller 256-bit ops.
32581 if (VT.is512BitVector() && !Subtarget.hasBWI())
32582 return splitVectorIntUnary(Op, DAG, DL);
32583
32584 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32585 if (VT.getScalarType() != MVT::i8) {
32586 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32587 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32588 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32589 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32590 }
32591
32592 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32593 if (!Subtarget.hasSSSE3())
32594 return SDValue();
32595
32596 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32597}
32598
32599static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32600 SelectionDAG &DAG) {
32601 MVT VT = N.getSimpleValueType();
32602 SDValue Op = N.getOperand(0);
32603 SDLoc DL(N);
32604
32605 if (VT.isScalarInteger()) {
32606 // Compute the lower/upper bounds of the active bits of the value,
32607 // allowing us to shift the active bits down if necessary to fit into the
32608 // special cases below.
32609 KnownBits Known = DAG.computeKnownBits(Op);
32610 if (Known.isConstant())
32611 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32612 unsigned LZ = Known.countMinLeadingZeros();
32613 unsigned TZ = Known.countMinTrailingZeros();
32614 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32615 unsigned ActiveBits = Known.getBitWidth() - LZ;
32616 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32617
32618 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
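// e.g. x = 0b11 gives 3 - 1 = 2 and x = 0b10 gives 2 - 1 = 1.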
32619 if (ShiftedActiveBits <= 2) {
32620 if (ActiveBits > 2)
32621 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32622 DAG.getShiftAmountConstant(TZ, VT, DL));
32623 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32624 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32625 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32626 DAG.getShiftAmountConstant(1, VT, DL)));
32627 return DAG.getZExtOrTrunc(Op, DL, VT);
32628 }
32629
32630 // i3 CTPOP - perform LUT into i32 integer.
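// The 16-bit constant below packs popcount(x) for x = 0..7 as 2-bit fields at
// bit position 2*x, so shifting it right by (x << 1) and masking with 0x3
// yields the count; e.g. x = 5 reads the field 0b10 = 2.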
32631 if (ShiftedActiveBits <= 3) {
32632 if (ActiveBits > 3)
32633 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32634 DAG.getShiftAmountConstant(TZ, VT, DL));
32635 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32636 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32637 DAG.getShiftAmountConstant(1, VT, DL));
32638 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32639 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32640 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32641 DAG.getConstant(0x3, DL, MVT::i32));
32642 return DAG.getZExtOrTrunc(Op, DL, VT);
32643 }
32644
32645 // i4 CTPOP - perform LUT into i64 integer.
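// Similarly, the 64-bit constant below packs popcount(x) for x = 0..15 as
// 4-bit nibbles at bit position 4*x; e.g. x = 0xB selects the nibble 3.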
32646 if (ShiftedActiveBits <= 4 &&
32647 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32648 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32649 if (ActiveBits > 4)
32650 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32651 DAG.getShiftAmountConstant(TZ, VT, DL));
32652 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32653 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32654 DAG.getConstant(4, DL, MVT::i32));
32655 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32656 DAG.getShiftAmountOperand(MVT::i64, Op));
32657 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32658 DAG.getConstant(0x7, DL, MVT::i64));
32659 return DAG.getZExtOrTrunc(Op, DL, VT);
32660 }
32661
32662 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
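// The multiply by 0x08040201 and shift by 3 below spread the 8 input bits so
// that, after masking with 0x11111111, each of the 8 nibbles holds one bit of
// x; the second multiply sums those nibbles into the top nibble, which the
// final shift by 28 extracts (the count is at most 8, so it fits).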
32663 if (ShiftedActiveBits <= 8) {
32664 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32665 if (ActiveBits > 8)
32666 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32667 DAG.getShiftAmountConstant(TZ, VT, DL));
32668 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32669 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32670 DAG.getConstant(0x08040201U, DL, MVT::i32));
32671 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32672 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32673 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32674 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32675 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32676 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32677 return DAG.getZExtOrTrunc(Op, DL, VT);
32678 }
32679
32680 return SDValue(); // fallback to generic expansion.
32681 }
32682
32683 assert(VT.isVector() &&
32684 "We only do custom lowering for vector population count.");
32685 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32686}
32687
32688static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32689 MVT VT = Op.getSimpleValueType();
32690 SDValue In = Op.getOperand(0);
32691 SDLoc DL(Op);
32692
32693 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32694 // perform the BITREVERSE.
32695 if (!VT.isVector()) {
32696 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32697 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32698 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32699 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32700 DAG.getVectorIdxConstant(0, DL));
32701 }
32702
32703 int NumElts = VT.getVectorNumElements();
32704 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32705
32706 // Decompose 256-bit ops into smaller 128-bit ops.
32707 if (VT.is256BitVector())
32708 return splitVectorIntUnary(Op, DAG, DL);
32709
32710 assert(VT.is128BitVector() &&
32711 "Only 128-bit vector bitreverse lowering supported.");
32712
32713 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32714 // perform the BSWAP in the shuffle.
32715 // It's best to shuffle using the second operand as this will implicitly allow
32716 // memory folding for multiple vectors.
32717 SmallVector<SDValue, 16> MaskElts;
32718 for (int i = 0; i != NumElts; ++i) {
32719 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32720 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32721 int PermuteByte = SourceByte | (2 << 5);
32722 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32723 }
32724 }
32725
32726 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32727 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32728 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32729 Res, Mask);
32730 return DAG.getBitcast(VT, Res);
32731}
32732
32733static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32734 SelectionDAG &DAG) {
32735 MVT VT = Op.getSimpleValueType();
32736
32737 if (Subtarget.hasXOP() && !VT.is512BitVector())
32738 return LowerBITREVERSE_XOP(Op, DAG);
32739
32740 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32741 "SSSE3 or GFNI required for BITREVERSE");
32742
32743 SDValue In = Op.getOperand(0);
32744 SDLoc DL(Op);
32745
32746 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32747 if (VT.is512BitVector() && !Subtarget.hasBWI())
32748 return splitVectorIntUnary(Op, DAG, DL);
32749
32750 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32751 if (VT.is256BitVector() && !Subtarget.hasInt256())
32752 return splitVectorIntUnary(Op, DAG, DL);
32753
32754 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32755 if (!VT.isVector()) {
32756 assert(
32757 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32758 "Only tested for i8/i16/i32/i64");
32759 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32760 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32761 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32762 DAG.getBitcast(MVT::v16i8, Res));
32763 Res =
32764 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32765 DAG.getVectorIdxConstant(0, DL));
32766 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32767 }
32768
32769 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32770
32771 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32772 if (VT.getScalarType() != MVT::i8) {
32773 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32774 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32775 Res = DAG.getBitcast(ByteVT, Res);
32776 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32777 return DAG.getBitcast(VT, Res);
32778 }
32779 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32780 "Only byte vector BITREVERSE supported");
32781
32782 unsigned NumElts = VT.getVectorNumElements();
32783
32784 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32785 if (Subtarget.hasGFNI()) {
32786 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32787 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32788 DAG.getTargetConstant(0, DL, MVT::i8));
32789 }
32790
32791 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32792 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32793 // 0-15 value (moved to the other nibble).
32794 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32795 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32796 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32797
32798 const int LoLUT[16] = {
32799 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32800 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32801 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32802 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32803 const int HiLUT[16] = {
32804 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32805 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32806 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32807 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
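// Worked example: the byte 0x1E splits into nibbles 0x1 (high) and 0xE (low);
// HiLUT[0x1] = 0x08 and LoLUT[0xE] = 0x70, and their OR 0x78 is
// bitreverse(0x1E).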
32808
32809 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32810 for (unsigned i = 0; i < NumElts; ++i) {
32811 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32812 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32813 }
32814
32815 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32816 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32817 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32818 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32819 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32820}
32821
32822static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32823 SelectionDAG &DAG) {
32824 SDLoc DL(Op);
32825 SDValue X = Op.getOperand(0);
32826 MVT VT = Op.getSimpleValueType();
32827
32828 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32829 if (VT == MVT::i8 ||
32831 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32832 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32833 DAG.getConstant(0, DL, MVT::i8));
32834 // Copy the inverse of the parity flag into a register with setcc.
32835 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32836 // Extend to the original type.
32837 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32838 }
32839
32840 // If we have POPCNT, use the default expansion.
32841 if (Subtarget.hasPOPCNT())
32842 return SDValue();
32843
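// Without POPCNT, fold the value down by XOR-ing halves: popcount(lo ^ hi)
// has the same parity as popcount(lo) + popcount(hi), so each fold
// (64 -> 32 -> 16 -> 8 bits) preserves the overall parity. The final
// flag-setting XOR below leaves that parity in EFLAGS.PF for the SETNP.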
32844 if (VT == MVT::i64) {
32845 // Xor the high and low 32-bit halves together using a 32-bit operation.
32846 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32847 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32848 DAG.getConstant(32, DL, MVT::i8)));
32849 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32850 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32851 }
32852
32853 if (VT != MVT::i16) {
32854 // Xor the high and low 16-bits together using a 32-bit operation.
32855 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32856 DAG.getConstant(16, DL, MVT::i8));
32857 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32858 } else {
32859 // If the input is 16-bits, we need to extend to use an i32 shift below.
32860 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32861 }
32862
32863 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
32864 // This should allow an h-reg to be used to save a shift.
32865 SDValue Hi = DAG.getNode(
32866 ISD::TRUNCATE, DL, MVT::i8,
32867 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32868 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32869 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32870 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32871
32872 // Copy the inverse of the parity flag into a register with setcc.
32873 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32874 // Extend to the original type.
32875 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32876}
32877
32879 const X86Subtarget &Subtarget) {
32880 unsigned NewOpc = 0;
32881 switch (N->getOpcode()) {
32882 case ISD::ATOMIC_LOAD_ADD:
32883 NewOpc = X86ISD::LADD;
32884 break;
32885 case ISD::ATOMIC_LOAD_SUB:
32886 NewOpc = X86ISD::LSUB;
32887 break;
32888 case ISD::ATOMIC_LOAD_OR:
32889 NewOpc = X86ISD::LOR;
32890 break;
32891 case ISD::ATOMIC_LOAD_XOR:
32892 NewOpc = X86ISD::LXOR;
32893 break;
32894 case ISD::ATOMIC_LOAD_AND:
32895 NewOpc = X86ISD::LAND;
32896 break;
32897 default:
32898 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32899 }
32900
32901 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32902
32903 return DAG.getMemIntrinsicNode(
32904 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32905 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32906 /*MemVT=*/N->getSimpleValueType(0), MMO);
32907}
32908
32909/// Lower atomic_load_ops into LOCK-prefixed operations.
32911 const X86Subtarget &Subtarget) {
32912 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32913 SDValue Chain = N->getOperand(0);
32914 SDValue LHS = N->getOperand(1);
32915 SDValue RHS = N->getOperand(2);
32916 unsigned Opc = N->getOpcode();
32917 MVT VT = N->getSimpleValueType(0);
32918 SDLoc DL(N);
32919
32920 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32921 // can only be lowered when the result is unused. They should have already
32922 // been transformed into a cmpxchg loop in AtomicExpand.
32923 if (N->hasAnyUseOfValue(0)) {
32924 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32925 // select LXADD if LOCK_SUB can't be selected.
32926 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32927 // can use LXADD as opposed to cmpxchg.
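// e.g. an "atomicrmw sub ptr %p, i32 5" whose result is used becomes
// "atomicrmw add ptr %p, i32 -5": LOCK XADD returns the old value, whereas a
// plain LOCK SUB does not.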
32928 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32929 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32930 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32931 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32932
32933 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32934 "Used AtomicRMW ops other than Add should have been expanded!");
32935 return N;
32936 }
32937
32938 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32939 // The core idea here is that since the memory location isn't actually
32940 // changing, all we need is a lowering for the *ordering* impacts of the
32941 // atomicrmw. As such, we can choose a different operation and memory
32942 // location to minimize impact on other code.
32943 // The above holds unless the node is marked volatile, in which
32944 // case it needs to be preserved according to the langref.
32945 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32946 // On X86, the only ordering which actually requires an instruction is
32947 // seq_cst that isn't SingleThread; everything else just needs to be
32948 // preserved during codegen and then dropped. Note that we expect (but don't
32949 // assume) that orderings other than seq_cst and acq_rel have been
32950 // canonicalized to a store or load.
32953 // Prefer a locked operation against a stack location to minimize cache
32954 // traffic. This assumes that stack locations are very likely to be
32955 // accessed only by the owning thread.
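// In practice this becomes roughly "lock orl $0, (%rsp)": an idempotent
// LOCK-prefixed RMW that acts as a full barrier for ordinary memory
// operations while leaving the original atomicrmw location untouched.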
32956 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32957 assert(!N->hasAnyUseOfValue(0));
32958 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32959 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32960 DAG.getUNDEF(VT), NewChain);
32961 }
32962 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32963 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32964 assert(!N->hasAnyUseOfValue(0));
32965 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32966 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32967 DAG.getUNDEF(VT), NewChain);
32968 }
32969
32970 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32971 // RAUW the chain, but don't worry about the result, as it's unused.
32972 assert(!N->hasAnyUseOfValue(0));
32973 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32974 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32975 DAG.getUNDEF(VT), LockOp.getValue(1));
32976}
32977
32979 const X86Subtarget &Subtarget) {
32980 auto *Node = cast<AtomicSDNode>(Op.getNode());
32981 SDLoc dl(Node);
32982 EVT VT = Node->getMemoryVT();
32983
32984 bool IsSeqCst =
32985 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32986 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32987
32988 // If this store is not sequentially consistent and the type is legal
32989 // we can just keep it.
32990 if (!IsSeqCst && IsTypeLegal)
32991 return Op;
32992
32993 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32995 Attribute::NoImplicitFloat)) {
32996 SDValue Chain;
32997 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32998 // vector store.
32999 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33000 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33001 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33002 Node->getMemOperand());
33003 }
33004
33005 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33006 // is enabled.
33007 if (VT == MVT::i64) {
33008 if (Subtarget.hasSSE1()) {
33009 SDValue SclToVec =
33010 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33011 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33012 SclToVec = DAG.getBitcast(StVT, SclToVec);
33013 SDVTList Tys = DAG.getVTList(MVT::Other);
33014 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33015 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33016 MVT::i64, Node->getMemOperand());
33017 } else if (Subtarget.hasX87()) {
33018 // First load this into an 80-bit X87 register using a stack temporary.
33019 // This will put the whole integer into the significand.
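// (The x87 80-bit format has a 64-bit significand, so an i64 round-trips
//  exactly through FILD/FIST, and each instruction performs a single
//  64-bit memory access.)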
33020 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33021 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33022 MachinePointerInfo MPI =
33024 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33026 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33027 SDValue LdOps[] = {Chain, StackPtr};
33029 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33030 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33031 Chain = Value.getValue(1);
33032
33033 // Now use an FIST to do the atomic store.
33034 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33035 Chain =
33036 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33037 StoreOps, MVT::i64, Node->getMemOperand());
33038 }
33039 }
33040
33041 if (Chain) {
33042 // If this is a sequentially consistent store, also emit an appropriate
33043 // barrier.
33044 if (IsSeqCst)
33045 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33046
33047 return Chain;
33048 }
33049 }
33050
33051 // Convert seq_cst store -> xchg
33052 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33053 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33054 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33055 Node->getOperand(0), Node->getOperand(2),
33056 Node->getOperand(1), Node->getMemOperand());
33057 return Swap.getValue(1);
33058}
33059
33061 SDNode *N = Op.getNode();
33062 MVT VT = N->getSimpleValueType(0);
33063 unsigned Opc = Op.getOpcode();
33064
33065 // Let legalize expand this if it isn't a legal type yet.
33066 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33067 return SDValue();
33068
33069 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33070 SDLoc DL(N);
33071
33072 // Set the carry flag.
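// Adding all-ones to the incoming carry value materializes it in EFLAGS.CF:
// a non-zero carry wraps around and sets CF, while a zero carry does not.
// The ADC/SBB below then consumes CF.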
33073 SDValue Carry = Op.getOperand(2);
33074 EVT CarryVT = Carry.getValueType();
33075 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33076 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33077
33078 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33079 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33080 Op.getOperand(0), Op.getOperand(1),
33081 Carry.getValue(1));
33082
33083 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33084 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33085 Sum.getValue(1), DL, DAG);
33086 if (N->getValueType(1) == MVT::i1)
33087 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33088
33089 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33090}
33091
33092static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33093 SelectionDAG &DAG) {
33094 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33095
33096 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33097 // which returns the values as { float, float } (in XMM0) or
33098 // { double, double } (which is returned in XMM0, XMM1).
33099 SDLoc dl(Op);
33100 SDValue Arg = Op.getOperand(0);
33101 EVT ArgVT = Arg.getValueType();
33102 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33103
33105 Args.emplace_back(Arg, ArgTy);
33106
33107 bool isF64 = ArgVT == MVT::f64;
33108 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33109 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33110 // the results are returned via SRet in memory.
33111 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33112 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33113 const char *LibcallName = TLI.getLibcallName(LC);
33114 SDValue Callee =
33115 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33116
33117 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33118 : (Type *)FixedVectorType::get(ArgTy, 4);
33119
33121 CLI.setDebugLoc(dl)
33122 .setChain(DAG.getEntryNode())
33123 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33124
33125 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33126
33127 if (isF64)
33128 // Returned in xmm0 and xmm1.
33129 return CallResult.first;
33130
33131 // Returned in bits [0:31] and [32:63] of xmm0.
33132 SDValue SinVal =
33133 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33134 DAG.getVectorIdxConstant(0, dl));
33135 SDValue CosVal =
33136 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33137 DAG.getVectorIdxConstant(1, dl));
33138 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33139 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33140}
33141
33142/// Widen a vector input to a vector of NVT. The
33143/// input vector must have the same element type as NVT.
33145 bool FillWithZeroes = false) {
33146 // Check if InOp already has the right width.
33147 MVT InVT = InOp.getSimpleValueType();
33148 if (InVT == NVT)
33149 return InOp;
33150
33151 if (InOp.isUndef())
33152 return DAG.getUNDEF(NVT);
33153
33155 "input and widen element type must match");
33156
33157 unsigned InNumElts = InVT.getVectorNumElements();
33158 unsigned WidenNumElts = NVT.getVectorNumElements();
33159 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33160 "Unexpected request for vector widening");
33161
33162 SDLoc dl(InOp);
33163 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33164 SDValue N1 = InOp.getOperand(1);
33165 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33166 N1.isUndef()) {
33167 InOp = InOp.getOperand(0);
33168 InVT = InOp.getSimpleValueType();
33169 InNumElts = InVT.getVectorNumElements();
33170 }
33171 }
33174 EVT EltVT = InOp.getOperand(0).getValueType();
33175 SDValue FillVal =
33176 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33178 Ops.append(WidenNumElts - InNumElts, FillVal);
33179 return DAG.getBuildVector(NVT, dl, Ops);
33180 }
33181 SDValue FillVal =
33182 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33183 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33184 DAG.getVectorIdxConstant(0, dl));
33185}
33186
33188 SelectionDAG &DAG) {
33189 assert(Subtarget.hasAVX512() &&
33190 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33191
33193 SDValue Src = N->getValue();
33194 MVT VT = Src.getSimpleValueType();
33195 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33196 SDLoc dl(Op);
33197
33198 SDValue Scale = N->getScale();
33199 SDValue Index = N->getIndex();
33200 SDValue Mask = N->getMask();
33201 SDValue Chain = N->getChain();
33202 SDValue BasePtr = N->getBasePtr();
33203
33204 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33205 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33206 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33207 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33208 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33209 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33210 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33211 SDVTList VTs = DAG.getVTList(MVT::Other);
33212 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33213 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33214 N->getMemoryVT(), N->getMemOperand());
33215 }
33216 return SDValue();
33217 }
33218
33219 MVT IndexVT = Index.getSimpleValueType();
33220
33221 // If the index is v2i32, we're being called by type legalization and we
33222 // should just let the default handling take care of it.
33223 if (IndexVT == MVT::v2i32)
33224 return SDValue();
33225
33226 // If we don't have VLX and neither the data nor the index is 512 bits, we
33227 // need to widen until one of them is.
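// e.g. scattering v4i32 data with a v4i64 index: Factor = min(512/128,
// 512/256) = 2, so the op is widened to v8i32 data, a 512-bit v8i64 index
// and a v8i1 mask, with the extra lanes masked off (zero mask bits).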
33228 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33229 !Index.getSimpleValueType().is512BitVector()) {
33230 // Determine how much we need to widen by to get a 512-bit type.
33231 unsigned Factor = std::min(512/VT.getSizeInBits(),
33232 512/IndexVT.getSizeInBits());
33233 unsigned NumElts = VT.getVectorNumElements() * Factor;
33234
33235 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33236 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33237 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33238
33239 Src = ExtendToType(Src, VT, DAG);
33240 Index = ExtendToType(Index, IndexVT, DAG);
33241 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33242 }
33243
33244 SDVTList VTs = DAG.getVTList(MVT::Other);
33245 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33246 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33247 N->getMemoryVT(), N->getMemOperand());
33248}
33249
33250static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33251 SelectionDAG &DAG) {
33252
33254 MVT VT = Op.getSimpleValueType();
33255 MVT ScalarVT = VT.getScalarType();
33256 SDValue Mask = N->getMask();
33257 MVT MaskVT = Mask.getSimpleValueType();
33258 SDValue PassThru = N->getPassThru();
33259 SDLoc dl(Op);
33260
33261 // Handle AVX masked loads which don't support passthru other than 0.
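// AVX VMASKMOV zeroes the disabled lanes, so a non-zero passthru has to be
// emulated: perform the load with a zero passthru and then VSELECT-blend
// the requested passthru back into the disabled lanes.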
33262 if (MaskVT.getVectorElementType() != MVT::i1) {
33263 // We also allow undef in the isel pattern.
33264 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33265 return Op;
33266
33267 SDValue NewLoad = DAG.getMaskedLoad(
33268 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33269 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33270 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33271 N->isExpandingLoad());
33272 // Emit a blend.
33273 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33274 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33275 }
33276
33277 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33278 "Expanding masked load is supported on AVX-512 target only!");
33279
33280 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33281 "Expanding masked load is supported for 32 and 64-bit types only!");
33282
33283 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33284 "Cannot lower masked load op.");
33285
33286 assert((ScalarVT.getSizeInBits() >= 32 ||
33287 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33288 ScalarVT == MVT::f16))) &&
33289 "Unsupported masked load op.");
33290
33291 // This operation is legal for targets with VLX, but without
33292 // VLX the vector should be widened to 512 bits.
33293 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33294 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33295 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33296
33297 // Mask element has to be i1.
33298 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33299 "Unexpected mask type");
33300
33301 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33302
33303 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33304 SDValue NewLoad = DAG.getMaskedLoad(
33305 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33306 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33307 N->getExtensionType(), N->isExpandingLoad());
33308
33309 SDValue Extract =
33310 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33311 DAG.getVectorIdxConstant(0, dl));
33312 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33313 return DAG.getMergeValues(RetOps, dl);
33314}
33315
33316static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33317 SelectionDAG &DAG) {
33319 SDValue DataToStore = N->getValue();
33320 MVT VT = DataToStore.getSimpleValueType();
33321 MVT ScalarVT = VT.getScalarType();
33322 SDValue Mask = N->getMask();
33323 SDLoc dl(Op);
33324
33325 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33326 "Expanding masked load is supported on AVX-512 target only!");
33327
33328 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33329 "Expanding masked load is supported for 32 and 64-bit types only!");
33330
33331 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33332 "Cannot lower masked store op.");
33333
33334 assert((ScalarVT.getSizeInBits() >= 32 ||
33335 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33336 ScalarVT == MVT::f16))) &&
33337 "Unsupported masked store op.");
33338
33339 // This operation is legal for targets with VLX, but without
33340 // VLX the vector should be widened to 512 bits.
33341 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33342 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33343
33344 // Mask element has to be i1.
33345 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33346 "Unexpected mask type");
33347
33348 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33349
33350 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33351 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33352 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33353 N->getOffset(), Mask, N->getMemoryVT(),
33354 N->getMemOperand(), N->getAddressingMode(),
33355 N->isTruncatingStore(), N->isCompressingStore());
33356}
33357
33358static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33359 SelectionDAG &DAG) {
33360 assert(Subtarget.hasAVX2() &&
33361 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33362
33364 SDLoc dl(Op);
33365 MVT VT = Op.getSimpleValueType();
33366 SDValue Index = N->getIndex();
33367 SDValue Mask = N->getMask();
33368 SDValue PassThru = N->getPassThru();
33369 MVT IndexVT = Index.getSimpleValueType();
33370
33371 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33372
33373 // If the index is v2i32, we're being called by type legalization.
33374 if (IndexVT == MVT::v2i32)
33375 return SDValue();
33376
33377 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33378 // we need to widen until one of them is.
33379 MVT OrigVT = VT;
33380 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33381 !IndexVT.is512BitVector()) {
33382 // Determine how much we need to widen by to get a 512-bit type.
33383 unsigned Factor = std::min(512/VT.getSizeInBits(),
33384 512/IndexVT.getSizeInBits());
33385
33386 unsigned NumElts = VT.getVectorNumElements() * Factor;
33387
33388 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33389 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33390 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33391
33392 PassThru = ExtendToType(PassThru, VT, DAG);
33393 Index = ExtendToType(Index, IndexVT, DAG);
33394 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33395 }
33396
33397 // Break dependency on the data register.
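// Gathers merge into the destination register for disabled lanes, so an
// undef passthru would create a false dependency on whatever previously
// occupied that register; a zero passthru breaks the dependency.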
33398 if (PassThru.isUndef())
33399 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33400
33401 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33402 N->getScale() };
33403 SDValue NewGather = DAG.getMemIntrinsicNode(
33404 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33405 N->getMemOperand());
33406 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33407 DAG.getVectorIdxConstant(0, dl));
33408 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33409}
33410
33412 SDLoc dl(Op);
33413 SDValue Src = Op.getOperand(0);
33414 MVT DstVT = Op.getSimpleValueType();
33415
33417 unsigned SrcAS = N->getSrcAddressSpace();
33418
33419 assert(SrcAS != N->getDestAddressSpace() &&
33420 "addrspacecast must be between different address spaces");
33421
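// A 32-bit unsigned pointer (ptr32_uptr) zero-extends when cast to a 64-bit
// pointer; other 32-bit pointers are treated as signed and sign-extend, and
// casts down to a 32-bit pointer simply truncate.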
33422 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33423 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33424 } else if (DstVT == MVT::i64) {
33425 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33426 } else if (DstVT == MVT::i32) {
33427 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33428 } else {
33429 report_fatal_error("Bad address space in addrspacecast");
33430 }
33431 return Op;
33432}
33433
33434SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33435 SelectionDAG &DAG) const {
33436 // TODO: Eventually, the lowering of these nodes should be informed by or
33437 // deferred to the GC strategy for the function in which they appear. For
33438 // now, however, they must be lowered to something. Since they are logically
33439 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33440 // require special handling for these nodes), lower them as literal NOOPs for
33441 // the time being.
33443 Ops.push_back(Op.getOperand(0));
33444 if (Op->getGluedNode())
33445 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33446
33447 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33448 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33449}
33450
33451// Custom split CVTPS2PH with wide types.
33453 SDLoc dl(Op);
33454 EVT VT = Op.getValueType();
33455 SDValue Lo, Hi;
33456 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33457 EVT LoVT, HiVT;
33458 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33459 SDValue RC = Op.getOperand(1);
33460 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33461 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33462 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33463}
33464
33466 SelectionDAG &DAG) {
33467 unsigned IsData = Op.getConstantOperandVal(4);
33468
33469 // We don't support non-data prefetch without PREFETCHI.
33470 // Just preserve the chain.
33471 if (!IsData && !Subtarget.hasPREFETCHI())
33472 return Op.getOperand(0);
33473
33474 return Op;
33475}
33476
33478 SDNode *N = Op.getNode();
33479 SDValue Operand = N->getOperand(0);
33480 EVT VT = Operand.getValueType();
33481 SDLoc dl(N);
33482
33483 SDValue One = DAG.getConstantFP(1.0, dl, VT);
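// Multiplying by 1.0 performs FP canonicalization: it quiets signaling NaNs
// and canonicalizes the value according to the current FP environment (e.g.
// denormal handling). The STRICT form keeps the DAG combiner from folding
// the multiply away as a no-op.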
33484
33485 // TODO: Fix crash for bf16 when generating strict_fmul as it
33486 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33487 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33488 // promote this operator's result!
33489 SDValue Chain = DAG.getEntryNode();
33490 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33491 {Chain, Operand, One});
33492 return StrictFmul;
33493}
33494
33496 unsigned OpNo) {
33497 const APInt Operand(32, OpNo);
33498 std::string OpNoStr = llvm::toString(Operand, 10, false);
33499 std::string Str(" $");
33500
33501 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33502 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33503
33504 auto I = StringRef::npos;
33505 for (auto &AsmStr : AsmStrs) {
33506 // Match the OpNo string. We should match exactly so that we don't match a
33507 // sub-string, e.g. "$12" contains "$1".
33508 if (AsmStr.ends_with(OpNoStr1))
33509 I = AsmStr.size() - OpNoStr1.size();
33510
33511 // Get the index of operand in AsmStr.
33512 if (I == StringRef::npos)
33513 I = AsmStr.find(OpNoStr1 + ",");
33514 if (I == StringRef::npos)
33515 I = AsmStr.find(OpNoStr2);
33516
33517 if (I == StringRef::npos)
33518 continue;
33519
33520 assert(I > 0 && "Unexpected inline asm string!");
33521 // Remove the operand string and label (if they exist).
33522 // For example:
33523 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33524 // ==>
33525 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33526 // ==>
33527 // "call dword ptr "
33528 auto TmpStr = AsmStr.substr(0, I);
33529 I = TmpStr.rfind(':');
33530 if (I != StringRef::npos)
33531 TmpStr = TmpStr.substr(I + 1);
33532 return TmpStr.take_while(llvm::isAlpha);
33533 }
33534
33535 return StringRef();
33536}
33537
33539 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33540 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33541 // changed from indirect TargetLowering::C_Memory to direct
33542 // TargetLowering::C_Address.
33543 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33544 // location.
33545 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33546 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33547}
33548
33550 SDValue Mask) {
33551 EVT Ty = MVT::i8;
33552 auto V = DAG.getBitcast(MVT::i1, Mask);
33553 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33554 auto Zero = DAG.getConstant(0, DL, Ty);
33555 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33556 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33557 return SDValue(CmpZero.getNode(), 1);
33558}
33559
33561 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33562 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33563 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33564 // ->
33565 // _, flags = SUB 0, mask
33566 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33567 // bit_cast_to_vector<res>
33568 EVT VTy = PassThru.getValueType();
33569 EVT Ty = VTy.getVectorElementType();
33570 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33571 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33572 : DAG.getBitcast(Ty, PassThru);
33573 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33574 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33575 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33576 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33577 return DAG.getBitcast(VTy, NewLoad);
33578}
33579
33581 SDValue Chain,
33583 SDValue Val, SDValue Mask) const {
33584 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33585 // ->
33586 // _, flags = SUB 0, mask
33587 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33589 SDVTList Tys = DAG.getVTList(MVT::Other);
33590 auto ScalarVal = DAG.getBitcast(Ty, Val);
33591 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33592 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33593 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33594 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33595}
33596
33597/// Provide custom lowering hooks for some operations.
33599 switch (Op.getOpcode()) {
33600 // clang-format off
33601 default: llvm_unreachable("Should not custom lower this!");
33602 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33603 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33604 return LowerCMP_SWAP(Op, Subtarget, DAG);
33605 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33606 case ISD::ATOMIC_LOAD_ADD:
33607 case ISD::ATOMIC_LOAD_SUB:
33608 case ISD::ATOMIC_LOAD_OR:
33609 case ISD::ATOMIC_LOAD_XOR:
33610 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33611 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33612 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33613 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33614 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33615 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33616 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33617 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33618 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33619 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33620 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33621 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33622 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33623 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33624 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33625 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33626 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33627 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33628 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33629 case ISD::SHL_PARTS:
33630 case ISD::SRA_PARTS:
33631 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33632 case ISD::FSHL:
33633 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33634 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33636 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33638 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33639 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33640 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33641 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33642 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33645 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33646 case ISD::FP_TO_SINT:
33648 case ISD::FP_TO_UINT:
33649 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33651 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33652 case ISD::FP_EXTEND:
33653 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33654 case ISD::FP_ROUND:
33655 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33656 case ISD::FP16_TO_FP:
33657 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33658 case ISD::FP_TO_FP16:
33659 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33660 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33661 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33662 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33663 case ISD::FADD:
33664 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33665 case ISD::FROUND: return LowerFROUND(Op, DAG);
33666 case ISD::FABS:
33667 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33668 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33669 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33670 case ISD::LRINT:
33671 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33672 case ISD::SETCC:
33673 case ISD::STRICT_FSETCC:
33674 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33675 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33676 case ISD::SELECT: return LowerSELECT(Op, DAG);
33677 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33678 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33679 case ISD::VASTART: return LowerVASTART(Op, DAG);
33680 case ISD::VAARG: return LowerVAARG(Op, DAG);
33681 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33682 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33684 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33685 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33686 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33687 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33689 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33690 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33691 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33692 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33693 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33695 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33696 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33697 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33698 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33699 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33700 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33701 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33702 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33703 case ISD::CTLZ:
33704 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33705 case ISD::CTTZ:
33706 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33707 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33708 case ISD::MULHS:
33709 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33710 case ISD::ROTL:
33711 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33712 case ISD::SRA:
33713 case ISD::SRL:
33714 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33715 case ISD::SADDO:
33716 case ISD::UADDO:
33717 case ISD::SSUBO:
33718 case ISD::USUBO: return LowerXALUO(Op, DAG);
33719 case ISD::SMULO:
33720 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33721 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33722 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33723 case ISD::SADDO_CARRY:
33724 case ISD::SSUBO_CARRY:
33725 case ISD::UADDO_CARRY:
33726 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33727 case ISD::ADD:
33728 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33729 case ISD::UADDSAT:
33730 case ISD::SADDSAT:
33731 case ISD::USUBSAT:
33732 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33733 case ISD::SMAX:
33734 case ISD::SMIN:
33735 case ISD::UMAX:
33736 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33737 case ISD::FMINIMUM:
33738 case ISD::FMAXIMUM:
33739 case ISD::FMINIMUMNUM:
33740 case ISD::FMAXIMUMNUM:
33741 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33742 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33743 case ISD::ABDS:
33744 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33745 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33746 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33747 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33748 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33749 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33750 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33751 case ISD::GC_TRANSITION_START:
33752 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33753 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33754 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33755 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33756 // clang-format on
33757 }
33758}
33759
33760/// Replace a node with an illegal result type with a new node built out of
33761/// custom code.
33764 SelectionDAG &DAG) const {
33765 SDLoc dl(N);
33766 unsigned Opc = N->getOpcode();
33767 switch (Opc) {
33768 default:
33769#ifndef NDEBUG
33770 dbgs() << "ReplaceNodeResults: ";
33771 N->dump(&DAG);
33772#endif
33773 llvm_unreachable("Do not know how to custom type legalize this operation!");
33774 case X86ISD::CVTPH2PS: {
33775 EVT VT = N->getValueType(0);
33776 SDValue Lo, Hi;
33777 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33778 EVT LoVT, HiVT;
33779 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33780 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33781 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33782 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33783 Results.push_back(Res);
33784 return;
33785 }
33787 EVT VT = N->getValueType(0);
33788 SDValue Lo, Hi;
33789 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33790 EVT LoVT, HiVT;
33791 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33792 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33793 {N->getOperand(0), Lo});
33794 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33795 {N->getOperand(0), Hi});
33796 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33797 Lo.getValue(1), Hi.getValue(1));
33798 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33799 Results.push_back(Res);
33800 Results.push_back(Chain);
33801 return;
33802 }
33803 case X86ISD::CVTPS2PH:
33804 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33805 return;
33806 case ISD::CTPOP: {
33807 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33808 // If we have at most 32 active bits, then perform as i32 CTPOP.
33809 // TODO: Perform this in generic legalizer?
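// e.g. a value known to be of the form 0x000000XXXXXX0000 has LZ >= 24 and
// TZ >= 16 (>= 32 combined); shifting right by TZ moves every possibly-set
// bit into the low 32 bits, so an i32 CTPOP of the truncated value gives the
// same count.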
33810 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33811 unsigned LZ = Known.countMinLeadingZeros();
33812 unsigned TZ = Known.countMinTrailingZeros();
33813 if ((LZ + TZ) >= 32) {
33814 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33815 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33816 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33817 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33818 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33819 Results.push_back(Op);
33820 return;
33821 }
33822 // Use a v2i64 if possible.
33823 bool NoImplicitFloatOps =
33825 Attribute::NoImplicitFloat);
33826 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33827 SDValue Wide =
33828 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33829 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33830 // Bit count should fit in 32-bits, extract it as that and then zero
33831 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33832 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33833 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33834 DAG.getVectorIdxConstant(0, dl));
33835 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33836 Results.push_back(Wide);
33837 }
33838 return;
33839 }
33840 case ISD::MUL: {
33841 EVT VT = N->getValueType(0);
33843 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33844 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33845 // elements are needed.
33846 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33847 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33848 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33849 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33850 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33851 unsigned NumConcats = 16 / VT.getVectorNumElements();
33852 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33853 ConcatOps[0] = Res;
33854 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33855 Results.push_back(Res);
33856 return;
33857 }
33858 case ISD::SMULO:
33859 case ISD::UMULO: {
33860 EVT VT = N->getValueType(0);
33862 VT == MVT::v2i32 && "Unexpected VT!");
33863 bool IsSigned = Opc == ISD::SMULO;
33864 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33865 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33866 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33867 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33868 // Extract the high 32 bits from each result using PSHUFD.
33869 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
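// On little-endian x86 the v4i32 bitcast of the v2i64 products puts the high
// 32 bits of each product in lanes {1, 3}, so the {1, 3, -1, -1} shuffle
// (a single PSHUFD) gathers exactly the high halves.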
33870 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33871 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33872 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33873 DAG.getVectorIdxConstant(0, dl));
33874
33875 // Truncate the low bits of the result. This will become PSHUFD.
33876 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33877
33878 SDValue HiCmp;
33879 if (IsSigned) {
33880 // SMULO overflows if the high bits don't match the sign of the low.
33881 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33882 } else {
33883 // UMULO overflows if the high bits are non-zero.
33884 HiCmp = DAG.getConstant(0, dl, VT);
33885 }
33886 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33887
33888 // Widen the result by padding with undef.
33889 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33890 DAG.getUNDEF(VT));
33891 Results.push_back(Res);
33892 Results.push_back(Ovf);
33893 return;
33894 }
33895 case X86ISD::VPMADDWD: {
33896 // Legalize types for X86ISD::VPMADDWD by widening.
33897 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33898
33899 EVT VT = N->getValueType(0);
33900 EVT InVT = N->getOperand(0).getValueType();
33901 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33902 "Expected a VT that divides into 128 bits.");
33904 "Unexpected type action!");
33905 unsigned NumConcat = 128 / InVT.getSizeInBits();
33906
33907 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33908 InVT.getVectorElementType(),
33909 NumConcat * InVT.getVectorNumElements());
33910 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33912 NumConcat * VT.getVectorNumElements());
33913
33914 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33915 Ops[0] = N->getOperand(0);
33916 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33917 Ops[0] = N->getOperand(1);
33918 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33919
33920 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33921 Results.push_back(Res);
33922 return;
33923 }
33924 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33925 case X86ISD::FMINC:
33926 case X86ISD::FMIN:
33927 case X86ISD::FMAXC:
33928 case X86ISD::FMAX:
33930 case X86ISD::STRICT_FMAX: {
33931 EVT VT = N->getValueType(0);
33932 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33933 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33934 SDValue UNDEF = DAG.getUNDEF(VT);
33935 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33936 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33937 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33938 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33939 SDValue Res;
33940 if (IsStrict)
33941 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33942 {N->getOperand(0), LHS, RHS});
33943 else
33944 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33945 Results.push_back(Res);
33946 if (IsStrict)
33947 Results.push_back(Res.getValue(1));
33948 return;
33949 }
33950 case ISD::SDIV:
33951 case ISD::UDIV:
33952 case ISD::SREM:
33953 case ISD::UREM: {
33954 EVT VT = N->getValueType(0);
33955 if (VT.isVector()) {
33957 "Unexpected type action!");
33958 // If the RHS is a constant splat vector, we can widen this and let the
33959 // divide/remainder-by-constant optimization handle it.
33960 // TODO: Can we do something for non-splat?
33961 APInt SplatVal;
33962 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33963 unsigned NumConcats = 128 / VT.getSizeInBits();
33964 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33965 Ops0[0] = N->getOperand(0);
33966 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33967 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33968 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33969 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33970 Results.push_back(Res);
33971 }
33972 return;
33973 }
33974
33975 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33976 Results.push_back(V);
33977 return;
33978 }
33979 case ISD::TRUNCATE: {
33980 MVT VT = N->getSimpleValueType(0);
33981 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33982 return;
33983
33984 // The generic legalizer will try to widen the input type to the same
33985 // number of elements as the widened result type. But this isn't always
33986 // the best thing, so do some custom legalization to avoid some cases.
33987 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33988 SDValue In = N->getOperand(0);
33989 EVT InVT = In.getValueType();
33990 EVT InEltVT = InVT.getVectorElementType();
33991 EVT EltVT = VT.getVectorElementType();
33992 unsigned MinElts = VT.getVectorNumElements();
33993 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33994 unsigned InBits = InVT.getSizeInBits();
33995
33996 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33997 unsigned PackOpcode;
33998 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33999 Subtarget, N->getFlags())) {
34000 if (SDValue Res =
34001 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34002 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34003 Results.push_back(Res);
34004 return;
34005 }
34006 }
34007
34008 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34009 // 128-bit and smaller inputs should avoid the truncate altogether and
34010 // use a shuffle.
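// e.g. truncating v2i64 -> v2i16 (widened to v8i16): Scale = 4, so the mask
// selects elements {0, 4} of the bitcast input, i.e. the low 16 bits of each
// 64-bit element on little-endian x86.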
34011 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34012 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34013 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34014 for (unsigned I = 0; I < MinElts; ++I)
34015 TruncMask[I] = Scale * I;
34016 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34017 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34018 "Illegal vector type in truncation");
34019 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34020 Results.push_back(
34021 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34022 return;
34023 }
34024 }
34025
34026 // With AVX512 there are some cases that can use a target specific
34027 // truncate node to go from 256/512 to less than 128 with zeros in the
34028 // upper elements of the 128 bit result.
34029 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34030 // We can use VTRUNC directly for 256 bits with VLX or for any 512.
34031 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34032 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34033 return;
34034 }
34035 // There's one case we can widen to 512 bits and use VTRUNC.
34036 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34037 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34038 DAG.getUNDEF(MVT::v4i64));
34039 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34040 return;
34041 }
34042 }
34043 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34044 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34045 isTypeLegal(MVT::v4i64)) {
34046 // Input needs to be split and output needs to be widened. Let's use two
34047 // VTRUNCs, and shuffle their results together into the wider type.
34048 SDValue Lo, Hi;
34049 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34050
34051 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34052 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34053 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34054 { 0, 1, 2, 3, 16, 17, 18, 19,
34055 -1, -1, -1, -1, -1, -1, -1, -1 });
34056 Results.push_back(Res);
34057 return;
34058 }
34059
34060 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34061 // this via type legalization.
34062 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34063 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34064 (!Subtarget.hasSSSE3() ||
34065 (!isTypeLegal(InVT) &&
34066 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34067 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34068 InEltVT.getSizeInBits() * WidenNumElts);
34069 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34070 return;
34071 }
34072
34073 return;
34074 }
34075 case ISD::ANY_EXTEND:
34076 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34077 // It's intended to custom handle the input type.
34078 assert(N->getValueType(0) == MVT::v8i8 &&
34079 "Do not know how to legalize this Node");
34080 return;
34081 case ISD::SIGN_EXTEND:
34082 case ISD::ZERO_EXTEND: {
34083 EVT VT = N->getValueType(0);
34084 SDValue In = N->getOperand(0);
34085 EVT InVT = In.getValueType();
34086 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34087 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34089 "Unexpected type action!");
34090 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34091 // Custom split this so we can extend i8/i16->i32 in-vector. This is better
34092 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
34093 // followed by an extend from i32 to i64 using pcmpgt. By custom splitting,
34094 // we allow the sra from the extend to i32 to be shared by both halves.
34095 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34096
34097 // Fill a vector with sign bits for each element.
34098 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34099 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34100
34101 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34102 // to v2i64.
34103 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34104 {0, 4, 1, 5});
34105 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34106 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34107 {2, 6, 3, 7});
34108 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34109
34110 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34111 Results.push_back(Res);
34112 return;
34113 }
34114
34115 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34116 if (!InVT.is128BitVector()) {
34117 // Not a 128 bit vector, but maybe type legalization will promote
34118 // it to 128 bits.
34119 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34120 return;
34121 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34122 if (!InVT.is128BitVector())
34123 return;
34124
34125 // Promote the input to 128 bits. Type legalization will turn this into
34126 // zext_inreg/sext_inreg.
34127 In = DAG.getNode(Opc, dl, InVT, In);
34128 }
34129
34130 // Perform custom splitting instead of the two stage extend we would get
34131 // by default.
34132 EVT LoVT, HiVT;
34133 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34134 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34135
34136 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34137
34138 // We need to shift the input over by half the number of elements.
34139 unsigned NumElts = InVT.getVectorNumElements();
34140 unsigned HalfNumElts = NumElts / 2;
34141 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34142 for (unsigned i = 0; i != HalfNumElts; ++i)
34143 ShufMask[i] = i + HalfNumElts;
34144
34145 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34146 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34147
34148 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34149 Results.push_back(Res);
34150 }
34151 return;
34152 }
34154 case ISD::FP_TO_UINT_SAT: {
34155 if (!Subtarget.hasAVX10_2())
34156 return;
34157
34158 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34159 EVT VT = N->getValueType(0);
34160 SDValue Op = N->getOperand(0);
34161 EVT OpVT = Op.getValueType();
34162 SDValue Res;
34163
34164 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34165 if (IsSigned)
34166 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34167 else
34168 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34169 Results.push_back(Res);
34170 }
34171 return;
34172 }
34173 case ISD::FP_TO_SINT:
34175 case ISD::FP_TO_UINT:
34177 bool IsStrict = N->isStrictFPOpcode();
34178 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34179 EVT VT = N->getValueType(0);
34180 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34181 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34182 EVT SrcVT = Src.getValueType();
34183
34184 SDValue Res;
34185 if (isSoftF16(SrcVT, Subtarget)) {
34186 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34187 if (IsStrict) {
34188 Res =
34189 DAG.getNode(Opc, dl, {VT, MVT::Other},
34190 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34191 {NVT, MVT::Other}, {Chain, Src})});
34192 Chain = Res.getValue(1);
34193 } else {
34194 Res =
34195 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34196 }
34197 Results.push_back(Res);
34198 if (IsStrict)
34199 Results.push_back(Chain);
34200
34201 return;
34202 }
34203
34204 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34205 SrcVT.getVectorElementType() == MVT::f16) {
34206 EVT EleVT = VT.getVectorElementType();
34207 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34208
34209 if (SrcVT != MVT::v8f16) {
34210 SDValue Tmp =
34211 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34212 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34213 Ops[0] = Src;
34214 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34215 }
34216
34217 if (IsStrict) {
34219 Res =
34220 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34221 Chain = Res.getValue(1);
34222 } else {
34223 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34224 Res = DAG.getNode(Opc, dl, ResVT, Src);
34225 }
34226
34227 // TODO: Need to add exception check code for strict FP.
34228 if (EleVT.getSizeInBits() < 16) {
34229 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34230 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34231
34232 // Now widen to 128 bits.
34233 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34234 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34235 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34236 ConcatOps[0] = Res;
34237 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34238 }
34239
34240 Results.push_back(Res);
34241 if (IsStrict)
34242 Results.push_back(Chain);
34243
34244 return;
34245 }
34246
34247 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34249 "Unexpected type action!");
34250
34251 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34252 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34253 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34254 VT.getVectorNumElements());
34255 SDValue Res;
34256 SDValue Chain;
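// A signed conversion suffices even for FP_TO_UINT here: every in-range
// value of the narrow unsigned result type also fits in the wider signed
// promoted element, and the AssertZext emitted below records the
// known-zero upper bits.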
34257 if (IsStrict) {
34258 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34259 {N->getOperand(0), Src});
34260 Chain = Res.getValue(1);
34261 } else
34262 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34263
34264 // Preserve what we know about the size of the original result. If the
34265 // result is v2i32, we have to manually widen the assert.
34266 if (PromoteVT == MVT::v2i32)
34267 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34268 DAG.getUNDEF(MVT::v2i32));
34269
34270 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34271 Res.getValueType(), Res,
34272 DAG.getValueType(VT.getVectorElementType()));
34273
34274 if (PromoteVT == MVT::v2i32)
34275 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34276 DAG.getVectorIdxConstant(0, dl));
34277
34278 // Truncate back to the original width.
34279 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34280
34281 // Now widen to 128 bits.
34282 unsigned NumConcats = 128 / VT.getSizeInBits();
34283 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34284 VT.getVectorNumElements() * NumConcats);
34285 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34286 ConcatOps[0] = Res;
34287 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34288 Results.push_back(Res);
34289 if (IsStrict)
34290 Results.push_back(Chain);
34291 return;
34292 }
34293
34294
34295 if (VT == MVT::v2i32) {
34296 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34297 "Strict unsigned conversion requires AVX512");
34298 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34300 "Unexpected type action!");
34301 if (Src.getValueType() == MVT::v2f64) {
34302 if (!IsSigned && !Subtarget.hasAVX512()) {
34303 SDValue Res =
34304 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34305 Results.push_back(Res);
34306 return;
34307 }
34308
34309 if (IsStrict)
34310 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34311 else
34312 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34313
34314 // If we have VLX we can emit a target specific FP_TO_UINT node.
34315 if (!IsSigned && !Subtarget.hasVLX()) {
34316 // Otherwise we can defer to the generic legalizer which will widen
34317 // the input as well. This will be further widened during op
34318 // legalization to v8i32<-v8f64.
34319 // For strict nodes we'll need to widen ourselves.
34320 // FIXME: Fix the type legalizer to safely widen strict nodes?
34321 if (!IsStrict)
34322 return;
34323 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34324 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34325 Opc = N->getOpcode();
34326 }
34327 SDValue Res;
34328 SDValue Chain;
34329 if (IsStrict) {
34330 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34331 {N->getOperand(0), Src});
34332 Chain = Res.getValue(1);
34333 } else {
34334 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34335 }
34336 Results.push_back(Res);
34337 if (IsStrict)
34338 Results.push_back(Chain);
34339 return;
34340 }
34341
34342 // Custom widen strict v2f32->v2i32 by padding with zeros.
34343 // FIXME: Should generic type legalizer do this?
34344 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34345 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34346 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34347 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34348 {N->getOperand(0), Src});
34349 Results.push_back(Res);
34350 Results.push_back(Res.getValue(1));
34351 return;
34352 }
34353
34354 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34355 // so early out here.
34356 return;
34357 }
34358
34359 assert(!VT.isVector() && "Vectors should have been handled above!");
34360
34361 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34362 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34363 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34364 assert(!Subtarget.is64Bit() && "i64 should be legal");
34365 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34366 // If we use a 128-bit result we might need to use a target specific node.
34367 unsigned SrcElts =
34368 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34369 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34370 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34371 if (NumElts != SrcElts) {
34372 if (IsStrict)
34373 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34374 else
34375 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34376 }
34377
34378 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34379 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34380 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34381 ZeroIdx);
34382 SDValue Chain;
34383 if (IsStrict) {
34384 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34385 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34386 Chain = Res.getValue(1);
34387 } else
34388 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34389 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34390 Results.push_back(Res);
34391 if (IsStrict)
34392 Results.push_back(Chain);
34393 return;
34394 }
34395
34396 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34397 SDValue Chain;
34398 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34399 Results.push_back(V);
34400 if (IsStrict)
34401 Results.push_back(Chain);
34402 return;
34403 }
34404
34405 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34406 Results.push_back(V);
34407 if (IsStrict)
34408 Results.push_back(Chain);
34409 }
34410 return;
34411 }
34412 case ISD::LRINT:
34413 if (N->getValueType(0) == MVT::v2i32) {
34414 SDValue Src = N->getOperand(0);
34415 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34416 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34417 DAG.getUNDEF(MVT::v2f16));
34418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34419 DAG.getUNDEF(MVT::v4f16));
34420 } else if (Src.getValueType() != MVT::v2f64) {
34421 return;
34422 }
34423 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34424 return;
34425 }
34426 [[fallthrough]];
34427 case ISD::LLRINT: {
34428 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34429 Results.push_back(V);
34430 return;
34431 }
34432
34433 case ISD::SINT_TO_FP:
34434 case ISD::STRICT_SINT_TO_FP:
34435 case ISD::UINT_TO_FP:
34436 case ISD::STRICT_UINT_TO_FP: {
34437 bool IsStrict = N->isStrictFPOpcode();
34438 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34439 EVT VT = N->getValueType(0);
34440 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34441 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34442 Subtarget.hasVLX()) {
34443 if (Src.getValueType().getVectorElementType() == MVT::i16)
34444 return;
34445
34446 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34447 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34448 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34449 : DAG.getUNDEF(MVT::v2i32));
34450 if (IsStrict) {
34451 unsigned Opc =
34452 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34453 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34454 {N->getOperand(0), Src});
34455 Results.push_back(Res);
34456 Results.push_back(Res.getValue(1));
34457 } else {
34458 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34459 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34460 }
34461 return;
34462 }
34463 if (VT != MVT::v2f32)
34464 return;
34465 EVT SrcVT = Src.getValueType();
34466 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34467 if (IsStrict) {
34468 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34469 : X86ISD::STRICT_CVTUI2P;
34470 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34471 {N->getOperand(0), Src});
34472 Results.push_back(Res);
34473 Results.push_back(Res.getValue(1));
34474 } else {
34475 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34476 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34477 }
34478 return;
34479 }
34480 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34481 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
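// uint64 -> f32 without AVX512: inputs with the sign bit set are halved
// (shift right by one, OR-ing back the low bit so rounding is unaffected),
// converted as signed and then doubled with an FADD; a final select between
// that result and the direct signed conversion is keyed on the original
// sign bit.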
34482 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34483 SDValue One = DAG.getConstant(1, dl, SrcVT);
34484 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34485 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34486 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34487 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34488 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34489 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34490 for (int i = 0; i != 2; ++i) {
34491 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34492 SignSrc, DAG.getVectorIdxConstant(i, dl));
34493 if (IsStrict)
34494 SignCvts[i] =
34495 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34496 {N->getOperand(0), Elt});
34497 else
34498 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34499 };
34500 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34501 SDValue Slow, Chain;
34502 if (IsStrict) {
34503 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34504 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34505 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34506 {Chain, SignCvt, SignCvt});
34507 Chain = Slow.getValue(1);
34508 } else {
34509 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34510 }
34511 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34512 IsNeg =
34513 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34514 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34515 Results.push_back(Cvt);
34516 if (IsStrict)
34517 Results.push_back(Chain);
34518 return;
34519 }
34520
34521 if (SrcVT != MVT::v2i32)
34522 return;
34523
34524 if (IsSigned || Subtarget.hasAVX512()) {
34525 if (!IsStrict)
34526 return;
34527
34528 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34529 // FIXME: Should generic type legalizer do this?
34530 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34531 DAG.getConstant(0, dl, MVT::v2i32));
34532 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34533 {N->getOperand(0), Src});
34534 Results.push_back(Res);
34535 Results.push_back(Res.getValue(1));
34536 return;
34537 }
34538
34539 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
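// Classic SSE2 uint32 -> double trick: 0x4330000000000000 is the bit
// pattern of 2^52, so OR-ing the zero-extended 32-bit value into the
// mantissa and subtracting 2^52 gives an exact conversion, which is then
// rounded down to v4f32 via VFPROUND.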
34540 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34541 SDValue VBias = DAG.getConstantFP(
34542 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34543 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34544 DAG.getBitcast(MVT::v2i64, VBias));
34545 Or = DAG.getBitcast(MVT::v2f64, Or);
34546 if (IsStrict) {
34547 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34548 {N->getOperand(0), Or, VBias});
34549 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34550 {MVT::v4f32, MVT::Other},
34551 {Sub.getValue(1), Sub});
34552 Results.push_back(Res);
34553 Results.push_back(Res.getValue(1));
34554 } else {
34555 // TODO: Are there any fast-math-flags to propagate here?
34556 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34557 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34558 }
34559 return;
34560 }
34561 case ISD::STRICT_FP_ROUND:
34562 case ISD::FP_ROUND: {
34563 bool IsStrict = N->isStrictFPOpcode();
34564 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34565 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34566 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34567 EVT SrcVT = Src.getValueType();
34568 EVT VT = N->getValueType(0);
34569 SDValue V;
34570 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34571 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34572 : DAG.getUNDEF(MVT::v2f32);
34573 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34574 }
34575 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34576 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34577 if (SrcVT.getVectorElementType() != MVT::f32)
34578 return;
34579
34580 if (IsStrict)
34581 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34582 {Chain, Src, Rnd});
34583 else
34584 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34585
34586 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34587 if (IsStrict)
34588 Results.push_back(V.getValue(1));
34589 return;
34590 }
34591 if (!isTypeLegal(Src.getValueType()))
34592 return;
34593 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34594 if (IsStrict)
34595 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34596 {Chain, Src});
34597 else
34598 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34599 Results.push_back(V);
34600 if (IsStrict)
34601 Results.push_back(V.getValue(1));
34602 return;
34603 }
34604 case ISD::FP_EXTEND:
34605 case ISD::STRICT_FP_EXTEND: {
34606 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34607 // No other ValueType for FP_EXTEND should reach this point.
34608 assert(N->getValueType(0) == MVT::v2f32 &&
34609 "Do not know how to legalize this Node");
34610 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34611 return;
34612 bool IsStrict = N->isStrictFPOpcode();
34613 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34614 if (Src.getValueType().getVectorElementType() != MVT::f16)
34615 return;
34616 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34617 : DAG.getUNDEF(MVT::v2f16);
34618 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34619 if (IsStrict)
34620 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34621 {N->getOperand(0), V});
34622 else
34623 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34624 Results.push_back(V);
34625 if (IsStrict)
34626 Results.push_back(V.getValue(1));
34627 return;
34628 }
34629 case ISD::INTRINSIC_W_CHAIN: {
34630 unsigned IntNo = N->getConstantOperandVal(1);
34631 switch (IntNo) {
34632 default : llvm_unreachable("Do not know how to custom type "
34633 "legalize this intrinsic operation!");
34634 case Intrinsic::x86_rdtsc:
34635 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34636 Results);
34637 case Intrinsic::x86_rdtscp:
34638 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34639 Results);
34640 case Intrinsic::x86_rdpmc:
34641 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34642 Results);
34643 return;
34644 case Intrinsic::x86_rdpru:
34645 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34646 Results);
34647 return;
34648 case Intrinsic::x86_xgetbv:
34649 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34650 Results);
34651 return;
34652 }
34653 }
34654 case ISD::READCYCLECOUNTER: {
34655 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34656 }
34657 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34658 EVT T = N->getValueType(0);
34659 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34660 bool Regs64bit = T == MVT::i128;
34661 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34662 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34663 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
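// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and
// the replacement value in ECX:EBX (RCX:RBX); split both operands into
// halves and copy them into those registers.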
34664 SDValue cpInL, cpInH;
34665 std::tie(cpInL, cpInH) =
34666 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34667 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34668 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34669 cpInH =
34670 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34671 cpInH, cpInL.getValue(1));
34672 SDValue swapInL, swapInH;
34673 std::tie(swapInL, swapInH) =
34674 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34675 swapInH =
34676 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34677 swapInH, cpInH.getValue(1));
34678
34679 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34680 // until later. So we keep the RBX input in a vreg and use a custom
34681 // inserter.
34682 // Since RBX will be a reserved register the register allocator will not
34683 // make sure its value will be properly saved and restored around this
34684 // live-range.
34685 SDValue Result;
34686 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34687 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34688 if (Regs64bit) {
34689 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34690 swapInH.getValue(1)};
34691 Result =
34692 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34693 } else {
34694 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34695 swapInH.getValue(1));
34696 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34697 swapInL.getValue(1)};
34698 Result =
34699 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34700 }
34701
34702 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34703 Regs64bit ? X86::RAX : X86::EAX,
34704 HalfT, Result.getValue(1));
34705 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34706 Regs64bit ? X86::RDX : X86::EDX,
34707 HalfT, cpOutL.getValue(2));
34708 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34709
34710 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34711 MVT::i32, cpOutH.getValue(2));
34712 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34713 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34714
34715 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34716 Results.push_back(Success);
34717 Results.push_back(EFLAGS.getValue(1));
34718 return;
34719 }
34720 case ISD::ATOMIC_LOAD: {
34721 assert(
34722 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34723 "Unexpected VT!");
34724 bool NoImplicitFloatOps =
34725 DAG.getMachineFunction().getFunction().hasFnAttribute(
34726 Attribute::NoImplicitFloat);
34727 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34728 auto *Node = cast<AtomicSDNode>(N);
34729
34730 if (N->getValueType(0) == MVT::i128) {
34731 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
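// On AVX-capable 64-bit targets an aligned 16-byte vector load is treated
// as a single atomic access, so the i128 value can be loaded with one
// vector load and split into two i64 halves.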
34732 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34733 Node->getBasePtr(), Node->getMemOperand());
34734 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34735 DAG.getVectorIdxConstant(0, dl));
34736 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34737 DAG.getVectorIdxConstant(1, dl));
34738 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34739 {ResL, ResH}));
34740 Results.push_back(Ld.getValue(1));
34741 return;
34742 }
34743 break;
34744 }
34745 if (Subtarget.hasSSE1()) {
34746 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34747 // Then extract the lower 64-bits.
34748 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34749 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34750 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34751 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34752 MVT::i64, Node->getMemOperand());
34753 if (Subtarget.hasSSE2()) {
34754 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34755 DAG.getVectorIdxConstant(0, dl));
34756 Results.push_back(Res);
34757 Results.push_back(Ld.getValue(1));
34758 return;
34759 }
34760 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34761 // then casts to i64. This avoids a 128-bit stack temporary being
34762 // created by type legalization if we were to cast v4f32->v2i64.
34763 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34764 DAG.getVectorIdxConstant(0, dl));
34765 Res = DAG.getBitcast(MVT::i64, Res);
34766 Results.push_back(Res);
34767 Results.push_back(Ld.getValue(1));
34768 return;
34769 }
34770 if (Subtarget.hasX87()) {
34771 // First load this into an 80-bit X87 register. This will put the whole
34772 // integer into the significand.
34773 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34774 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34775 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34776 dl, Tys, Ops, MVT::i64,
34777 Node->getMemOperand());
34778 SDValue Chain = Result.getValue(1);
34779
34780 // Now store the X87 register to a stack temporary and convert to i64.
34781 // This store is not atomic and doesn't need to be.
34782 // FIXME: We don't need a stack temporary if the result of the load
34783 // is already being stored. We could just directly store there.
34784 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34785 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34786 MachinePointerInfo MPI =
34787 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34788 SDValue StoreOps[] = { Chain, Result, StackPtr };
34789 Chain = DAG.getMemIntrinsicNode(
34790 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34791 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34792
34793 // Finally load the value back from the stack temporary and return it.
34794 // This load is not atomic and doesn't need to be.
34795 // This load will be further type legalized.
34796 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34797 Results.push_back(Result);
34798 Results.push_back(Result.getValue(1));
34799 return;
34800 }
34801 }
34802 // TODO: Use MOVLPS when SSE1 is available?
34803 // Delegate to generic TypeLegalization. Situations we can really handle
34804 // should have already been dealt with by AtomicExpandPass.cpp.
34805 break;
34806 }
34807 case ISD::ATOMIC_SWAP:
34808 case ISD::ATOMIC_LOAD_ADD:
34809 case ISD::ATOMIC_LOAD_SUB:
34810 case ISD::ATOMIC_LOAD_AND:
34811 case ISD::ATOMIC_LOAD_OR:
34812 case ISD::ATOMIC_LOAD_XOR:
34813 case ISD::ATOMIC_LOAD_NAND:
34814 case ISD::ATOMIC_LOAD_MIN:
34815 case ISD::ATOMIC_LOAD_MAX:
34816 case ISD::ATOMIC_LOAD_UMIN:
34817 case ISD::ATOMIC_LOAD_UMAX:
34818 // Delegate to generic TypeLegalization. Situations we can really handle
34819 // should have already been dealt with by AtomicExpandPass.cpp.
34820 break;
34821
34822 case ISD::BITCAST: {
34823 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34824 EVT DstVT = N->getValueType(0);
34825 EVT SrcVT = N->getOperand(0).getValueType();
34826
34827 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34828 // we can split using the k-register rather than memory.
34829 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34830 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34831 SDValue Lo, Hi;
34832 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34833 Lo = DAG.getBitcast(MVT::i32, Lo);
34834 Hi = DAG.getBitcast(MVT::i32, Hi);
34835 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34836 Results.push_back(Res);
34837 return;
34838 }
34839
34840 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34841 // FIXME: Use v4f32 for SSE1?
34842 assert(Subtarget.hasSSE2() && "Requires SSE2");
34843 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34844 "Unexpected type action!");
34845 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34846 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34847 N->getOperand(0));
34848 Res = DAG.getBitcast(WideVT, Res);
34849 Results.push_back(Res);
34850 return;
34851 }
34852
34853 return;
34854 }
34855 case ISD::MGATHER: {
34856 EVT VT = N->getValueType(0);
34857 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34858 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34859 auto *Gather = cast<MaskedGatherSDNode>(N);
34860 SDValue Index = Gather->getIndex();
34861 if (Index.getValueType() != MVT::v2i64)
34862 return;
34864 "Unexpected type action!");
34865 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34866 SDValue Mask = Gather->getMask();
34867 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34868 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34869 Gather->getPassThru(),
34870 DAG.getUNDEF(VT));
34871 if (!Subtarget.hasVLX()) {
34872 // We need to widen the mask, but the instruction will only use 2
34873 // of its elements. So we can use undef.
34874 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34875 DAG.getUNDEF(MVT::v2i1));
34876 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34877 }
34878 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34879 Gather->getBasePtr(), Index, Gather->getScale() };
34880 SDValue Res = DAG.getMemIntrinsicNode(
34881 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34882 Gather->getMemoryVT(), Gather->getMemOperand());
34883 Results.push_back(Res);
34884 Results.push_back(Res.getValue(1));
34885 return;
34886 }
34887 return;
34888 }
34889 case ISD::LOAD: {
34890 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34891 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34892 // cast since type legalization will try to use an i64 load.
34893 MVT VT = N->getSimpleValueType(0);
34894 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34896 "Unexpected type action!");
34897 if (!ISD::isNON_EXTLoad(N))
34898 return;
34899 auto *Ld = cast<LoadSDNode>(N);
34900 if (Subtarget.hasSSE2()) {
34901 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34902 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34903 Ld->getPointerInfo(), Ld->getBaseAlign(),
34904 Ld->getMemOperand()->getFlags());
34905 SDValue Chain = Res.getValue(1);
34906 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34907 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34908 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34909 Res = DAG.getBitcast(WideVT, Res);
34910 Results.push_back(Res);
34911 Results.push_back(Chain);
34912 return;
34913 }
34914 assert(Subtarget.hasSSE1() && "Expected SSE");
34915 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34916 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34917 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34918 MVT::i64, Ld->getMemOperand());
34919 Results.push_back(Res);
34920 Results.push_back(Res.getValue(1));
34921 return;
34922 }
34923 case ISD::ADDRSPACECAST: {
34924 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34925 Results.push_back(V);
34926 return;
34927 }
34928 case ISD::BITREVERSE: {
34929 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34930 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34931 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34932 // We'll need to move the scalar in two i32 pieces.
34933 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34934 return;
34935 }
34936 case ISD::EXTRACT_VECTOR_ELT: {
34937 // f16 = extract vXf16 %vec, i64 %idx
34938 assert(N->getSimpleValueType(0) == MVT::f16 &&
34939 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34940 assert(Subtarget.hasFP16() && "Expected FP16");
34941 SDValue VecOp = N->getOperand(0);
34942 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34943 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34944 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34945 N->getOperand(1));
34946 Split = DAG.getBitcast(MVT::f16, Split);
34947 Results.push_back(Split);
34948 return;
34949 }
34950 }
34951}
34952
34953const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34954 switch ((X86ISD::NodeType)Opcode) {
34955 case X86ISD::FIRST_NUMBER: break;
34956#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34957 NODE_NAME_CASE(BSF)
34958 NODE_NAME_CASE(BSR)
34959 NODE_NAME_CASE(FSHL)
34960 NODE_NAME_CASE(FSHR)
34961 NODE_NAME_CASE(FAND)
34962 NODE_NAME_CASE(FANDN)
34963 NODE_NAME_CASE(FOR)
34964 NODE_NAME_CASE(FXOR)
34965 NODE_NAME_CASE(FILD)
34966 NODE_NAME_CASE(FIST)
34967 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34968 NODE_NAME_CASE(FLD)
34969 NODE_NAME_CASE(FST)
34970 NODE_NAME_CASE(CALL)
34971 NODE_NAME_CASE(CALL_RVMARKER)
34972 NODE_NAME_CASE(IMP_CALL)
34973 NODE_NAME_CASE(BT)
34974 NODE_NAME_CASE(CMP)
34975 NODE_NAME_CASE(FCMP)
34976 NODE_NAME_CASE(STRICT_FCMP)
34977 NODE_NAME_CASE(STRICT_FCMPS)
34978 NODE_NAME_CASE(COMI)
34979 NODE_NAME_CASE(UCOMI)
34980 NODE_NAME_CASE(COMX)
34981 NODE_NAME_CASE(UCOMX)
34982 NODE_NAME_CASE(CMPM)
34983 NODE_NAME_CASE(CMPMM)
34984 NODE_NAME_CASE(STRICT_CMPM)
34985 NODE_NAME_CASE(CMPMM_SAE)
34986 NODE_NAME_CASE(SETCC)
34987 NODE_NAME_CASE(SETCC_CARRY)
34988 NODE_NAME_CASE(FSETCC)
34989 NODE_NAME_CASE(FSETCCM)
34990 NODE_NAME_CASE(FSETCCM_SAE)
34991 NODE_NAME_CASE(CMOV)
34992 NODE_NAME_CASE(BRCOND)
34993 NODE_NAME_CASE(RET_GLUE)
34994 NODE_NAME_CASE(IRET)
34995 NODE_NAME_CASE(REP_STOS)
34996 NODE_NAME_CASE(REP_MOVS)
34997 NODE_NAME_CASE(GlobalBaseReg)
34998 NODE_NAME_CASE(Wrapper)
34999 NODE_NAME_CASE(WrapperRIP)
35000 NODE_NAME_CASE(MOVQ2DQ)
35001 NODE_NAME_CASE(MOVDQ2Q)
35002 NODE_NAME_CASE(MMX_MOVD2W)
35003 NODE_NAME_CASE(MMX_MOVW2D)
35004 NODE_NAME_CASE(PEXTRB)
35005 NODE_NAME_CASE(PEXTRW)
35006 NODE_NAME_CASE(INSERTPS)
35007 NODE_NAME_CASE(PINSRB)
35008 NODE_NAME_CASE(PINSRW)
35009 NODE_NAME_CASE(PSHUFB)
35010 NODE_NAME_CASE(ANDNP)
35011 NODE_NAME_CASE(BLENDI)
35012 NODE_NAME_CASE(BLENDV)
35013 NODE_NAME_CASE(HADD)
35014 NODE_NAME_CASE(HSUB)
35015 NODE_NAME_CASE(FHADD)
35016 NODE_NAME_CASE(FHSUB)
35017 NODE_NAME_CASE(CONFLICT)
35018 NODE_NAME_CASE(FMAX)
35019 NODE_NAME_CASE(FMAXS)
35020 NODE_NAME_CASE(FMAX_SAE)
35021 NODE_NAME_CASE(FMAXS_SAE)
35022 NODE_NAME_CASE(STRICT_FMAX)
35023 NODE_NAME_CASE(FMIN)
35024 NODE_NAME_CASE(FMINS)
35025 NODE_NAME_CASE(FMIN_SAE)
35026 NODE_NAME_CASE(FMINS_SAE)
35027 NODE_NAME_CASE(STRICT_FMIN)
35028 NODE_NAME_CASE(FMAXC)
35029 NODE_NAME_CASE(FMINC)
35030 NODE_NAME_CASE(FRSQRT)
35031 NODE_NAME_CASE(FRCP)
35032 NODE_NAME_CASE(EXTRQI)
35033 NODE_NAME_CASE(INSERTQI)
35034 NODE_NAME_CASE(TLSADDR)
35035 NODE_NAME_CASE(TLSBASEADDR)
35036 NODE_NAME_CASE(TLSCALL)
35037 NODE_NAME_CASE(TLSDESC)
35038 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35039 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35040 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35041 NODE_NAME_CASE(EH_RETURN)
35042 NODE_NAME_CASE(TC_RETURN)
35043 NODE_NAME_CASE(FNSTCW16m)
35044 NODE_NAME_CASE(FLDCW16m)
35045 NODE_NAME_CASE(FNSTENVm)
35046 NODE_NAME_CASE(FLDENVm)
35047 NODE_NAME_CASE(LCMPXCHG_DAG)
35048 NODE_NAME_CASE(LCMPXCHG8_DAG)
35049 NODE_NAME_CASE(LCMPXCHG16_DAG)
35050 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35051 NODE_NAME_CASE(LADD)
35052 NODE_NAME_CASE(LSUB)
35053 NODE_NAME_CASE(LOR)
35054 NODE_NAME_CASE(LXOR)
35055 NODE_NAME_CASE(LAND)
35056 NODE_NAME_CASE(LBTS)
35057 NODE_NAME_CASE(LBTC)
35058 NODE_NAME_CASE(LBTR)
35059 NODE_NAME_CASE(LBTS_RM)
35060 NODE_NAME_CASE(LBTC_RM)
35061 NODE_NAME_CASE(LBTR_RM)
35062 NODE_NAME_CASE(AADD)
35063 NODE_NAME_CASE(AOR)
35064 NODE_NAME_CASE(AXOR)
35065 NODE_NAME_CASE(AAND)
35066 NODE_NAME_CASE(VZEXT_MOVL)
35067 NODE_NAME_CASE(VZEXT_LOAD)
35068 NODE_NAME_CASE(VEXTRACT_STORE)
35069 NODE_NAME_CASE(VTRUNC)
35070 NODE_NAME_CASE(VTRUNCS)
35071 NODE_NAME_CASE(VTRUNCUS)
35072 NODE_NAME_CASE(VMTRUNC)
35073 NODE_NAME_CASE(VMTRUNCS)
35074 NODE_NAME_CASE(VMTRUNCUS)
35075 NODE_NAME_CASE(VTRUNCSTORES)
35076 NODE_NAME_CASE(VTRUNCSTOREUS)
35077 NODE_NAME_CASE(VMTRUNCSTORES)
35078 NODE_NAME_CASE(VMTRUNCSTOREUS)
35079 NODE_NAME_CASE(VFPEXT)
35080 NODE_NAME_CASE(STRICT_VFPEXT)
35081 NODE_NAME_CASE(VFPEXT_SAE)
35082 NODE_NAME_CASE(VFPEXTS)
35083 NODE_NAME_CASE(VFPEXTS_SAE)
35084 NODE_NAME_CASE(VFPROUND)
35085 NODE_NAME_CASE(VFPROUND2)
35086 NODE_NAME_CASE(VFPROUND2_RND)
35087 NODE_NAME_CASE(STRICT_VFPROUND)
35088 NODE_NAME_CASE(VMFPROUND)
35089 NODE_NAME_CASE(VFPROUND_RND)
35090 NODE_NAME_CASE(VFPROUNDS)
35091 NODE_NAME_CASE(VFPROUNDS_RND)
35092 NODE_NAME_CASE(VSHLDQ)
35093 NODE_NAME_CASE(VSRLDQ)
35094 NODE_NAME_CASE(VSHL)
35095 NODE_NAME_CASE(VSRL)
35096 NODE_NAME_CASE(VSRA)
35097 NODE_NAME_CASE(VSHLI)
35098 NODE_NAME_CASE(VSRLI)
35099 NODE_NAME_CASE(VSRAI)
35100 NODE_NAME_CASE(VSHLV)
35101 NODE_NAME_CASE(VSRLV)
35102 NODE_NAME_CASE(VSRAV)
35103 NODE_NAME_CASE(VROTLI)
35104 NODE_NAME_CASE(VROTRI)
35105 NODE_NAME_CASE(VPPERM)
35106 NODE_NAME_CASE(CMPP)
35107 NODE_NAME_CASE(STRICT_CMPP)
35108 NODE_NAME_CASE(PCMPEQ)
35109 NODE_NAME_CASE(PCMPGT)
35110 NODE_NAME_CASE(PHMINPOS)
35111 NODE_NAME_CASE(ADD)
35112 NODE_NAME_CASE(SUB)
35113 NODE_NAME_CASE(ADC)
35114 NODE_NAME_CASE(SBB)
35115 NODE_NAME_CASE(SMUL)
35116 NODE_NAME_CASE(UMUL)
35117 NODE_NAME_CASE(OR)
35118 NODE_NAME_CASE(XOR)
35119 NODE_NAME_CASE(AND)
35120 NODE_NAME_CASE(BEXTR)
35121 NODE_NAME_CASE(BEXTRI)
35122 NODE_NAME_CASE(BZHI)
35123 NODE_NAME_CASE(PDEP)
35124 NODE_NAME_CASE(PEXT)
35125 NODE_NAME_CASE(MUL_IMM)
35126 NODE_NAME_CASE(MOVMSK)
35127 NODE_NAME_CASE(PTEST)
35128 NODE_NAME_CASE(TESTP)
35129 NODE_NAME_CASE(KORTEST)
35130 NODE_NAME_CASE(KTEST)
35131 NODE_NAME_CASE(KADD)
35132 NODE_NAME_CASE(KSHIFTL)
35133 NODE_NAME_CASE(KSHIFTR)
35134 NODE_NAME_CASE(PACKSS)
35135 NODE_NAME_CASE(PACKUS)
35136 NODE_NAME_CASE(PALIGNR)
35137 NODE_NAME_CASE(VALIGN)
35138 NODE_NAME_CASE(VSHLD)
35139 NODE_NAME_CASE(VSHRD)
35140 NODE_NAME_CASE(PSHUFD)
35141 NODE_NAME_CASE(PSHUFHW)
35142 NODE_NAME_CASE(PSHUFLW)
35143 NODE_NAME_CASE(SHUFP)
35144 NODE_NAME_CASE(SHUF128)
35145 NODE_NAME_CASE(MOVLHPS)
35146 NODE_NAME_CASE(MOVHLPS)
35147 NODE_NAME_CASE(MOVDDUP)
35148 NODE_NAME_CASE(MOVSHDUP)
35149 NODE_NAME_CASE(MOVSLDUP)
35150 NODE_NAME_CASE(MOVSD)
35151 NODE_NAME_CASE(MOVSS)
35152 NODE_NAME_CASE(MOVSH)
35153 NODE_NAME_CASE(UNPCKL)
35154 NODE_NAME_CASE(UNPCKH)
35155 NODE_NAME_CASE(VBROADCAST)
35156 NODE_NAME_CASE(VBROADCAST_LOAD)
35157 NODE_NAME_CASE(VBROADCASTM)
35158 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35159 NODE_NAME_CASE(VPERMILPV)
35160 NODE_NAME_CASE(VPERMILPI)
35161 NODE_NAME_CASE(VPERM2X128)
35162 NODE_NAME_CASE(VPERMV)
35163 NODE_NAME_CASE(VPERMV3)
35164 NODE_NAME_CASE(VPERMI)
35165 NODE_NAME_CASE(VPTERNLOG)
35166 NODE_NAME_CASE(FP_TO_SINT_SAT)
35167 NODE_NAME_CASE(FP_TO_UINT_SAT)
35168 NODE_NAME_CASE(VFIXUPIMM)
35169 NODE_NAME_CASE(VFIXUPIMM_SAE)
35170 NODE_NAME_CASE(VFIXUPIMMS)
35171 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35172 NODE_NAME_CASE(VRANGE)
35173 NODE_NAME_CASE(VRANGE_SAE)
35174 NODE_NAME_CASE(VRANGES)
35175 NODE_NAME_CASE(VRANGES_SAE)
35176 NODE_NAME_CASE(PMULUDQ)
35177 NODE_NAME_CASE(PMULDQ)
35178 NODE_NAME_CASE(PSADBW)
35179 NODE_NAME_CASE(DBPSADBW)
35180 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35181 NODE_NAME_CASE(VAARG_64)
35182 NODE_NAME_CASE(VAARG_X32)
35183 NODE_NAME_CASE(DYN_ALLOCA)
35184 NODE_NAME_CASE(MFENCE)
35185 NODE_NAME_CASE(SEG_ALLOCA)
35186 NODE_NAME_CASE(PROBED_ALLOCA)
35187 NODE_NAME_CASE(RDRAND)
35188 NODE_NAME_CASE(RDSEED)
35189 NODE_NAME_CASE(RDPKRU)
35190 NODE_NAME_CASE(WRPKRU)
35191 NODE_NAME_CASE(VPMADDUBSW)
35192 NODE_NAME_CASE(VPMADDWD)
35193 NODE_NAME_CASE(VPSHA)
35194 NODE_NAME_CASE(VPSHL)
35195 NODE_NAME_CASE(VPCOM)
35196 NODE_NAME_CASE(VPCOMU)
35197 NODE_NAME_CASE(VPERMIL2)
35198 NODE_NAME_CASE(FMSUB)
35199 NODE_NAME_CASE(STRICT_FMSUB)
35200 NODE_NAME_CASE(FNMADD)
35201 NODE_NAME_CASE(STRICT_FNMADD)
35202 NODE_NAME_CASE(FNMSUB)
35203 NODE_NAME_CASE(STRICT_FNMSUB)
35204 NODE_NAME_CASE(FMADDSUB)
35205 NODE_NAME_CASE(FMSUBADD)
35206 NODE_NAME_CASE(FMADD_RND)
35207 NODE_NAME_CASE(FNMADD_RND)
35208 NODE_NAME_CASE(FMSUB_RND)
35209 NODE_NAME_CASE(FNMSUB_RND)
35210 NODE_NAME_CASE(FMADDSUB_RND)
35211 NODE_NAME_CASE(FMSUBADD_RND)
35212 NODE_NAME_CASE(VFMADDC)
35213 NODE_NAME_CASE(VFMADDC_RND)
35214 NODE_NAME_CASE(VFCMADDC)
35215 NODE_NAME_CASE(VFCMADDC_RND)
35216 NODE_NAME_CASE(VFMULC)
35217 NODE_NAME_CASE(VFMULC_RND)
35218 NODE_NAME_CASE(VFCMULC)
35219 NODE_NAME_CASE(VFCMULC_RND)
35220 NODE_NAME_CASE(VFMULCSH)
35221 NODE_NAME_CASE(VFMULCSH_RND)
35222 NODE_NAME_CASE(VFCMULCSH)
35223 NODE_NAME_CASE(VFCMULCSH_RND)
35224 NODE_NAME_CASE(VFMADDCSH)
35225 NODE_NAME_CASE(VFMADDCSH_RND)
35226 NODE_NAME_CASE(VFCMADDCSH)
35227 NODE_NAME_CASE(VFCMADDCSH_RND)
35228 NODE_NAME_CASE(VPMADD52H)
35229 NODE_NAME_CASE(VPMADD52L)
35230 NODE_NAME_CASE(VRNDSCALE)
35231 NODE_NAME_CASE(STRICT_VRNDSCALE)
35232 NODE_NAME_CASE(VRNDSCALE_SAE)
35233 NODE_NAME_CASE(VRNDSCALES)
35234 NODE_NAME_CASE(VRNDSCALES_SAE)
35235 NODE_NAME_CASE(VREDUCE)
35236 NODE_NAME_CASE(VREDUCE_SAE)
35237 NODE_NAME_CASE(VREDUCES)
35238 NODE_NAME_CASE(VREDUCES_SAE)
35239 NODE_NAME_CASE(VGETMANT)
35240 NODE_NAME_CASE(VGETMANT_SAE)
35241 NODE_NAME_CASE(VGETMANTS)
35242 NODE_NAME_CASE(VGETMANTS_SAE)
35243 NODE_NAME_CASE(PCMPESTR)
35244 NODE_NAME_CASE(PCMPISTR)
35245 NODE_NAME_CASE(XTEST)
35246 NODE_NAME_CASE(COMPRESS)
35247 NODE_NAME_CASE(EXPAND)
35248 NODE_NAME_CASE(SELECTS)
35249 NODE_NAME_CASE(ADDSUB)
35250 NODE_NAME_CASE(RCP14)
35251 NODE_NAME_CASE(RCP14S)
35252 NODE_NAME_CASE(RSQRT14)
35253 NODE_NAME_CASE(RSQRT14S)
35254 NODE_NAME_CASE(FADD_RND)
35255 NODE_NAME_CASE(FADDS)
35256 NODE_NAME_CASE(FADDS_RND)
35257 NODE_NAME_CASE(FSUB_RND)
35258 NODE_NAME_CASE(FSUBS)
35259 NODE_NAME_CASE(FSUBS_RND)
35260 NODE_NAME_CASE(FMUL_RND)
35261 NODE_NAME_CASE(FMULS)
35262 NODE_NAME_CASE(FMULS_RND)
35263 NODE_NAME_CASE(FDIV_RND)
35264 NODE_NAME_CASE(FDIVS)
35265 NODE_NAME_CASE(FDIVS_RND)
35266 NODE_NAME_CASE(FSQRT_RND)
35267 NODE_NAME_CASE(FSQRTS)
35268 NODE_NAME_CASE(FSQRTS_RND)
35269 NODE_NAME_CASE(FGETEXP)
35270 NODE_NAME_CASE(FGETEXP_SAE)
35271 NODE_NAME_CASE(FGETEXPS)
35272 NODE_NAME_CASE(FGETEXPS_SAE)
35273 NODE_NAME_CASE(SCALEF)
35274 NODE_NAME_CASE(SCALEF_RND)
35275 NODE_NAME_CASE(SCALEFS)
35276 NODE_NAME_CASE(SCALEFS_RND)
35277 NODE_NAME_CASE(MULHRS)
35278 NODE_NAME_CASE(SINT_TO_FP_RND)
35279 NODE_NAME_CASE(UINT_TO_FP_RND)
35280 NODE_NAME_CASE(CVTTP2SI)
35281 NODE_NAME_CASE(CVTTP2UI)
35282 NODE_NAME_CASE(STRICT_CVTTP2SI)
35283 NODE_NAME_CASE(STRICT_CVTTP2UI)
35284 NODE_NAME_CASE(MCVTTP2SI)
35285 NODE_NAME_CASE(MCVTTP2UI)
35286 NODE_NAME_CASE(CVTTP2SI_SAE)
35287 NODE_NAME_CASE(CVTTP2UI_SAE)
35288 NODE_NAME_CASE(CVTTS2SI)
35289 NODE_NAME_CASE(CVTTS2UI)
35290 NODE_NAME_CASE(CVTTS2SI_SAE)
35291 NODE_NAME_CASE(CVTTS2UI_SAE)
35292 NODE_NAME_CASE(CVTSI2P)
35293 NODE_NAME_CASE(CVTUI2P)
35294 NODE_NAME_CASE(STRICT_CVTSI2P)
35295 NODE_NAME_CASE(STRICT_CVTUI2P)
35296 NODE_NAME_CASE(MCVTSI2P)
35297 NODE_NAME_CASE(MCVTUI2P)
35298 NODE_NAME_CASE(VFPCLASS)
35299 NODE_NAME_CASE(VFPCLASSS)
35300 NODE_NAME_CASE(MULTISHIFT)
35301 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35302 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35303 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35304 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35305 NODE_NAME_CASE(CVTPS2PH)
35306 NODE_NAME_CASE(STRICT_CVTPS2PH)
35307 NODE_NAME_CASE(CVTPS2PH_SAE)
35308 NODE_NAME_CASE(MCVTPS2PH)
35309 NODE_NAME_CASE(MCVTPS2PH_SAE)
35310 NODE_NAME_CASE(CVTPH2PS)
35311 NODE_NAME_CASE(STRICT_CVTPH2PS)
35312 NODE_NAME_CASE(CVTPH2PS_SAE)
35313 NODE_NAME_CASE(CVTP2SI)
35314 NODE_NAME_CASE(CVTP2UI)
35315 NODE_NAME_CASE(MCVTP2SI)
35316 NODE_NAME_CASE(MCVTP2UI)
35317 NODE_NAME_CASE(CVTP2SI_RND)
35318 NODE_NAME_CASE(CVTP2UI_RND)
35319 NODE_NAME_CASE(CVTS2SI)
35320 NODE_NAME_CASE(CVTS2UI)
35321 NODE_NAME_CASE(CVTS2SI_RND)
35322 NODE_NAME_CASE(CVTS2UI_RND)
35323 NODE_NAME_CASE(CVTNEPS2BF16)
35324 NODE_NAME_CASE(MCVTNEPS2BF16)
35325 NODE_NAME_CASE(DPBF16PS)
35326 NODE_NAME_CASE(DPFP16PS)
35327 NODE_NAME_CASE(MPSADBW)
35328 NODE_NAME_CASE(LWPINS)
35329 NODE_NAME_CASE(MGATHER)
35330 NODE_NAME_CASE(MSCATTER)
35331 NODE_NAME_CASE(VPDPBUSD)
35332 NODE_NAME_CASE(VPDPBUSDS)
35333 NODE_NAME_CASE(VPDPWSSD)
35334 NODE_NAME_CASE(VPDPWSSDS)
35335 NODE_NAME_CASE(VPSHUFBITQMB)
35336 NODE_NAME_CASE(GF2P8MULB)
35337 NODE_NAME_CASE(GF2P8AFFINEQB)
35338 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35339 NODE_NAME_CASE(NT_CALL)
35340 NODE_NAME_CASE(NT_BRIND)
35341 NODE_NAME_CASE(UMWAIT)
35342 NODE_NAME_CASE(TPAUSE)
35343 NODE_NAME_CASE(ENQCMD)
35344 NODE_NAME_CASE(ENQCMDS)
35345 NODE_NAME_CASE(VP2INTERSECT)
35346 NODE_NAME_CASE(VPDPBSUD)
35347 NODE_NAME_CASE(VPDPBSUDS)
35348 NODE_NAME_CASE(VPDPBUUD)
35349 NODE_NAME_CASE(VPDPBUUDS)
35350 NODE_NAME_CASE(VPDPBSSD)
35351 NODE_NAME_CASE(VPDPBSSDS)
35352 NODE_NAME_CASE(VPDPWSUD)
35353 NODE_NAME_CASE(VPDPWSUDS)
35354 NODE_NAME_CASE(VPDPWUSD)
35355 NODE_NAME_CASE(VPDPWUSDS)
35356 NODE_NAME_CASE(VPDPWUUD)
35357 NODE_NAME_CASE(VPDPWUUDS)
35358 NODE_NAME_CASE(VMINMAX)
35359 NODE_NAME_CASE(VMINMAX_SAE)
35360 NODE_NAME_CASE(VMINMAXS)
35361 NODE_NAME_CASE(VMINMAXS_SAE)
35362 NODE_NAME_CASE(CVTP2IBS)
35363 NODE_NAME_CASE(CVTP2IUBS)
35364 NODE_NAME_CASE(CVTP2IBS_RND)
35365 NODE_NAME_CASE(CVTP2IUBS_RND)
35366 NODE_NAME_CASE(CVTTP2IBS)
35367 NODE_NAME_CASE(CVTTP2IUBS)
35368 NODE_NAME_CASE(CVTTP2IBS_SAE)
35369 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35370 NODE_NAME_CASE(VCVT2PH2BF8)
35371 NODE_NAME_CASE(VCVT2PH2BF8S)
35372 NODE_NAME_CASE(VCVT2PH2HF8)
35373 NODE_NAME_CASE(VCVT2PH2HF8S)
35374 NODE_NAME_CASE(VCVTBIASPH2BF8)
35375 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35376 NODE_NAME_CASE(VCVTBIASPH2HF8)
35377 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35378 NODE_NAME_CASE(VCVTPH2BF8)
35379 NODE_NAME_CASE(VCVTPH2BF8S)
35380 NODE_NAME_CASE(VCVTPH2HF8)
35381 NODE_NAME_CASE(VCVTPH2HF8S)
35382 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35383 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35384 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35385 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35386 NODE_NAME_CASE(VMCVTPH2BF8)
35387 NODE_NAME_CASE(VMCVTPH2BF8S)
35388 NODE_NAME_CASE(VMCVTPH2HF8)
35389 NODE_NAME_CASE(VMCVTPH2HF8S)
35390 NODE_NAME_CASE(VCVTHF82PH)
35391 NODE_NAME_CASE(AESENC128KL)
35392 NODE_NAME_CASE(AESDEC128KL)
35393 NODE_NAME_CASE(AESENC256KL)
35394 NODE_NAME_CASE(AESDEC256KL)
35395 NODE_NAME_CASE(AESENCWIDE128KL)
35396 NODE_NAME_CASE(AESDECWIDE128KL)
35397 NODE_NAME_CASE(AESENCWIDE256KL)
35398 NODE_NAME_CASE(AESDECWIDE256KL)
35399 NODE_NAME_CASE(CMPCCXADD)
35400 NODE_NAME_CASE(TESTUI)
35401 NODE_NAME_CASE(FP80_ADD)
35402 NODE_NAME_CASE(STRICT_FP80_ADD)
35403 NODE_NAME_CASE(CCMP)
35404 NODE_NAME_CASE(CTEST)
35405 NODE_NAME_CASE(CLOAD)
35406 NODE_NAME_CASE(CSTORE)
35407 NODE_NAME_CASE(CVTTS2SIS)
35408 NODE_NAME_CASE(CVTTS2UIS)
35409 NODE_NAME_CASE(CVTTS2SIS_SAE)
35410 NODE_NAME_CASE(CVTTS2UIS_SAE)
35411 NODE_NAME_CASE(CVTTP2SIS)
35412 NODE_NAME_CASE(MCVTTP2SIS)
35413 NODE_NAME_CASE(CVTTP2UIS_SAE)
35414 NODE_NAME_CASE(CVTTP2SIS_SAE)
35415 NODE_NAME_CASE(CVTTP2UIS)
35416 NODE_NAME_CASE(MCVTTP2UIS)
35417 NODE_NAME_CASE(POP_FROM_X87_REG)
35418 }
35419 return nullptr;
35420#undef NODE_NAME_CASE
35421}
35422
35423/// Return true if the addressing mode represented by AM is legal for this
35424/// target, for a load/store of the specified type.
35425bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35426 const AddrMode &AM, Type *Ty,
35427 unsigned AS,
35428 Instruction *I) const {
35429 // X86 supports extremely general addressing modes.
35430 CodeModel::Model M = getTargetMachine().getCodeModel();
35431
35432 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35433 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35434 return false;
35435
35436 if (AM.BaseGV) {
35437 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35438
35439 // If a reference to this global requires an extra load, we can't fold it.
35440 if (isGlobalStubReference(GVFlags))
35441 return false;
35442
35443 // If BaseGV requires a register for the PIC base, we cannot also have a
35444 // BaseReg specified.
35445 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35446 return false;
35447
35448 // If lower 4G is not available, then we must use rip-relative addressing.
35449 if ((M != CodeModel::Small || isPositionIndependent()) &&
35450 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35451 return false;
35452 }
35453
35454 switch (AM.Scale) {
35455 case 0:
35456 case 1:
35457 case 2:
35458 case 4:
35459 case 8:
35460 // These scales always work.
35461 break;
35462 case 3:
35463 case 5:
35464 case 9:
35465 // These scales are formed with basereg+scalereg. Only accept if there is
35466 // no basereg yet.
35467 if (AM.HasBaseReg)
35468 return false;
35469 break;
35470 default: // Other stuff never works.
35471 return false;
35472 }
35473
35474 return true;
35475}
35476
35477bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35478 switch (Opcode) {
35479 // These are non-commutative binops.
35480 // TODO: Add more X86ISD opcodes once we have test coverage.
35481 case X86ISD::ANDNP:
35482 case X86ISD::PCMPGT:
35483 case X86ISD::FMAX:
35484 case X86ISD::FMIN:
35485 case X86ISD::FANDN:
35486 case X86ISD::VPSHA:
35487 case X86ISD::VPSHL:
35488 case X86ISD::VSHLV:
35489 case X86ISD::VSRLV:
35490 case X86ISD::VSRAV:
35491 return true;
35492 }
35493
35494 return TargetLoweringBase::isBinOp(Opcode);
35495}
35496
35497bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35498 switch (Opcode) {
35499 // TODO: Add more X86ISD opcodes once we have test coverage.
35500 case X86ISD::PCMPEQ:
35501 case X86ISD::PMULDQ:
35502 case X86ISD::PMULUDQ:
35503 case X86ISD::FMAXC:
35504 case X86ISD::FMINC:
35505 case X86ISD::FAND:
35506 case X86ISD::FOR:
35507 case X86ISD::FXOR:
35508 return true;
35509 }
35510
35511 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35512}
35513
35514bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35515 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35516 return false;
35517 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35518 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35519 return NumBits1 > NumBits2;
35520}
35521
35522bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35523 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35524 return false;
35525
35526 if (!isTypeLegal(EVT::getEVT(Ty1)))
35527 return false;
35528
35529 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35530
35531 // Assuming the caller doesn't have a zeroext or signext return parameter,
35532 // truncation all the way down to i1 is valid.
35533 return true;
35534}
35535
35536bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35537 return isInt<32>(Imm);
35538}
35539
35540bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35541 // Can also use sub to handle negated immediates.
35542 return isInt<32>(Imm);
35543}
35544
35545bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35546 return isInt<32>(Imm);
35547}
35548
35549bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35550 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35551 return false;
35552 unsigned NumBits1 = VT1.getSizeInBits();
35553 unsigned NumBits2 = VT2.getSizeInBits();
35554 return NumBits1 > NumBits2;
35555}
35556
35557bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35558 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35559 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35560}
35561
35562bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35563 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35564 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35565}
35566
35567bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35568 EVT VT1 = Val.getValueType();
35569 if (isZExtFree(VT1, VT2))
35570 return true;
35571
35572 if (Val.getOpcode() != ISD::LOAD)
35573 return false;
35574
35575 if (!VT1.isSimple() || !VT1.isInteger() ||
35576 !VT2.isSimple() || !VT2.isInteger())
35577 return false;
35578
35579 switch (VT1.getSimpleVT().SimpleTy) {
35580 default: break;
35581 case MVT::i8:
35582 case MVT::i16:
35583 case MVT::i32:
35584 // X86 has 8, 16, and 32-bit zero-extending loads.
35585 return true;
35586 }
35587
35588 return false;
35589}
35590
35591bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35592 if (!Subtarget.is64Bit())
35593 return false;
35594 return TargetLowering::shouldConvertPhiType(From, To);
35595}
35596
35597bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35598 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35599 return false;
35600
35601 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35602
35603 // There is no extending load for vXi1.
35604 if (SrcVT.getScalarType() == MVT::i1)
35605 return false;
35606
35607 return true;
35608}
35609
35610bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35611 EVT VT) const {
35612 if (Subtarget.useSoftFloat())
35613 return false;
35614
35615 if (!Subtarget.hasAnyFMA())
35616 return false;
35617
35618 VT = VT.getScalarType();
35619
35620 if (!VT.isSimple())
35621 return false;
35622
35623 switch (VT.getSimpleVT().SimpleTy) {
35624 case MVT::f16:
35625 return Subtarget.hasFP16();
35626 case MVT::f32:
35627 case MVT::f64:
35628 return true;
35629 default:
35630 break;
35631 }
35632
35633 return false;
35634}
35635
35636bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35637 EVT DestVT) const {
35638 // i16 instructions are longer (0x66 prefix) and potentially slower.
35639 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35640}
35641
35642bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35643 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35644 SDValue Y) const {
35645 if (SelectOpcode == ISD::SELECT) {
35646 if (VT.isVector())
35647 return false;
35648 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35649 return false;
35650 using namespace llvm::SDPatternMatch;
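// The patterns below correspond to the BMI instructions: BLSI computes
// x & -x (isolate lowest set bit), BLSR computes x & (x - 1) (clear lowest
// set bit), and BLSMSK computes x ^ (x - 1) (mask up to and including the
// lowest set bit).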
35651 // BLSI
35652 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35653 sd_match(X, m_Neg(m_Specific(Y)))))
35654 return true;
35655 // BLSR
35656 if (BinOpcode == ISD::AND &&
35657 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35658 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35659 return true;
35660 // BLSMSK
35661 if (BinOpcode == ISD::XOR &&
35662 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35663 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35664 return true;
35665
35666 return false;
35667 }
35668 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35669 // benefit. The transform may also be profitable for scalar code.
35670 if (!Subtarget.hasAVX512())
35671 return false;
35672 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35673 return false;
35674 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35675 return false;
35676
35677 return true;
35678}
35679
35680/// Targets can use this to indicate that they only support *some*
35681/// VECTOR_SHUFFLE operations, those with specific masks.
35682/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35683/// are assumed to be legal.
35684bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35685 if (!VT.isSimple())
35686 return false;
35687
35688 // Not for i1 vectors
35689 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35690 return false;
35691
35692 // Very little shuffling can be done for 64-bit vectors right now.
35693 if (VT.getSimpleVT().getSizeInBits() == 64)
35694 return false;
35695
35696 // We only care that the types being shuffled are legal. The lowering can
35697 // handle any possible shuffle mask that results.
35698 return isTypeLegal(VT.getSimpleVT());
35699}
35700
35701bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35702 EVT VT) const {
35703 // Don't convert an 'and' into a shuffle that we don't directly support.
35704 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35705 if (!Subtarget.hasAVX2())
35706 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35707 return false;
35708
35709 // Just delegate to the generic legality, clear masks aren't special.
35710 return isShuffleMaskLegal(Mask, VT);
35711}
35712
35713bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35714 // If the subtarget is using thunks, we need to not generate jump tables.
35715 if (Subtarget.useIndirectThunkBranches())
35716 return false;
35717
35718 // Otherwise, fallback on the generic logic.
35719 return TargetLowering::areJTsAllowed(Fn);
35720}
35721
35722MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35723 EVT ConditionVT) const {
35724 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35725 // zero-extensions.
35726 if (ConditionVT.getSizeInBits() < 32)
35727 return MVT::i32;
35728 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35729 ConditionVT);
35730}
35731
35732//===----------------------------------------------------------------------===//
35733// X86 Scheduler Hooks
35734//===----------------------------------------------------------------------===//
35735
35736/// Utility function to emit xbegin specifying the start of an RTM region.
35737static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35738 const TargetInstrInfo *TII) {
35739 const MIMetadata MIMD(MI);
35740
35741 const BasicBlock *BB = MBB->getBasicBlock();
35742 MachineFunction::iterator I = ++MBB->getIterator();
35743
35744 // For the v = xbegin(), we generate
35745 //
35746 // thisMBB:
35747 // xbegin sinkMBB
35748 //
35749 // mainMBB:
35750 // s0 = -1
35751 //
35752 // fallBB:
35753 // eax = # XABORT_DEF
35754 // s1 = eax
35755 //
35756 // sinkMBB:
35757 // v = phi(s0/mainBB, s1/fallBB)
35758
35759 MachineBasicBlock *thisMBB = MBB;
35760 MachineFunction *MF = MBB->getParent();
35761 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35762 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35763 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35764 MF->insert(I, mainMBB);
35765 MF->insert(I, fallMBB);
35766 MF->insert(I, sinkMBB);
35767
35768 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35769 mainMBB->addLiveIn(X86::EFLAGS);
35770 fallMBB->addLiveIn(X86::EFLAGS);
35771 sinkMBB->addLiveIn(X86::EFLAGS);
35772 }
35773
35774 // Transfer the remainder of BB and its successor edges to sinkMBB.
35775 sinkMBB->splice(sinkMBB->begin(), MBB,
35776 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35777 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35778
35779 MachineRegisterInfo &MRI = MF->getRegInfo();
35780 Register DstReg = MI.getOperand(0).getReg();
35781 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35782 Register mainDstReg = MRI.createVirtualRegister(RC);
35783 Register fallDstReg = MRI.createVirtualRegister(RC);
35784
35785 // thisMBB:
35786 // xbegin fallMBB
35787 // # fallthrough to mainMBB
35788 // # abortion to fallMBB
35789 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35790 thisMBB->addSuccessor(mainMBB);
35791 thisMBB->addSuccessor(fallMBB);
35792
35793 // mainMBB:
35794 // mainDstReg := -1
35795 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35796 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35797 mainMBB->addSuccessor(sinkMBB);
35798
35799 // fallMBB:
35800 // ; pseudo instruction to model hardware's definition from XABORT
35801 // EAX := XABORT_DEF
35802 // fallDstReg := EAX
35803 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35804 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35805 .addReg(X86::EAX);
35806 fallMBB->addSuccessor(sinkMBB);
35807
35808 // sinkMBB:
35809 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35810 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35811 .addReg(mainDstReg).addMBB(mainMBB)
35812 .addReg(fallDstReg).addMBB(fallMBB);
35813
35814 MI.eraseFromParent();
35815 return sinkMBB;
35816}
35817
35818MachineBasicBlock *
35819X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35820 MachineBasicBlock *MBB) const {
35821 // Emit va_arg instruction on X86-64.
35822
35823 // Operands to this pseudo-instruction:
35824 // 0 ) Output : destination address (reg)
35825 // 1-5) Input : va_list address (addr, i64mem)
35826 // 6 ) ArgSize : Size (in bytes) of vararg type
35827 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35828 // 8 ) Align : Alignment of type
35829 // 9 ) EFLAGS (implicit-def)
35830
35831 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35832 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35833
35834 Register DestReg = MI.getOperand(0).getReg();
35835 MachineOperand &Base = MI.getOperand(1);
35836 MachineOperand &Scale = MI.getOperand(2);
35837 MachineOperand &Index = MI.getOperand(3);
35838 MachineOperand &Disp = MI.getOperand(4);
35839 MachineOperand &Segment = MI.getOperand(5);
35840 unsigned ArgSize = MI.getOperand(6).getImm();
35841 unsigned ArgMode = MI.getOperand(7).getImm();
35842 Align Alignment = Align(MI.getOperand(8).getImm());
35843
35844 MachineFunction *MF = MBB->getParent();
35845
35846 // Memory Reference
35847 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35848
35849 MachineMemOperand *OldMMO = MI.memoperands().front();
35850
35851 // Clone the MMO into two separate MMOs for loading and storing
35852 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35853 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35854 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35855 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35856
35857 // Machine Information
35858 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35859 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35860 const TargetRegisterClass *AddrRegClass =
35861 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35862 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35863 const MIMetadata MIMD(MI);
35864
35865 // struct va_list {
35866 // i32 gp_offset
35867 // i32 fp_offset
35868 // i64 overflow_area (address)
35869 // i64 reg_save_area (address)
35870 // }
35871 // sizeof(va_list) = 24
35872 // alignment(va_list) = 8
35873
35874 unsigned TotalNumIntRegs = 6;
35875 unsigned TotalNumXMMRegs = 8;
35876 bool UseGPOffset = (ArgMode == 1);
35877 bool UseFPOffset = (ArgMode == 2);
35878 unsigned MaxOffset = TotalNumIntRegs * 8 +
35879 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35880
35881 /* Align ArgSize to a multiple of 8 */
35882 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35883 bool NeedsAlign = (Alignment > 8);
35884
35885 MachineBasicBlock *thisMBB = MBB;
35886 MachineBasicBlock *overflowMBB;
35887 MachineBasicBlock *offsetMBB;
35888 MachineBasicBlock *endMBB;
35889
35890 Register OffsetDestReg; // Argument address computed by offsetMBB
35891 Register OverflowDestReg; // Argument address computed by overflowMBB
35892 Register OffsetReg;
35893
35894 if (!UseGPOffset && !UseFPOffset) {
35895 // If we only pull from the overflow region, we don't create a branch.
35896 // We don't need to alter control flow.
35897 OffsetDestReg = Register(); // unused
35898 OverflowDestReg = DestReg;
35899
35900 offsetMBB = nullptr;
35901 overflowMBB = thisMBB;
35902 endMBB = thisMBB;
35903 } else {
35904 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35905 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35906 // If not, pull from overflow_area. (branch to overflowMBB)
35907 //
35908 // thisMBB
35909 // | .
35910 // | .
35911 // offsetMBB overflowMBB
35912 // | .
35913 // | .
35914 // endMBB
35915
35916 // Registers for the PHI in endMBB
35917 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35918 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35919
35920 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35921 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35922 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35923 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35924
35925 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35926
35927 // Insert the new basic blocks
35928 MF->insert(MBBIter, offsetMBB);
35929 MF->insert(MBBIter, overflowMBB);
35930 MF->insert(MBBIter, endMBB);
35931
35932 // Transfer the remainder of MBB and its successor edges to endMBB.
35933 endMBB->splice(endMBB->begin(), thisMBB,
35934 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35935 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35936
35937 // Make offsetMBB and overflowMBB successors of thisMBB
35938 thisMBB->addSuccessor(offsetMBB);
35939 thisMBB->addSuccessor(overflowMBB);
35940
35941 // endMBB is a successor of both offsetMBB and overflowMBB
35942 offsetMBB->addSuccessor(endMBB);
35943 overflowMBB->addSuccessor(endMBB);
35944
35945 // Load the offset value into a register
35946 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35947 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35948 .add(Base)
35949 .add(Scale)
35950 .add(Index)
35951 .addDisp(Disp, UseFPOffset ? 4 : 0)
35952 .add(Segment)
35953 .setMemRefs(LoadOnlyMMO);
35954
35955 // Check if there is enough room left to pull this argument.
35956 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35957 .addReg(OffsetReg)
35958 .addImm(MaxOffset + 8 - ArgSizeA8);
35959
35960 // Branch to "overflowMBB" if offset >= max
35961 // Fall through to "offsetMBB" otherwise
35962 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35963 .addMBB(overflowMBB).addImm(X86::COND_AE);
35964 }
35965
35966 // In offsetMBB, emit code to use the reg_save_area.
35967 if (offsetMBB) {
35968 assert(OffsetReg != 0);
35969
35970 // Read the reg_save_area address.
35971 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35972 BuildMI(
35973 offsetMBB, MIMD,
35974 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35975 RegSaveReg)
35976 .add(Base)
35977 .add(Scale)
35978 .add(Index)
35979 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35980 .add(Segment)
35981 .setMemRefs(LoadOnlyMMO);
35982
35983 if (Subtarget.isTarget64BitLP64()) {
35984 // Zero-extend the offset
35985 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35986 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35987 .addImm(0)
35988 .addReg(OffsetReg)
35989 .addImm(X86::sub_32bit);
35990
35991 // Add the offset to the reg_save_area to get the final address.
35992 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35993 .addReg(OffsetReg64)
35994 .addReg(RegSaveReg);
35995 } else {
35996 // Add the offset to the reg_save_area to get the final address.
35997 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35998 .addReg(OffsetReg)
35999 .addReg(RegSaveReg);
36000 }
36001
36002 // Compute the offset for the next argument
36003 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36004 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36005 .addReg(OffsetReg)
36006 .addImm(UseFPOffset ? 16 : 8);
36007
36008 // Store it back into the va_list.
36009 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36010 .add(Base)
36011 .add(Scale)
36012 .add(Index)
36013 .addDisp(Disp, UseFPOffset ? 4 : 0)
36014 .add(Segment)
36015 .addReg(NextOffsetReg)
36016 .setMemRefs(StoreOnlyMMO);
36017
36018 // Jump to endMBB
36019 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36020 .addMBB(endMBB);
36021 }
36022
36023 //
36024 // Emit code to use overflow area
36025 //
36026
36027 // Load the overflow_area address into a register.
36028 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36029 BuildMI(overflowMBB, MIMD,
36030 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36031 OverflowAddrReg)
36032 .add(Base)
36033 .add(Scale)
36034 .add(Index)
36035 .addDisp(Disp, 8)
36036 .add(Segment)
36037 .setMemRefs(LoadOnlyMMO);
36038
36039 // If we need to align it, do so. Otherwise, just copy the address
36040 // to OverflowDestReg.
36041 if (NeedsAlign) {
36042 // Align the overflow address
36043 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36044
36045 // aligned_addr = (addr + (align-1)) & ~(align-1)
36046 BuildMI(
36047 overflowMBB, MIMD,
36048 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36049 TmpReg)
36050 .addReg(OverflowAddrReg)
36051 .addImm(Alignment.value() - 1);
36052
36053 BuildMI(
36054 overflowMBB, MIMD,
36055 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36056 OverflowDestReg)
36057 .addReg(TmpReg)
36058 .addImm(~(uint64_t)(Alignment.value() - 1));
36059 } else {
36060 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36061 .addReg(OverflowAddrReg);
36062 }
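// Editorial example, not part of the original source: with Alignment = 16 and
// overflow_area = 0x1009, the add/and pair above computes
//   (0x1009 + 15) & ~15 == 0x1018 & ~0xF == 0x1010,
// i.e. the next 16-byte boundary at or above the original address.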
36063
36064 // Compute the next overflow address after this argument.
36065 // (the overflow address should be kept 8-byte aligned)
36066 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36067 BuildMI(
36068 overflowMBB, MIMD,
36069 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36070 NextAddrReg)
36071 .addReg(OverflowDestReg)
36072 .addImm(ArgSizeA8);
36073
36074 // Store the new overflow address.
36075 BuildMI(overflowMBB, MIMD,
36076 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36077 .add(Base)
36078 .add(Scale)
36079 .add(Index)
36080 .addDisp(Disp, 8)
36081 .add(Segment)
36082 .addReg(NextAddrReg)
36083 .setMemRefs(StoreOnlyMMO);
36084
36085 // If we branched, emit the PHI to the front of endMBB.
36086 if (offsetMBB) {
36087 BuildMI(*endMBB, endMBB->begin(), MIMD,
36088 TII->get(X86::PHI), DestReg)
36089 .addReg(OffsetDestReg).addMBB(offsetMBB)
36090 .addReg(OverflowDestReg).addMBB(overflowMBB);
36091 }
36092
36093 // Erase the pseudo instruction
36094 MI.eraseFromParent();
36095
36096 return endMBB;
36097}
36098
36099// The EFLAGS operand of SelectItr might be missing a kill marker
36100// because there were multiple uses of EFLAGS, and ISel didn't know
36101// which to mark. Figure out whether SelectItr should have had a
36102// kill marker, and set it if it should. Returns the correct kill
36103// marker value.
36104 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36105 MachineBasicBlock *BB,
36106 const TargetRegisterInfo* TRI) {
36107 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36108 return false;
36109
36110 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36111 // out. SelectMI should have a kill flag on EFLAGS.
36112 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36113 return true;
36114}
36115
36116// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36117// together with other CMOV pseudo-opcodes into a single basic-block with
36118 // a conditional jump around it.
36119 static bool isCMOVPseudo(MachineInstr &MI) {
36120 switch (MI.getOpcode()) {
36121 case X86::CMOV_FR16:
36122 case X86::CMOV_FR16X:
36123 case X86::CMOV_FR32:
36124 case X86::CMOV_FR32X:
36125 case X86::CMOV_FR64:
36126 case X86::CMOV_FR64X:
36127 case X86::CMOV_GR8:
36128 case X86::CMOV_GR16:
36129 case X86::CMOV_GR32:
36130 case X86::CMOV_RFP32:
36131 case X86::CMOV_RFP64:
36132 case X86::CMOV_RFP80:
36133 case X86::CMOV_VR64:
36134 case X86::CMOV_VR128:
36135 case X86::CMOV_VR128X:
36136 case X86::CMOV_VR256:
36137 case X86::CMOV_VR256X:
36138 case X86::CMOV_VR512:
36139 case X86::CMOV_VK1:
36140 case X86::CMOV_VK2:
36141 case X86::CMOV_VK4:
36142 case X86::CMOV_VK8:
36143 case X86::CMOV_VK16:
36144 case X86::CMOV_VK32:
36145 case X86::CMOV_VK64:
36146 return true;
36147
36148 default:
36149 return false;
36150 }
36151}
36152
36153// Helper function, which inserts PHI functions into SinkMBB:
36154// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36155// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36156// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36157// the last PHI function inserted.
36158 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36159 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36160 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36161 MachineBasicBlock *SinkMBB) {
36162 MachineFunction *MF = TrueMBB->getParent();
36163 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36164 const MIMetadata MIMD(*MIItBegin);
36165
36166 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36167 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36168
36169 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36170
36171 // As we are creating the PHIs, we have to be careful if there is more than
36172 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36173 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36174 // That also means that PHI construction must work forward from earlier to
36175 // later, and that the code must maintain a mapping from earlier PHI's
36176 // destination registers, and the registers that went into the PHI.
36177 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36178 MachineInstrBuilder MIB;
36179
36180 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36181 Register DestReg = MIIt->getOperand(0).getReg();
36182 Register Op1Reg = MIIt->getOperand(1).getReg();
36183 Register Op2Reg = MIIt->getOperand(2).getReg();
36184
36185 // If this CMOV we are generating is the opposite condition from
36186 // the jump we generated, then we have to swap the operands for the
36187 // PHI that is going to be generated.
36188 if (MIIt->getOperand(3).getImm() == OppCC)
36189 std::swap(Op1Reg, Op2Reg);
36190
36191 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36192 Op1Reg = It->second.first;
36193
36194 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36195 Op2Reg = It->second.second;
36196
36197 MIB =
36198 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36199 .addReg(Op1Reg)
36200 .addMBB(FalseMBB)
36201 .addReg(Op2Reg)
36202 .addMBB(TrueMBB);
36203
36204 // Add this PHI to the rewrite table.
36205 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36206 }
36207
36208 return MIB;
36209}
36210
36211 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36212 MachineBasicBlock *
36213 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36214 MachineInstr &SecondCascadedCMOV,
36215 MachineBasicBlock *ThisMBB) const {
36216 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36217 const MIMetadata MIMD(FirstCMOV);
36218
36219 // We lower cascaded CMOVs such as
36220 //
36221 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36222 //
36223 // to two successive branches.
36224 //
36225 // Without this, we would add a PHI between the two jumps, which ends up
36226 // creating a few copies all around. For instance, for
36227 //
36228 // (sitofp (zext (fcmp une)))
36229 //
36230 // we would generate:
36231 //
36232 // ucomiss %xmm1, %xmm0
36233 // movss <1.0f>, %xmm0
36234 // movaps %xmm0, %xmm1
36235 // jne .LBB5_2
36236 // xorps %xmm1, %xmm1
36237 // .LBB5_2:
36238 // jp .LBB5_4
36239 // movaps %xmm1, %xmm0
36240 // .LBB5_4:
36241 // retq
36242 //
36243 // because this custom-inserter would have generated:
36244 //
36245 // A
36246 // | \
36247 // | B
36248 // | /
36249 // C
36250 // | \
36251 // | D
36252 // | /
36253 // E
36254 //
36255 // A: X = ...; Y = ...
36256 // B: empty
36257 // C: Z = PHI [X, A], [Y, B]
36258 // D: empty
36259 // E: PHI [X, C], [Z, D]
36260 //
36261 // If we lower both CMOVs in a single step, we can instead generate:
36262 //
36263 // A
36264 // | \
36265 // | C
36266 // | /|
36267 // |/ |
36268 // | |
36269 // | D
36270 // | /
36271 // E
36272 //
36273 // A: X = ...; Y = ...
36274 // D: empty
36275 // E: PHI [X, A], [X, C], [Y, D]
36276 //
36277 // Which, in our sitofp/fcmp example, gives us something like:
36278 //
36279 // ucomiss %xmm1, %xmm0
36280 // movss <1.0f>, %xmm0
36281 // jne .LBB5_4
36282 // jp .LBB5_4
36283 // xorps %xmm0, %xmm0
36284 // .LBB5_4:
36285 // retq
36286 //
36287
36288 // We lower cascaded CMOV into two successive branches to the same block.
36289 // EFLAGS is used by both, so mark it as live in the second.
36290 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36291 MachineFunction *F = ThisMBB->getParent();
36292 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36293 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36294 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36295
36296 MachineFunction::iterator It = ++ThisMBB->getIterator();
36297 F->insert(It, FirstInsertedMBB);
36298 F->insert(It, SecondInsertedMBB);
36299 F->insert(It, SinkMBB);
36300
36301 // For a cascaded CMOV, we lower it to two successive branches to
36302 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36303 // the FirstInsertedMBB.
36304 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36305
36306 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36307 // live into the sink and copy blocks.
36308 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36309 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36310 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36311 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36312 SinkMBB->addLiveIn(X86::EFLAGS);
36313 }
36314
36315 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36316 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36317 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36318 ThisMBB->end());
36319 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36320
36321 // Fallthrough block for ThisMBB.
36322 ThisMBB->addSuccessor(FirstInsertedMBB);
36323 // The true block target of the first branch is always SinkMBB.
36324 ThisMBB->addSuccessor(SinkMBB);
36325 // Fallthrough block for FirstInsertedMBB.
36326 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36327 // The true block for the branch of FirstInsertedMBB.
36328 FirstInsertedMBB->addSuccessor(SinkMBB);
36329 // This is fallthrough.
36330 SecondInsertedMBB->addSuccessor(SinkMBB);
36331
36332 // Create the conditional branch instructions.
36333 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36334 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36335
36336 X86::CondCode SecondCC =
36337 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36338 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36339 .addMBB(SinkMBB)
36340 .addImm(SecondCC);
36341
36342 // SinkMBB:
36343 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36344 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36345 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36346 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36347 MachineInstrBuilder MIB =
36348 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36349 .addReg(Op1Reg)
36350 .addMBB(SecondInsertedMBB)
36351 .addReg(Op2Reg)
36352 .addMBB(ThisMBB);
36353
36354 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36355 // (the True operand of the SELECT_CC/CMOV nodes).
36356 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36357
36358 // Now remove the CMOVs.
36359 FirstCMOV.eraseFromParent();
36360 SecondCascadedCMOV.eraseFromParent();
36361
36362 return SinkMBB;
36363}
36364
36365 MachineBasicBlock *
36366 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36367 MachineBasicBlock *ThisMBB) const {
36368 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36369 const MIMetadata MIMD(MI);
36370
36371 // To "insert" a SELECT_CC instruction, we actually have to insert the
36372 // diamond control-flow pattern. The incoming instruction knows the
36373 // destination vreg to set, the condition code register to branch on, the
36374 // true/false values to select between and a branch opcode to use.
36375
36376 // ThisMBB:
36377 // ...
36378 // TrueVal = ...
36379 // cmpTY ccX, r1, r2
36380 // bCC copy1MBB
36381 // fallthrough --> FalseMBB
36382
36383 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36384 // as described above, by inserting a BB, and then making a PHI at the join
36385 // point to select the true and false operands of the CMOV in the PHI.
36386 //
36387 // The code also handles two different cases of multiple CMOV opcodes
36388 // in a row.
36389 //
36390 // Case 1:
36391 // In this case, there are multiple CMOVs in a row, all of which are based on
36392 // the same condition setting (or the exact opposite condition setting).
36393 // In this case we can lower all the CMOVs using a single inserted BB, and
36394 // then make a number of PHIs at the join point to model the CMOVs. The only
36395 // trickiness here is that in a case like:
36396 //
36397 // t2 = CMOV cond1 t1, f1
36398 // t3 = CMOV cond1 t2, f2
36399 //
36400 // when rewriting this into PHIs, we have to perform some renaming on the
36401 // temps since you cannot have a PHI operand refer to a PHI result earlier
36402 // in the same block. The "simple" but wrong lowering would be:
36403 //
36404 // t2 = PHI t1(BB1), f1(BB2)
36405 // t3 = PHI t2(BB1), f2(BB2)
36406 //
36407 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36408 // renaming is to note that on the path through BB1, t2 is really just a
36409 // copy of t1, and do that renaming, properly generating:
36410 //
36411 // t2 = PHI t1(BB1), f1(BB2)
36412 // t3 = PHI t1(BB1), f2(BB2)
36413 //
36414 // Case 2:
36415 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36416 // function - EmitLoweredCascadedSelect.
36417
36418 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36419 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36420 MachineInstr *LastCMOV = &MI;
36421 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36422
36423 // Check for case 1, where there are multiple CMOVs with the same condition
36424 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36425 // number of jumps the most.
36426
36427 if (isCMOVPseudo(MI)) {
36428 // See if we have a string of CMOVS with the same condition. Skip over
36429 // intervening debug insts.
36430 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36431 (NextMIIt->getOperand(3).getImm() == CC ||
36432 NextMIIt->getOperand(3).getImm() == OppCC)) {
36433 LastCMOV = &*NextMIIt;
36434 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36435 }
36436 }
36437
36438 // This checks for case 2, but only do this if we didn't already find
36439 // case 1, as indicated by LastCMOV == MI.
36440 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36441 NextMIIt->getOpcode() == MI.getOpcode() &&
36442 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36443 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36444 NextMIIt->getOperand(1).isKill()) {
36445 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36446 }
36447
36448 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36449 MachineFunction *F = ThisMBB->getParent();
36450 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36451 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36452
36453 MachineFunction::iterator It = ++ThisMBB->getIterator();
36454 F->insert(It, FalseMBB);
36455 F->insert(It, SinkMBB);
36456
36457 // Set the call frame size on entry to the new basic blocks.
36458 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36459 FalseMBB->setCallFrameSize(CallFrameSize);
36460 SinkMBB->setCallFrameSize(CallFrameSize);
36461
36462 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36463 // live into the sink and copy blocks.
36464 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36465 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36466 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36467 FalseMBB->addLiveIn(X86::EFLAGS);
36468 SinkMBB->addLiveIn(X86::EFLAGS);
36469 }
36470
36471 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36472 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36473 MachineBasicBlock::iterator(LastCMOV));
36474 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36475 if (MI.isDebugInstr())
36476 SinkMBB->push_back(MI.removeFromParent());
36477
36478 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36479 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36480 std::next(MachineBasicBlock::iterator(LastCMOV)),
36481 ThisMBB->end());
36482 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36483
36484 // Fallthrough block for ThisMBB.
36485 ThisMBB->addSuccessor(FalseMBB);
36486 // The true block target of the first (or only) branch is always SinkMBB.
36487 ThisMBB->addSuccessor(SinkMBB);
36488 // Fallthrough block for FalseMBB.
36489 FalseMBB->addSuccessor(SinkMBB);
36490
36491 // Create the conditional branch instruction.
36492 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36493
36494 // SinkMBB:
36495 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36496 // ...
36497 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36498 MachineBasicBlock::iterator MIItEnd =
36499 std::next(MachineBasicBlock::iterator(LastCMOV));
36500 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36501
36502 // Now remove the CMOV(s).
36503 ThisMBB->erase(MIItBegin, MIItEnd);
36504
36505 return SinkMBB;
36506}
36507
36508static unsigned getSUBriOpcode(bool IsLP64) {
36509 if (IsLP64)
36510 return X86::SUB64ri32;
36511 else
36512 return X86::SUB32ri;
36513}
36514
36515 MachineBasicBlock *
36516 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36517 MachineBasicBlock *MBB) const {
36518 MachineFunction *MF = MBB->getParent();
36519 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36520 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36521 const MIMetadata MIMD(MI);
36522 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36523
36524 const unsigned ProbeSize = getStackProbeSize(*MF);
36525
36526 MachineRegisterInfo &MRI = MF->getRegInfo();
36527 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36528 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36529 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36530
36531 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36532 MF->insert(MBBIter, testMBB);
36533 MF->insert(MBBIter, blockMBB);
36534 MF->insert(MBBIter, tailMBB);
36535
36536 Register sizeVReg = MI.getOperand(1).getReg();
36537
36538 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36539
36540 Register TmpStackPtr = MRI.createVirtualRegister(
36541 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36542 Register FinalStackPtr = MRI.createVirtualRegister(
36543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36544
36545 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36546 .addReg(physSPReg);
36547 {
36548 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36549 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36550 .addReg(TmpStackPtr)
36551 .addReg(sizeVReg);
36552 }
36553
36554 // test rsp size
36555
36556 BuildMI(testMBB, MIMD,
36557 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36558 .addReg(FinalStackPtr)
36559 .addReg(physSPReg);
36560
36561 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36562 .addMBB(tailMBB)
36563 .addImm(X86::COND_GE);
36564 testMBB->addSuccessor(blockMBB);
36565 testMBB->addSuccessor(tailMBB);
36566
36567 // Touch the block then extend it. This is done in the opposite order from a
36568 // static probe, where we allocate then touch, to avoid the need to probe the
36569 // tail of the static alloca. Possible scenarios are:
36570 //
36571 // + ---- <- ------------ <- ------------- <- ------------ +
36572 // | |
36573 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36574 // | |
36575 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36576 //
36577 // The property we want to enforce is to never have more than [page alloc] between two probes.
36578
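// Editorial sketch, not part of the original source: with ProbeSize = 4096
// (an illustrative value), the blocks below behave roughly like:
//
//   FinalSP = RSP - AllocaSize;          // computed in MBB above
//   while (RSP > FinalSP) {              // testMBB: CMP + JCC to tailMBB
//     *(volatile intptr_t *)RSP ^= 0;    // blockMBB: touch the current page,
//     RSP -= ProbeSize;                  //           then drop by ProbeSize
//   }
//   Result = FinalSP;                    // tailMBB: the value of the alloca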
36579 const unsigned XORMIOpc =
36580 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36581 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36582 .addImm(0);
36583
36584 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36585 physSPReg)
36586 .addReg(physSPReg)
36587 .addImm(ProbeSize);
36588
36589 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36590 blockMBB->addSuccessor(testMBB);
36591
36592 // Replace original instruction by the expected stack ptr
36593 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36594 MI.getOperand(0).getReg())
36595 .addReg(FinalStackPtr);
36596
36597 tailMBB->splice(tailMBB->end(), MBB,
36598 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36599 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36600 MBB->addSuccessor(testMBB);
36601
36602 // Delete the original pseudo instruction.
36603 MI.eraseFromParent();
36604
36605 // And we're done.
36606 return tailMBB;
36607}
36608
36609 MachineBasicBlock *
36610 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36611 MachineBasicBlock *BB) const {
36612 MachineFunction *MF = BB->getParent();
36613 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36614 const MIMetadata MIMD(MI);
36615 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36616
36617 assert(MF->shouldSplitStack());
36618
36619 const bool Is64Bit = Subtarget.is64Bit();
36620 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36621
36622 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36623 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36624
36625 // BB:
36626 // ... [Till the alloca]
36627 // If stacklet is not large enough, jump to mallocMBB
36628 //
36629 // bumpMBB:
36630 // Allocate by subtracting from RSP
36631 // Jump to continueMBB
36632 //
36633 // mallocMBB:
36634 // Allocate by call to runtime
36635 //
36636 // continueMBB:
36637 // ...
36638 // [rest of original BB]
36639 //
36640
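// Editorial note, not part of the original source: for split stacks the
// stacklet limit lives in thread-local storage, so the CMP emitted below
// compares the prospective new SP against %fs:0x70 (LP64), %fs:0x40 (x32) or
// %gs:0x30 (ia32), matching TlsReg/TlsOffset above, and branches to mallocMBB
// (which calls __morestack_allocate_stack_space) when the limit is exceeded.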
36641 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36642 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36643 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36644
36645 MachineRegisterInfo &MRI = MF->getRegInfo();
36646 const TargetRegisterClass *AddrRegClass =
36647 getRegClassFor(getPointerTy(MF->getDataLayout()));
36648
36649 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36650 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36651 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36652 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36653 sizeVReg = MI.getOperand(1).getReg(),
36654 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36655
36656 MachineFunction::iterator MBBIter = ++BB->getIterator();
36657
36658 MF->insert(MBBIter, bumpMBB);
36659 MF->insert(MBBIter, mallocMBB);
36660 MF->insert(MBBIter, continueMBB);
36661
36662 continueMBB->splice(continueMBB->begin(), BB,
36663 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36664 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36665
36666 // Add code to the main basic block to check if the stack limit has been hit,
36667 // and if so, jump to mallocMBB otherwise to bumpMBB.
36668 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36669 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36670 .addReg(tmpSPVReg).addReg(sizeVReg);
36671 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36672 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36673 .addReg(SPLimitVReg);
36674 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36675
36676 // bumpMBB simply decreases the stack pointer, since we know the current
36677 // stacklet has enough space.
36678 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36679 .addReg(SPLimitVReg);
36680 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36681 .addReg(SPLimitVReg);
36682 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36683
36684 // Calls into a routine in libgcc to allocate more space from the heap.
36685 const uint32_t *RegMask =
36686 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36687 if (IsLP64) {
36688 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36689 .addReg(sizeVReg);
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36691 .addExternalSymbol("__morestack_allocate_stack_space")
36692 .addRegMask(RegMask)
36693 .addReg(X86::RDI, RegState::Implicit)
36694 .addReg(X86::RAX, RegState::ImplicitDefine);
36695 } else if (Is64Bit) {
36696 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36697 .addReg(sizeVReg);
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36699 .addExternalSymbol("__morestack_allocate_stack_space")
36700 .addRegMask(RegMask)
36701 .addReg(X86::EDI, RegState::Implicit)
36702 .addReg(X86::EAX, RegState::ImplicitDefine);
36703 } else {
36704 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36705 .addImm(12);
36706 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36707 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36708 .addExternalSymbol("__morestack_allocate_stack_space")
36709 .addRegMask(RegMask)
36710 .addReg(X86::EAX, RegState::ImplicitDefine);
36711 }
36712
36713 if (!Is64Bit)
36714 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36715 .addImm(16);
36716
36717 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36718 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36719 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36720
36721 // Set up the CFG correctly.
36722 BB->addSuccessor(bumpMBB);
36723 BB->addSuccessor(mallocMBB);
36724 mallocMBB->addSuccessor(continueMBB);
36725 bumpMBB->addSuccessor(continueMBB);
36726
36727 // Take care of the PHI nodes.
36728 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36729 MI.getOperand(0).getReg())
36730 .addReg(mallocPtrVReg)
36731 .addMBB(mallocMBB)
36732 .addReg(bumpSPPtrVReg)
36733 .addMBB(bumpMBB);
36734
36735 // Delete the original pseudo instruction.
36736 MI.eraseFromParent();
36737
36738 // And we're done.
36739 return continueMBB;
36740}
36741
36742 MachineBasicBlock *
36743 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36744 MachineBasicBlock *BB) const {
36745 MachineFunction *MF = BB->getParent();
36746 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36747 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36748 const MIMetadata MIMD(MI);
36749
36752 "SEH does not use catchret!");
36753
36754 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36755 if (!Subtarget.is32Bit())
36756 return BB;
36757
36758 // C++ EH creates a new target block to hold the restore code, and wires up
36759 // the new block to the return destination with a normal JMP_4.
36760 MachineBasicBlock *RestoreMBB =
36761 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36762 assert(BB->succ_size() == 1);
36763 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36764 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36765 BB->addSuccessor(RestoreMBB);
36766 MI.getOperand(0).setMBB(RestoreMBB);
36767
36768 // Marking this as an EH pad but not a funclet entry block causes PEI to
36769 // restore stack pointers in the block.
36770 RestoreMBB->setIsEHPad(true);
36771
36772 auto RestoreMBBI = RestoreMBB->begin();
36773 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36774 return BB;
36775}
36776
36777 MachineBasicBlock *
36778 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36779 MachineBasicBlock *BB) const {
36780 // This is pretty easy. We're taking the value that we received from
36781 // our load from the relocation, sticking it in either RDI (x86-64)
36782 // or EAX and doing an indirect call. The return value will then
36783 // be in the normal return register.
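// Editorial note, not part of the original source: on x86-64 the emitted
// sequence is roughly
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)            ## result returned in %rax
// where the TLVP-style reference comes from the target flags already present
// on operand 3 of the pseudo.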
36784 MachineFunction *F = BB->getParent();
36785 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36786 const MIMetadata MIMD(MI);
36787
36788 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36789 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36790
36791 // Get a register mask for the lowered call.
36792 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36793 // proper register mask.
36794 const uint32_t *RegMask =
36795 Subtarget.is64Bit() ?
36796 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36797 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36798 if (Subtarget.is64Bit()) {
36799 MachineInstrBuilder MIB =
36800 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36801 .addReg(X86::RIP)
36802 .addImm(0)
36803 .addReg(0)
36804 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36805 MI.getOperand(3).getTargetFlags())
36806 .addReg(0);
36807 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36808 addDirectMem(MIB, X86::RDI);
36809 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36810 } else if (!isPositionIndependent()) {
36811 MachineInstrBuilder MIB =
36812 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36813 .addReg(0)
36814 .addImm(0)
36815 .addReg(0)
36816 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36817 MI.getOperand(3).getTargetFlags())
36818 .addReg(0);
36819 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36820 addDirectMem(MIB, X86::EAX);
36821 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36822 } else {
36823 MachineInstrBuilder MIB =
36824 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36825 .addReg(TII->getGlobalBaseReg(F))
36826 .addImm(0)
36827 .addReg(0)
36828 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36829 MI.getOperand(3).getTargetFlags())
36830 .addReg(0);
36831 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36832 addDirectMem(MIB, X86::EAX);
36833 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36834 }
36835
36836 MI.eraseFromParent(); // The pseudo instruction is gone now.
36837 return BB;
36838}
36839
36840static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36841 switch (RPOpc) {
36842 case X86::INDIRECT_THUNK_CALL32:
36843 return X86::CALLpcrel32;
36844 case X86::INDIRECT_THUNK_CALL64:
36845 return X86::CALL64pcrel32;
36846 case X86::INDIRECT_THUNK_TCRETURN32:
36847 return X86::TCRETURNdi;
36848 case X86::INDIRECT_THUNK_TCRETURN64:
36849 return X86::TCRETURNdi64;
36850 }
36851 llvm_unreachable("not indirect thunk opcode");
36852}
36853
36854static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36855 Register Reg) {
36856 if (Subtarget.useRetpolineExternalThunk()) {
36857 // When using an external thunk for retpolines, we pick names that match the
36858 // names GCC happens to use as well. This helps simplify the implementation
36859 // of the thunks for kernels where they have no easy ability to create
36860 // aliases and are doing non-trivial configuration of the thunk's body. For
36861 // example, the Linux kernel will do boot-time hot patching of the thunk
36862 // bodies and cannot easily export aliases of these to loaded modules.
36863 //
36864 // Note that at any point in the future, we may need to change the semantics
36865 // of how we implement retpolines and at that time will likely change the
36866 // name of the called thunk. Essentially, there is no hard guarantee that
36867 // LLVM will generate calls to specific thunks, we merely make a best-effort
36868 // attempt to help out kernels and other systems where duplicating the
36869 // thunks is costly.
36870 switch (Reg.id()) {
36871 case X86::EAX:
36872 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36873 return "__x86_indirect_thunk_eax";
36874 case X86::ECX:
36875 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36876 return "__x86_indirect_thunk_ecx";
36877 case X86::EDX:
36878 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36879 return "__x86_indirect_thunk_edx";
36880 case X86::EDI:
36881 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36882 return "__x86_indirect_thunk_edi";
36883 case X86::R11:
36884 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36885 return "__x86_indirect_thunk_r11";
36886 }
36887 llvm_unreachable("unexpected reg for external indirect thunk");
36888 }
36889
36890 if (Subtarget.useRetpolineIndirectCalls() ||
36891 Subtarget.useRetpolineIndirectBranches()) {
36892 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36893 switch (Reg.id()) {
36894 case X86::EAX:
36895 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36896 return "__llvm_retpoline_eax";
36897 case X86::ECX:
36898 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36899 return "__llvm_retpoline_ecx";
36900 case X86::EDX:
36901 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36902 return "__llvm_retpoline_edx";
36903 case X86::EDI:
36904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36905 return "__llvm_retpoline_edi";
36906 case X86::R11:
36907 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36908 return "__llvm_retpoline_r11";
36909 }
36910 llvm_unreachable("unexpected reg for retpoline");
36911 }
36912
36913 if (Subtarget.useLVIControlFlowIntegrity()) {
36914 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36915 return "__llvm_lvi_thunk_r11";
36916 }
36917 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36918}
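// Editorial note, not part of the original source: the thunk bodies named
// above are emitted separately (or supplied externally, e.g. by the kernel).
// A typical retpoline thunk for r11 looks like:
//
//   __llvm_retpoline_r11:
//     callq .Lcall_target
//   .Lcapture_spec:            # speculation is trapped in this loop
//     pause
//     lfence
//     jmp .Lcapture_spec
//   .Lcall_target:
//     movq %r11, (%rsp)        # overwrite the return address with the callee
//     retq                     # "return" into *%r11 without an indirect jmp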
36919
36920 MachineBasicBlock *
36921 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36922 MachineBasicBlock *BB) const {
36923 // Copy the virtual register into the R11 physical register and
36924 // call the retpoline thunk.
36925 const MIMetadata MIMD(MI);
36926 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36927 Register CalleeVReg = MI.getOperand(0).getReg();
36928 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36929
36930 // Find an available scratch register to hold the callee. On 64-bit, we can
36931 // just use R11, but we scan for uses anyway to ensure we don't generate
36932 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36933 // already a register use operand to the call to hold the callee. If none
36934 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36935 // register and ESI is the base pointer to realigned stack frames with VLAs.
36936 SmallVector<Register, 3> AvailableRegs;
36937 if (Subtarget.is64Bit())
36938 AvailableRegs.push_back(X86::R11);
36939 else
36940 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36941
36942 // Zero out any registers that are already used.
36943 for (const auto &MO : MI.operands()) {
36944 if (MO.isReg() && MO.isUse())
36945 llvm::replace(AvailableRegs, MO.getReg(), Register());
36946 }
36947
36948 // Choose the first remaining non-zero available register.
36949 Register AvailableReg;
36950 for (Register MaybeReg : AvailableRegs) {
36951 if (MaybeReg) {
36952 AvailableReg = MaybeReg;
36953 break;
36954 }
36955 }
36956 if (!AvailableReg)
36957 report_fatal_error("calling convention incompatible with retpoline, no "
36958 "available registers");
36959
36960 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36961
36962 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36963 .addReg(CalleeVReg);
36964 MI.getOperand(0).ChangeToES(Symbol);
36965 MI.setDesc(TII->get(Opc));
36966 MachineInstrBuilder(*BB->getParent(), &MI)
36967 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36968 return BB;
36969}
36970
36971/// SetJmp implies future control flow change upon calling the corresponding
36972/// LongJmp.
36973/// Instead of using the 'return' instruction, the long jump fixes the stack and
36974/// performs an indirect branch. To do so it uses the registers that were stored
36975/// in the jump buffer (when calling SetJmp).
36976/// In case the shadow stack is enabled we need to fix it as well, because some
36977/// return addresses will be skipped.
36978/// The function will save the SSP for future fixing in the function
36979/// emitLongJmpShadowStackFix.
36980/// \sa emitLongJmpShadowStackFix
36981/// \param [in] MI The temporary Machine Instruction for the builtin.
36982/// \param [in] MBB The Machine Basic Block that will be modified.
36983void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36984 MachineBasicBlock *MBB) const {
36985 const MIMetadata MIMD(MI);
36986 MachineFunction *MF = MBB->getParent();
36987 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36988 MachineRegisterInfo &MRI = MF->getRegInfo();
36989 MachineInstrBuilder MIB;
36990
36991 // Memory Reference.
36992 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36993
36994 // Initialize a register with zero.
36995 MVT PVT = getPointerTy(MF->getDataLayout());
36996 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36997 Register ZReg = MRI.createVirtualRegister(PtrRC);
36998 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36999 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37000 .addDef(ZReg)
37001 .addReg(ZReg, RegState::Undef)
37002 .addReg(ZReg, RegState::Undef);
37003
37004 // Read the current SSP Register value to the zeroed register.
37005 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37006 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37007 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37008
37009 // Write the SSP register value to offset 3 in input memory buffer.
37010 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37011 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37012 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37013 const unsigned MemOpndSlot = 1;
37014 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37015 if (i == X86::AddrDisp)
37016 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37017 else
37018 MIB.add(MI.getOperand(MemOpndSlot + i));
37019 }
37020 MIB.addReg(SSPCopyReg);
37021 MIB.setMemRefs(MMOs);
37022}
37023
37024 MachineBasicBlock *
37025 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37026 MachineBasicBlock *MBB) const {
37027 const MIMetadata MIMD(MI);
37028 MachineFunction *MF = MBB->getParent();
37029 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37030 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37031 MachineRegisterInfo &MRI = MF->getRegInfo();
37032
37033 const BasicBlock *BB = MBB->getBasicBlock();
37034 MachineFunction::iterator I = ++MBB->getIterator();
37035
37036 // Memory Reference
37037 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37038
37039 unsigned MemOpndSlot = 0;
37040
37041 unsigned CurOp = 0;
37042
37043 Register DstReg = MI.getOperand(CurOp++).getReg();
37044 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37045 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37046 (void)TRI;
37047 Register mainDstReg = MRI.createVirtualRegister(RC);
37048 Register restoreDstReg = MRI.createVirtualRegister(RC);
37049
37050 MemOpndSlot = CurOp;
37051
37052 MVT PVT = getPointerTy(MF->getDataLayout());
37053 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37054 "Invalid Pointer Size!");
37055
37056 // For v = setjmp(buf), we generate
37057 //
37058 // thisMBB:
37059 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37060 // SjLjSetup restoreMBB
37061 //
37062 // mainMBB:
37063 // v_main = 0
37064 //
37065 // sinkMBB:
37066 // v = phi(main, restore)
37067 //
37068 // restoreMBB:
37069 // if base pointer being used, load it from frame
37070 // v_restore = 1
37071
37072 MachineBasicBlock *thisMBB = MBB;
37073 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37074 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37075 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37076 MF->insert(I, mainMBB);
37077 MF->insert(I, sinkMBB);
37078 MF->push_back(restoreMBB);
37079 restoreMBB->setMachineBlockAddressTaken();
37080
37081 MachineInstrBuilder MIB;
37082
37083 // Transfer the remainder of BB and its successor edges to sinkMBB.
37084 sinkMBB->splice(sinkMBB->begin(), MBB,
37085 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37086 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37087
37088 // thisMBB:
37089 unsigned PtrStoreOpc = 0;
37090 Register LabelReg;
37091 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37092 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37093 !isPositionIndependent();
37094
37095 // Prepare IP either in reg or imm.
37096 if (!UseImmLabel) {
37097 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37098 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37099 LabelReg = MRI.createVirtualRegister(PtrRC);
37100 if (Subtarget.is64Bit()) {
37101 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37102 .addReg(X86::RIP)
37103 .addImm(0)
37104 .addReg(0)
37105 .addMBB(restoreMBB)
37106 .addReg(0);
37107 } else {
37108 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37109 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37110 .addReg(XII->getGlobalBaseReg(MF))
37111 .addImm(0)
37112 .addReg(0)
37113 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37114 .addReg(0);
37115 }
37116 } else
37117 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37118 // Store IP
37119 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37120 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37121 if (i == X86::AddrDisp)
37122 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37123 else
37124 MIB.add(MI.getOperand(MemOpndSlot + i));
37125 }
37126 if (!UseImmLabel)
37127 MIB.addReg(LabelReg);
37128 else
37129 MIB.addMBB(restoreMBB);
37130 MIB.setMemRefs(MMOs);
37131
37132 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37133 emitSetJmpShadowStackFix(MI, thisMBB);
37134 }
37135
37136 // Setup
37137 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37138 .addMBB(restoreMBB);
37139
37140 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37141 MIB.addRegMask(RegInfo->getNoPreservedMask());
37142 thisMBB->addSuccessor(mainMBB);
37143 thisMBB->addSuccessor(restoreMBB);
37144
37145 // mainMBB:
37146 // EAX = 0
37147 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37148 mainMBB->addSuccessor(sinkMBB);
37149
37150 // sinkMBB:
37151 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37152 .addReg(mainDstReg)
37153 .addMBB(mainMBB)
37154 .addReg(restoreDstReg)
37155 .addMBB(restoreMBB);
37156
37157 // restoreMBB:
37158 if (RegInfo->hasBasePointer(*MF)) {
37159 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37160 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37161 X86FI->setRestoreBasePointer(MF);
37162 Register FramePtr = RegInfo->getFrameRegister(*MF);
37163 Register BasePtr = RegInfo->getBaseRegister();
37164 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37165 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37166 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37167 .setMIFlag(MachineInstr::FrameSetup);
37168 }
37169 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37170 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37171 restoreMBB->addSuccessor(sinkMBB);
37172
37173 MI.eraseFromParent();
37174 return sinkMBB;
37175}
37176
37177/// Fix the shadow stack using the previously saved SSP pointer.
37178/// \sa emitSetJmpShadowStackFix
37179/// \param [in] MI The temporary Machine Instruction for the builtin.
37180/// \param [in] MBB The Machine Basic Block that will be modified.
37181/// \return The sink MBB that will perform the future indirect branch.
37182 MachineBasicBlock *
37183 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37184 MachineBasicBlock *MBB) const {
37185 const MIMetadata MIMD(MI);
37186 MachineFunction *MF = MBB->getParent();
37187 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37188 MachineRegisterInfo &MRI = MF->getRegInfo();
37189
37190 // Memory Reference
37191 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37192
37193 MVT PVT = getPointerTy(MF->getDataLayout());
37194 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37195
37196 // checkSspMBB:
37197 // xor vreg1, vreg1
37198 // rdssp vreg1
37199 // test vreg1, vreg1
37200 // je sinkMBB # Jump if Shadow Stack is not supported
37201 // fallMBB:
37202 // mov buf+24/12(%rip), vreg2
37203 // sub vreg1, vreg2
37204 // jbe sinkMBB # No need to fix the Shadow Stack
37205 // fixShadowMBB:
37206 // shr 3/2, vreg2
37207 // incssp vreg2 # fix the SSP according to the lower 8 bits
37208 // shr 8, vreg2
37209 // je sinkMBB
37210 // fixShadowLoopPrepareMBB:
37211 // shl vreg2
37212 // mov 128, vreg3
37213 // fixShadowLoopMBB:
37214 // incssp vreg3
37215 // dec vreg2
37216 // jne fixShadowLoopMBB # Iterate until you finish fixing
37217 // # the Shadow Stack
37218 // sinkMBB:
37219
37220 MachineFunction::iterator I = ++MBB->getIterator();
37221 const BasicBlock *BB = MBB->getBasicBlock();
37222
37223 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37224 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37225 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37226 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37227 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37229 MF->insert(I, checkSspMBB);
37230 MF->insert(I, fallMBB);
37231 MF->insert(I, fixShadowMBB);
37232 MF->insert(I, fixShadowLoopPrepareMBB);
37233 MF->insert(I, fixShadowLoopMBB);
37234 MF->insert(I, sinkMBB);
37235
37236 // Transfer the remainder of BB and its successor edges to sinkMBB.
37237 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37238 MBB->end());
37239 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37240
37241 MBB->addSuccessor(checkSspMBB);
37242
37243 // Initialize a register with zero.
37244 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37245 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37246
37247 if (PVT == MVT::i64) {
37248 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37249 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37250 .addImm(0)
37251 .addReg(ZReg)
37252 .addImm(X86::sub_32bit);
37253 ZReg = TmpZReg;
37254 }
37255
37256 // Read the current SSP Register value to the zeroed register.
37257 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37258 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37259 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37260
37261 // Check whether the result of the SSP register is zero and jump directly
37262 // to the sink.
37263 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37264 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37265 .addReg(SSPCopyReg)
37266 .addReg(SSPCopyReg);
37267 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37268 .addMBB(sinkMBB)
37269 .addImm(X86::COND_E);
37270 checkSspMBB->addSuccessor(sinkMBB);
37271 checkSspMBB->addSuccessor(fallMBB);
37272
37273 // Reload the previously saved SSP register value.
37274 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37275 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37276 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37277 MachineInstrBuilder MIB =
37278 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37279 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37280 const MachineOperand &MO = MI.getOperand(i);
37281 if (i == X86::AddrDisp)
37282 MIB.addDisp(MO, SPPOffset);
37283 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37284 // preserve kill flags.
37285 MIB.addReg(MO.getReg());
37286 else
37287 MIB.add(MO);
37288 }
37289 MIB.setMemRefs(MMOs);
37290
37291 // Subtract the current SSP from the previous SSP.
37292 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37293 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37294 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37295 .addReg(PrevSSPReg)
37296 .addReg(SSPCopyReg);
37297
37298 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37299 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37300 .addMBB(sinkMBB)
37301 .addImm(X86::COND_BE);
37302 fallMBB->addSuccessor(sinkMBB);
37303 fallMBB->addSuccessor(fixShadowMBB);
37304
37305 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37306 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37307 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37308 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37309 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37310 .addReg(SspSubReg)
37311 .addImm(Offset);
37312
37313 // Increase the SSP, looking only at the lower 8 bits of the delta.
37314 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37315 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37316
37317 // Reset the lower 8 bits.
37318 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37319 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37320 .addReg(SspFirstShrReg)
37321 .addImm(8);
37322
37323 // Jump if the result of the shift is zero.
37324 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37325 .addMBB(sinkMBB)
37326 .addImm(X86::COND_E);
37327 fixShadowMBB->addSuccessor(sinkMBB);
37328 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37329
37330 // Do a single shift left.
37331 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37332 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37333 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37334 .addReg(SspSecondShrReg)
37335 .addImm(1);
37336
37337 // Save the value 128 to a register (will be used next with incssp).
37338 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37339 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37340 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37341 .addImm(128);
37342 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37343
37344 // Since incssp only looks at the lower 8 bits, we might need to do several
37345 // iterations of incssp until we finish fixing the shadow stack.
37346 Register DecReg = MRI.createVirtualRegister(PtrRC);
37347 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37348 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37349 .addReg(SspAfterShlReg)
37350 .addMBB(fixShadowLoopPrepareMBB)
37351 .addReg(DecReg)
37352 .addMBB(fixShadowLoopMBB);
37353
37354 // Every iteration we increase the SSP by 128.
37355 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37356
37357 // Every iteration we decrement the counter by 1.
37358 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37359 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37360
37361 // Jump if the counter is not zero yet.
37362 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37363 .addMBB(fixShadowLoopMBB)
37364 .addImm(X86::COND_NE);
37365 fixShadowLoopMBB->addSuccessor(sinkMBB);
37366 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37367
37368 return sinkMBB;
37369}
37370
37371 MachineBasicBlock *
37372 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37373 MachineBasicBlock *MBB) const {
37374 const MIMetadata MIMD(MI);
37375 MachineFunction *MF = MBB->getParent();
37376 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37377 MachineRegisterInfo &MRI = MF->getRegInfo();
37378
37379 // Memory Reference
37380 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37381
37382 MVT PVT = getPointerTy(MF->getDataLayout());
37383 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37384 "Invalid Pointer Size!");
37385
37386 const TargetRegisterClass *RC =
37387 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37388 Register Tmp = MRI.createVirtualRegister(RC);
37389 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37390 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37391 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37392 Register SP = RegInfo->getStackRegister();
37393
37394 MachineInstrBuilder MIB;
37395
37396 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37397 const int64_t SPOffset = 2 * PVT.getStoreSize();
37398
37399 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37400 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37401
37402 MachineBasicBlock *thisMBB = MBB;
37403
37404 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37405 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37406 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37407 }
37408
37409 // Reload FP
37410 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37411 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37412 const MachineOperand &MO = MI.getOperand(i);
37413 if (MO.isReg()) // Don't add the whole operand, we don't want to
37414 // preserve kill flags.
37415 MIB.addReg(MO.getReg());
37416 else
37417 MIB.add(MO);
37418 }
37419 MIB.setMemRefs(MMOs);
37421
37422 // Reload IP
37423 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37424 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37425 const MachineOperand &MO = MI.getOperand(i);
37426 if (i == X86::AddrDisp)
37427 MIB.addDisp(MO, LabelOffset);
37428 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37429 // preserve kill flags.
37430 MIB.addReg(MO.getReg());
37431 else
37432 MIB.add(MO);
37433 }
37434 MIB.setMemRefs(MMOs);
37435
37436 // Reload SP
37437 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37438 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37439 if (i == X86::AddrDisp)
37440 MIB.addDisp(MI.getOperand(i), SPOffset);
37441 else
37442 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37443 // the last instruction of the expansion.
37444 }
37445 MIB.setMemRefs(MMOs);
37447
37448 // Jump
37449 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37450
37451 MI.eraseFromParent();
37452 return thisMBB;
37453}
37454
37455void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37456                                               MachineBasicBlock *MBB,
37457                                               MachineBasicBlock *DispatchBB,
37458 int FI) const {
37459 const MIMetadata MIMD(MI);
37460 MachineFunction *MF = MBB->getParent();
37461 MachineRegisterInfo *MRI = &MF->getRegInfo();
37462 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37463
37464 MVT PVT = getPointerTy(MF->getDataLayout());
37465 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37466
37467 unsigned Op = 0;
37468 Register VR;
37469
37470   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37471                      !isPositionIndependent();
37472
37473 if (UseImmLabel) {
37474 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37475 } else {
37476 const TargetRegisterClass *TRC =
37477 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37478 VR = MRI->createVirtualRegister(TRC);
37479 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37480
37481 if (Subtarget.is64Bit())
37482 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37483 .addReg(X86::RIP)
37484 .addImm(1)
37485 .addReg(0)
37486 .addMBB(DispatchBB)
37487 .addReg(0);
37488 else
37489 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37490 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37491 .addImm(1)
37492 .addReg(0)
37493 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37494 .addReg(0);
37495 }
37496
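  // Store the address of the dispatch block (either as an immediate label or
  // via the register computed above) into the SjLj function context at FI.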
37497 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37498 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37499 if (UseImmLabel)
37500 MIB.addMBB(DispatchBB);
37501 else
37502 MIB.addReg(VR);
37503}
37504
37505MachineBasicBlock *
37506X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37507 MachineBasicBlock *BB) const {
37508 const MIMetadata MIMD(MI);
37509 MachineFunction *MF = BB->getParent();
37510 MachineRegisterInfo *MRI = &MF->getRegInfo();
37511 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37512 int FI = MF->getFrameInfo().getFunctionContextIndex();
37513
37514 // Get a mapping of the call site numbers to all of the landing pads they're
37515 // associated with.
37516 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37517 unsigned MaxCSNum = 0;
37518 for (auto &MBB : *MF) {
37519 if (!MBB.isEHPad())
37520 continue;
37521
37522 MCSymbol *Sym = nullptr;
37523 for (const auto &MI : MBB) {
37524 if (MI.isDebugInstr())
37525 continue;
37526
37527 assert(MI.isEHLabel() && "expected EH_LABEL");
37528 Sym = MI.getOperand(0).getMCSymbol();
37529 break;
37530 }
37531
37532 if (!MF->hasCallSiteLandingPad(Sym))
37533 continue;
37534
37535 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37536 CallSiteNumToLPad[CSI].push_back(&MBB);
37537 MaxCSNum = std::max(MaxCSNum, CSI);
37538 }
37539 }
37540
37541 // Get an ordered list of the machine basic blocks for the jump table.
37542 std::vector<MachineBasicBlock *> LPadList;
37543 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37544 LPadList.reserve(CallSiteNumToLPad.size());
37545
37546 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37547 for (auto &LP : CallSiteNumToLPad[CSI]) {
37548 LPadList.push_back(LP);
37549 InvokeBBs.insert_range(LP->predecessors());
37550 }
37551 }
37552
37553 assert(!LPadList.empty() &&
37554 "No landing pad destinations for the dispatch jump table!");
37555
37556 // Create the MBBs for the dispatch code.
37557
37558 // Shove the dispatch's address into the return slot in the function context.
37559 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37560 DispatchBB->setIsEHPad(true);
37561
37562 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37563 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37564 DispatchBB->addSuccessor(TrapBB);
37565
37566 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37567 DispatchBB->addSuccessor(DispContBB);
37568
37569 // Insert MBBs.
37570 MF->push_back(DispatchBB);
37571 MF->push_back(DispContBB);
37572 MF->push_back(TrapBB);
37573
37574 // Insert code into the entry block that creates and registers the function
37575 // context.
37576 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37577
37578 // Create the jump table and associated information
37579 unsigned JTE = getJumpTableEncoding();
37580 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37581 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37582
37583 const X86RegisterInfo &RI = TII->getRegisterInfo();
37584 // Add a register mask with no preserved registers. This results in all
37585 // registers being marked as clobbered.
37586 if (RI.hasBasePointer(*MF)) {
37587 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37588 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37589 MFI->setRestoreBasePointer(MF);
37590
37591 Register FP = RI.getFrameRegister(*MF);
37592 Register BP = RI.getBaseRegister();
37593 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37594     addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37595                  MFI->getRestoreBasePointerOffset())
37596         .addRegMask(RI.getNoPreservedMask());
37597   } else {
37598     BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37599         .addRegMask(RI.getNoPreservedMask());
37600   }
37601
37602 // IReg is used as an index in a memory operand and therefore can't be SP
37603 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37604 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37605 Subtarget.is64Bit() ? 8 : 4);
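  // Bounds-check the call site index loaded from the function context;
  // out-of-range values branch to the trap block.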
37606 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37607 .addReg(IReg)
37608 .addImm(LPadList.size());
37609 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37610       .addMBB(TrapBB)
37611       .addImm(X86::COND_AE);
37612
37613 if (Subtarget.is64Bit()) {
37614 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37615 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37616
37617 // leaq .LJTI0_0(%rip), BReg
37618 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37619 .addReg(X86::RIP)
37620 .addImm(1)
37621 .addReg(0)
37622 .addJumpTableIndex(MJTI)
37623 .addReg(0);
37624 // movzx IReg64, IReg
37625 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37626 .addImm(0)
37627 .addReg(IReg)
37628 .addImm(X86::sub_32bit);
37629
37630     switch (JTE) {
37631     case MachineJumpTableInfo::EK_BlockAddress:
37632 // jmpq *(BReg,IReg64,8)
37633 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37634 .addReg(BReg)
37635 .addImm(8)
37636 .addReg(IReg64)
37637 .addImm(0)
37638 .addReg(0);
37639       break;
37640     case MachineJumpTableInfo::EK_LabelDifference32: {
37641 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37642 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37643 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37644
37645 // movl (BReg,IReg64,4), OReg
37646 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37647 .addReg(BReg)
37648 .addImm(4)
37649 .addReg(IReg64)
37650 .addImm(0)
37651 .addReg(0);
37652 // movsx OReg64, OReg
37653 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37654 .addReg(OReg);
37655 // addq BReg, OReg64, TReg
37656 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37657 .addReg(OReg64)
37658 .addReg(BReg);
37659 // jmpq *TReg
37660 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37661 break;
37662 }
37663 default:
37664 llvm_unreachable("Unexpected jump table encoding");
37665 }
37666 } else {
37667 // jmpl *.LJTI0_0(,IReg,4)
37668 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37669 .addReg(0)
37670 .addImm(4)
37671 .addReg(IReg)
37672 .addJumpTableIndex(MJTI)
37673 .addReg(0);
37674 }
37675
37676 // Add the jump table entries as successors to the MBB.
37677 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37678 for (auto &LP : LPadList)
37679 if (SeenMBBs.insert(LP).second)
37680 DispContBB->addSuccessor(LP);
37681
37682   // N.B. the order the invoke BBs are processed in doesn't matter here.
37683   SmallVector<MachineBasicBlock *, 64> MBBLPads;
37684 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37685 for (MachineBasicBlock *MBB : InvokeBBs) {
37686 // Remove the landing pad successor from the invoke block and replace it
37687 // with the new dispatch block.
37688 // Keep a copy of Successors since it's modified inside the loop.
37689 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37690 MBB->succ_rend());
37691 // FIXME: Avoid quadratic complexity.
37692 for (auto *MBBS : Successors) {
37693 if (MBBS->isEHPad()) {
37694 MBB->removeSuccessor(MBBS);
37695 MBBLPads.push_back(MBBS);
37696 }
37697 }
37698
37699 MBB->addSuccessor(DispatchBB);
37700
37701 // Find the invoke call and mark all of the callee-saved registers as
37702 // 'implicit defined' so that they're spilled. This prevents code from
37703 // moving instructions to before the EH block, where they will never be
37704 // executed.
37705 for (auto &II : reverse(*MBB)) {
37706 if (!II.isCall())
37707 continue;
37708
37709 DenseSet<Register> DefRegs;
37710 for (auto &MOp : II.operands())
37711 if (MOp.isReg())
37712 DefRegs.insert(MOp.getReg());
37713
37714 MachineInstrBuilder MIB(*MF, &II);
37715 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37716 Register Reg = SavedRegs[RegIdx];
37717       if (!DefRegs.contains(Reg))
37718         MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37719 }
37720
37721 break;
37722 }
37723 }
37724
37725 // Mark all former landing pads as non-landing pads. The dispatch is the only
37726 // landing pad now.
37727 for (auto &LP : MBBLPads)
37728 LP->setIsEHPad(false);
37729
37730 // The instruction is gone now.
37731 MI.eraseFromParent();
37732 return BB;
37733}
37734
37735MachineBasicBlock *
37736X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37737 MachineBasicBlock *BB) const {
37738 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37739 // calls may require proper stack alignment.
37740 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37741 const MIMetadata MIMD(MI);
37742 MachineFunction &MF = *BB->getParent();
37743
37744 // Emit CALLSEQ_START right before the instruction.
37745 MF.getFrameInfo().setAdjustsStack(true);
37746 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37747 MachineInstrBuilder CallseqStart =
37748 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37749 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37750
37751 // Emit CALLSEQ_END right after the instruction.
37752 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37753 MachineInstrBuilder CallseqEnd =
37754 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37755 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37756
37757 return BB;
37758}
37759
37760MachineBasicBlock *
37761X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37762                                                MachineBasicBlock *BB) const {
37763 MachineFunction *MF = BB->getParent();
37764 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37765 const MIMetadata MIMD(MI);
37766
37767 auto TMMImmToTMMReg = [](unsigned Imm) {
37768 assert (Imm < 8 && "Illegal tmm index");
37769 return X86::TMM0 + Imm;
37770 };
37771 auto TMMImmToTMMPair = [](unsigned Imm) {
37772 assert(Imm < 8 && "Illegal tmm pair index.");
37773 return X86::TMM0_TMM1 + Imm / 2;
37774 };
37775 switch (MI.getOpcode()) {
37776 default:
37777 llvm_unreachable("Unexpected instr type to insert");
37778 case X86::INDIRECT_THUNK_CALL32:
37779 case X86::INDIRECT_THUNK_CALL64:
37780 case X86::INDIRECT_THUNK_TCRETURN32:
37781 case X86::INDIRECT_THUNK_TCRETURN64:
37782 return EmitLoweredIndirectThunk(MI, BB);
37783 case X86::CATCHRET:
37784 return EmitLoweredCatchRet(MI, BB);
37785 case X86::SEG_ALLOCA_32:
37786 case X86::SEG_ALLOCA_64:
37787 return EmitLoweredSegAlloca(MI, BB);
37788 case X86::PROBED_ALLOCA_32:
37789 case X86::PROBED_ALLOCA_64:
37790 return EmitLoweredProbedAlloca(MI, BB);
37791 case X86::TLSCall_32:
37792 case X86::TLSCall_64:
37793 return EmitLoweredTLSCall(MI, BB);
37794 case X86::CMOV_FR16:
37795 case X86::CMOV_FR16X:
37796 case X86::CMOV_FR32:
37797 case X86::CMOV_FR32X:
37798 case X86::CMOV_FR64:
37799 case X86::CMOV_FR64X:
37800 case X86::CMOV_GR8:
37801 case X86::CMOV_GR16:
37802 case X86::CMOV_GR32:
37803 case X86::CMOV_RFP32:
37804 case X86::CMOV_RFP64:
37805 case X86::CMOV_RFP80:
37806 case X86::CMOV_VR64:
37807 case X86::CMOV_VR128:
37808 case X86::CMOV_VR128X:
37809 case X86::CMOV_VR256:
37810 case X86::CMOV_VR256X:
37811 case X86::CMOV_VR512:
37812 case X86::CMOV_VK1:
37813 case X86::CMOV_VK2:
37814 case X86::CMOV_VK4:
37815 case X86::CMOV_VK8:
37816 case X86::CMOV_VK16:
37817 case X86::CMOV_VK32:
37818 case X86::CMOV_VK64:
37819 return EmitLoweredSelect(MI, BB);
37820
37821 case X86::FP80_ADDr:
37822 case X86::FP80_ADDm32: {
37823 // Change the floating point control register to use double extended
37824 // precision when performing the addition.
37825 int OrigCWFrameIdx =
37826 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37827 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37828 OrigCWFrameIdx);
37829
37830 // Load the old value of the control word...
37831 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37832 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37833 OrigCWFrameIdx);
37834
37835     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37836 // precision.
37837 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37838 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37839 .addReg(OldCW, RegState::Kill)
37840 .addImm(0x300);
37841
37842 // Extract to 16 bits.
37843 Register NewCW16 =
37844 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37845 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37846 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37847
37848 // Prepare memory for FLDCW.
37849 int NewCWFrameIdx =
37850 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37851 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37852 NewCWFrameIdx)
37853 .addReg(NewCW16, RegState::Kill);
37854
37855 // Reload the modified control word now...
37856 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37857 NewCWFrameIdx);
37858
37859 // Do the addition.
37860 if (MI.getOpcode() == X86::FP80_ADDr) {
37861 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37862 .add(MI.getOperand(0))
37863 .add(MI.getOperand(1))
37864 .add(MI.getOperand(2));
37865 } else {
37866 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37867 .add(MI.getOperand(0))
37868 .add(MI.getOperand(1))
37869 .add(MI.getOperand(2))
37870 .add(MI.getOperand(3))
37871 .add(MI.getOperand(4))
37872 .add(MI.getOperand(5))
37873 .add(MI.getOperand(6));
37874 }
37875
37876 // Reload the original control word now.
37877 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37878 OrigCWFrameIdx);
37879
37880 MI.eraseFromParent(); // The pseudo instruction is gone now.
37881 return BB;
37882 }
37883
37884 case X86::FP32_TO_INT16_IN_MEM:
37885 case X86::FP32_TO_INT32_IN_MEM:
37886 case X86::FP32_TO_INT64_IN_MEM:
37887 case X86::FP64_TO_INT16_IN_MEM:
37888 case X86::FP64_TO_INT32_IN_MEM:
37889 case X86::FP64_TO_INT64_IN_MEM:
37890 case X86::FP80_TO_INT16_IN_MEM:
37891 case X86::FP80_TO_INT32_IN_MEM:
37892 case X86::FP80_TO_INT64_IN_MEM: {
37893 // Change the floating point control register to use "round towards zero"
37894 // mode when truncating to an integer value.
37895 int OrigCWFrameIdx =
37896 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37897 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37898 OrigCWFrameIdx);
37899
37900 // Load the old value of the control word...
37901 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37902 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37903 OrigCWFrameIdx);
37904
37905     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37906 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37907 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37908 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37909
37910 // Extract to 16 bits.
37911 Register NewCW16 =
37912 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37913 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37914 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37915
37916 // Prepare memory for FLDCW.
37917 int NewCWFrameIdx =
37918 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37919 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37920 NewCWFrameIdx)
37921 .addReg(NewCW16, RegState::Kill);
37922
37923 // Reload the modified control word now...
37924 addFrameReference(BuildMI(*BB, MI, MIMD,
37925 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37926
37927 // Get the X86 opcode to use.
37928 unsigned Opc;
37929 switch (MI.getOpcode()) {
37930 // clang-format off
37931 default: llvm_unreachable("illegal opcode!");
37932 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37933 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37934 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37935 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37936 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37937 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37938 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37939 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37940 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37941 // clang-format on
37942 }
37943
37944     X86AddressMode AM = getAddressFromInstr(&MI, 0);
37945     addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37946 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37947
37948 // Reload the original control word now.
37949 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37950 OrigCWFrameIdx);
37951
37952 MI.eraseFromParent(); // The pseudo instruction is gone now.
37953 return BB;
37954 }
37955
37956 // xbegin
37957 case X86::XBEGIN:
37958 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37959
37960 case X86::VAARG_64:
37961 case X86::VAARG_X32:
37962 return EmitVAARGWithCustomInserter(MI, BB);
37963
37964 case X86::EH_SjLj_SetJmp32:
37965 case X86::EH_SjLj_SetJmp64:
37966 return emitEHSjLjSetJmp(MI, BB);
37967
37968 case X86::EH_SjLj_LongJmp32:
37969 case X86::EH_SjLj_LongJmp64:
37970 return emitEHSjLjLongJmp(MI, BB);
37971
37972 case X86::Int_eh_sjlj_setup_dispatch:
37973 return EmitSjLjDispatchBlock(MI, BB);
37974
37975 case TargetOpcode::STATEPOINT:
37976 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37977 // this point in the process. We diverge later.
37978 return emitPatchPoint(MI, BB);
37979
37980 case TargetOpcode::STACKMAP:
37981 case TargetOpcode::PATCHPOINT:
37982 return emitPatchPoint(MI, BB);
37983
37984 case TargetOpcode::PATCHABLE_EVENT_CALL:
37985 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37986 return emitPatchableEventCall(MI, BB);
37987
37988 case X86::LCMPXCHG8B: {
37989 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37990     // In addition to the four E[ABCD] registers implied by its encoding,
37991     // CMPXCHG8B requires a memory operand. If the current target happens to
37992     // be i686 and the current function needs a base pointer - which is ESI
37993     // on i686 - the register allocator would not be able to allocate
37994     // registers for an address of the form X(%reg, %reg, Y): there would
37995     // never be enough unreserved registers during regalloc (without the
37996     // base pointer the only option would be X(%edi, %esi, Y)). We give the
37997     // register allocator a hand by precomputing the address in a new vreg
37998     // using LEA.
37999
38000 // If it is not i686 or there is no base pointer - nothing to do here.
38001 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38002 return BB;
38003
38004     // Even though this code does not necessarily need the base pointer to
38005     // be ESI, we check for that. The reason: if this assert fails, something
38006     // has changed in the compiler's base pointer handling, and that change
38007     // most probably has to be addressed here as well.
38008 assert(TRI->getBaseRegister() == X86::ESI &&
38009 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38010 "base pointer in mind");
38011
38012     MachineRegisterInfo &MRI = MF->getRegInfo();
38013     MVT SPTy = getPointerTy(MF->getDataLayout());
38014 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38015 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38016
38017     X86AddressMode AM = getAddressFromInstr(&MI, 0);
38018     // Regalloc does not need any help when the memory operand of CMPXCHG8B
38019     // does not use an index register.
38020 if (AM.IndexReg == X86::NoRegister)
38021 return BB;
38022
38023 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38024 // four operand definitions that are E[ABCD] registers. We skip them and
38025 // then insert the LEA.
38026 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38027 while (RMBBI != BB->rend() &&
38028 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38029 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38030 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38031 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38032 ++RMBBI;
38033 }
38034     MachineBasicBlock::iterator MBBI(RMBBI);
38035     addFullAddress(
38036         BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38037
38038 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38039
38040 return BB;
38041 }
38042 case X86::LCMPXCHG16B_NO_RBX: {
38043 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38044 Register BasePtr = TRI->getBaseRegister();
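    // If RBX also serves as the base pointer it must survive the CMPXCHG16B,
    // so route the value through the LCMPXCHG16B_SAVE_RBX pseudo; otherwise a
    // plain copy into RBX is enough.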
38045 if (TRI->hasBasePointer(*MF) &&
38046 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38047 if (!BB->isLiveIn(BasePtr))
38048 BB->addLiveIn(BasePtr);
38049 // Save RBX into a virtual register.
38050 Register SaveRBX =
38051 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38052 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38053 .addReg(X86::RBX);
38054 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38055       MachineInstrBuilder MIB =
38056           BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38057 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38058 MIB.add(MI.getOperand(Idx));
38059 MIB.add(MI.getOperand(X86::AddrNumOperands));
38060 MIB.addReg(SaveRBX);
38061 } else {
38062 // Simple case, just copy the virtual register to RBX.
38063 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38064 .add(MI.getOperand(X86::AddrNumOperands));
38065       MachineInstrBuilder MIB =
38066           BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38067 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38068 MIB.add(MI.getOperand(Idx));
38069 }
38070 MI.eraseFromParent();
38071 return BB;
38072 }
38073 case X86::MWAITX: {
38074 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38075 Register BasePtr = TRI->getBaseRegister();
38076 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38077     // If there is no need to save the base pointer, we generate MWAITXrrr;
38078     // otherwise we generate the MWAITX_SAVE_RBX pseudo.
38079 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38080 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38081 .addReg(MI.getOperand(0).getReg());
38082 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38083 .addReg(MI.getOperand(1).getReg());
38084 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38085 .addReg(MI.getOperand(2).getReg());
38086 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38087 MI.eraseFromParent();
38088 } else {
38089 if (!BB->isLiveIn(BasePtr)) {
38090 BB->addLiveIn(BasePtr);
38091 }
38092 // Parameters can be copied into ECX and EAX but not EBX yet.
38093 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38094 .addReg(MI.getOperand(0).getReg());
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38096 .addReg(MI.getOperand(1).getReg());
38097 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38098 // Save RBX into a virtual register.
38099 Register SaveRBX =
38100 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38101 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38102 .addReg(X86::RBX);
38103 // Generate mwaitx pseudo.
38104 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38105 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38106 .addDef(Dst) // Destination tied in with SaveRBX.
38107 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38108 .addUse(SaveRBX); // Save of base pointer.
38109 MI.eraseFromParent();
38110 }
38111 return BB;
38112 }
38113 case TargetOpcode::PREALLOCATED_SETUP: {
38114 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38115 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38116 MFI->setHasPreallocatedCall(true);
38117 int64_t PreallocatedId = MI.getOperand(0).getImm();
38118 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38119 assert(StackAdjustment != 0 && "0 stack adjustment");
38120 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38121 << StackAdjustment << "\n");
38122 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38123 .addReg(X86::ESP)
38124 .addImm(StackAdjustment);
38125 MI.eraseFromParent();
38126 return BB;
38127 }
38128 case TargetOpcode::PREALLOCATED_ARG: {
38129 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38130 int64_t PreallocatedId = MI.getOperand(1).getImm();
38131 int64_t ArgIdx = MI.getOperand(2).getImm();
38132 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38133 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38134 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38135 << ", arg offset " << ArgOffset << "\n");
38136 // stack pointer + offset
38137 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38138 MI.getOperand(0).getReg()),
38139 X86::ESP, false, ArgOffset);
38140 MI.eraseFromParent();
38141 return BB;
38142 }
38143 case X86::PTDPBSSD:
38144 case X86::PTDPBSUD:
38145 case X86::PTDPBUSD:
38146 case X86::PTDPBUUD:
38147 case X86::PTDPBF16PS:
38148 case X86::PTDPFP16PS:
38149 case X86::PTCMMIMFP16PS:
38150 case X86::PTCMMRLFP16PS:
38151 case X86::PTDPBF8PS:
38152 case X86::PTDPBHF8PS:
38153 case X86::PTDPHBF8PS:
38154 case X86::PTDPHF8PS:
38155 case X86::PTTDPBF16PS:
38156 case X86::PTTDPFP16PS:
38157 case X86::PTTCMMIMFP16PS:
38158 case X86::PTTCMMRLFP16PS:
38159 case X86::PTCONJTCMMIMFP16PS:
38160 case X86::PTMMULTF32PS:
38161 case X86::PTTMMULTF32PS: {
38162 unsigned Opc;
38163 switch (MI.getOpcode()) {
38164 default: llvm_unreachable("illegal opcode!");
38165 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38166 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38167 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38168 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38169 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38170 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38171 case X86::PTCMMIMFP16PS:
38172 Opc = X86::TCMMIMFP16PS;
38173 break;
38174 case X86::PTCMMRLFP16PS:
38175 Opc = X86::TCMMRLFP16PS;
38176 break;
38177 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38178 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38179 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38180 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38181 case X86::PTTDPBF16PS:
38182 Opc = X86::TTDPBF16PS;
38183 break;
38184 case X86::PTTDPFP16PS:
38185 Opc = X86::TTDPFP16PS;
38186 break;
38187 case X86::PTTCMMIMFP16PS:
38188 Opc = X86::TTCMMIMFP16PS;
38189 break;
38190 case X86::PTTCMMRLFP16PS:
38191 Opc = X86::TTCMMRLFP16PS;
38192 break;
38193 case X86::PTCONJTCMMIMFP16PS:
38194 Opc = X86::TCONJTCMMIMFP16PS;
38195 break;
38196 case X86::PTMMULTF32PS:
38197 Opc = X86::TMMULTF32PS;
38198 break;
38199 case X86::PTTMMULTF32PS:
38200 Opc = X86::TTMMULTF32PS;
38201 break;
38202 }
38203
38204 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38205 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38206 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38207 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38208 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38209
38210 MI.eraseFromParent(); // The pseudo is gone now.
38211 return BB;
38212 }
38213 case X86::PTILEZERO: {
38214 unsigned Imm = MI.getOperand(0).getImm();
38215 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38216 MI.eraseFromParent(); // The pseudo is gone now.
38217 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38218     MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38219     return BB;
38220 }
38221 case X86::PTILEZEROV: {
38222 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38223     MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38224     return BB;
38225 }
38226 case X86::PTILELOADDRS:
38227 case X86::PTILELOADDRST1:
38228 case X86::PTILELOADD:
38229 case X86::PTILELOADDT1:
38230 case X86::PTILESTORED: {
38231 unsigned Opc;
38232 switch (MI.getOpcode()) {
38233 default: llvm_unreachable("illegal opcode!");
38234#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38235 case X86::PTILELOADD:
38236 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38237 break;
38238 case X86::PTILELOADDT1:
38239 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38240 break;
38241 case X86::PTILESTORED:
38242 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38243 break;
38244 case X86::PTILELOADDRS:
38245 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38246 break;
38247 case X86::PTILELOADDRST1:
38248 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38249 break;
38250 }
38251#undef GET_EGPR_IF_ENABLED
38252
38253 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38254 unsigned CurOp = 0;
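    // Tile loads define the tile register first; tile stores take it last,
    // after the memory operands.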
38255 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38256       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38257                  RegState::Define);
38258
38259 MIB.add(MI.getOperand(CurOp++)); // base
38260 MIB.add(MI.getOperand(CurOp++)); // scale
38261 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38262 MIB.add(MI.getOperand(CurOp++)); // displacement
38263 MIB.add(MI.getOperand(CurOp++)); // segment
38264
38265 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38266       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38267                  RegState::Undef);
38268
38269 MI.eraseFromParent(); // The pseudo is gone now.
38270 return BB;
38271 }
38272 case X86::PT2RPNTLVWZ0:
38273 case X86::PT2RPNTLVWZ0T1:
38274 case X86::PT2RPNTLVWZ1:
38275 case X86::PT2RPNTLVWZ1T1:
38276 case X86::PT2RPNTLVWZ0RS:
38277 case X86::PT2RPNTLVWZ0RST1:
38278 case X86::PT2RPNTLVWZ1RS:
38279 case X86::PT2RPNTLVWZ1RST1: {
38280 const DebugLoc &DL = MI.getDebugLoc();
38281 unsigned Opc;
38282#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38283 switch (MI.getOpcode()) {
38284 default:
38285 llvm_unreachable("Unexpected instruction!");
38286 case X86::PT2RPNTLVWZ0:
38287 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38288 break;
38289 case X86::PT2RPNTLVWZ0T1:
38290 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38291 break;
38292 case X86::PT2RPNTLVWZ1:
38293 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38294 break;
38295 case X86::PT2RPNTLVWZ1T1:
38296 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38297 break;
38298 case X86::PT2RPNTLVWZ0RS:
38299 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38300 break;
38301 case X86::PT2RPNTLVWZ0RST1:
38302 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38303 break;
38304 case X86::PT2RPNTLVWZ1RS:
38305 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38306 break;
38307 case X86::PT2RPNTLVWZ1RST1:
38308 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38309 break;
38310 }
38311#undef GET_EGPR_IF_ENABLED
38312 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
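    // The destination of these loads is a tile register pair selected from
    // the immediate (TMM0_TMM1, TMM2_TMM3, ...).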
38313 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38314
38315 MIB.add(MI.getOperand(1)); // base
38316 MIB.add(MI.getOperand(2)); // scale
38317 MIB.add(MI.getOperand(3)); // index
38318 MIB.add(MI.getOperand(4)); // displacement
38319 MIB.add(MI.getOperand(5)); // segment
38320 MI.eraseFromParent(); // The pseudo is gone now.
38321 return BB;
38322 }
38323 case X86::PTTRANSPOSED:
38324 case X86::PTCONJTFP16: {
38325 const DebugLoc &DL = MI.getDebugLoc();
38326 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38327 : X86::TCONJTFP16;
38328
38329 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38330 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38331 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38332
38333 MI.eraseFromParent(); // The pseudo is gone now.
38334 return BB;
38335 }
38336 case X86::PTCVTROWPS2BF16Hrri:
38337 case X86::PTCVTROWPS2BF16Lrri:
38338 case X86::PTCVTROWPS2PHHrri:
38339 case X86::PTCVTROWPS2PHLrri:
38340 case X86::PTCVTROWD2PSrri:
38341 case X86::PTILEMOVROWrri: {
38342 const DebugLoc &DL = MI.getDebugLoc();
38343 unsigned Opc;
38344 switch (MI.getOpcode()) {
38345 default:
38346 llvm_unreachable("Unexpected instruction!");
38347 case X86::PTCVTROWD2PSrri:
38348 Opc = X86::TCVTROWD2PSrri;
38349 break;
38350 case X86::PTCVTROWPS2BF16Hrri:
38351 Opc = X86::TCVTROWPS2BF16Hrri;
38352 break;
38353 case X86::PTCVTROWPS2PHHrri:
38354 Opc = X86::TCVTROWPS2PHHrri;
38355 break;
38356 case X86::PTCVTROWPS2BF16Lrri:
38357 Opc = X86::TCVTROWPS2BF16Lrri;
38358 break;
38359 case X86::PTCVTROWPS2PHLrri:
38360 Opc = X86::TCVTROWPS2PHLrri;
38361 break;
38362 case X86::PTILEMOVROWrri:
38363 Opc = X86::TILEMOVROWrri;
38364 break;
38365 }
38366 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38367 MIB.add(MI.getOperand(0));
38368 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38369 MIB.addImm(MI.getOperand(2).getImm());
38370
38371 MI.eraseFromParent(); // The pseudo is gone now.
38372 return BB;
38373 }
38374 case X86::PTCVTROWPS2BF16Hrre:
38375 case X86::PTCVTROWPS2BF16Lrre:
38376 case X86::PTCVTROWPS2PHHrre:
38377 case X86::PTCVTROWPS2PHLrre:
38378 case X86::PTCVTROWD2PSrre:
38379 case X86::PTILEMOVROWrre: {
38380 const DebugLoc &DL = MI.getDebugLoc();
38381 unsigned Opc;
38382 switch (MI.getOpcode()) {
38383 default:
38384 llvm_unreachable("Unexpected instruction!");
38385 case X86::PTCVTROWD2PSrre:
38386 Opc = X86::TCVTROWD2PSrre;
38387 break;
38388 case X86::PTCVTROWPS2BF16Hrre:
38389 Opc = X86::TCVTROWPS2BF16Hrre;
38390 break;
38391 case X86::PTCVTROWPS2BF16Lrre:
38392 Opc = X86::TCVTROWPS2BF16Lrre;
38393 break;
38394 case X86::PTCVTROWPS2PHHrre:
38395 Opc = X86::TCVTROWPS2PHHrre;
38396 break;
38397 case X86::PTCVTROWPS2PHLrre:
38398 Opc = X86::TCVTROWPS2PHLrre;
38399 break;
38400 case X86::PTILEMOVROWrre:
38401 Opc = X86::TILEMOVROWrre;
38402 break;
38403 }
38404 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38405 MIB.add(MI.getOperand(0));
38406 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38407 MIB.add(MI.getOperand(2));
38408
38409 MI.eraseFromParent(); // The pseudo is gone now.
38410 return BB;
38411 }
38412 }
38413}
38414
38415//===----------------------------------------------------------------------===//
38416// X86 Optimization Hooks
38417//===----------------------------------------------------------------------===//
38418
38419bool
38420X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38421                                                const APInt &DemandedBits,
38422 const APInt &DemandedElts,
38423 TargetLoweringOpt &TLO) const {
38424 EVT VT = Op.getValueType();
38425 unsigned Opcode = Op.getOpcode();
38426 unsigned EltSize = VT.getScalarSizeInBits();
38427
38428 if (VT.isVector()) {
38429     // If the constant is all sign bits in the active bits, then we should
38430     // extend it to the entire constant to allow it to act as a boolean
38431     // constant vector.
38432 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38433 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38434 return false;
38435 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38436 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38437 continue;
38438 const APInt &Val = V.getConstantOperandAPInt(i);
38439 if (Val.getBitWidth() > Val.getNumSignBits() &&
38440 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38441 return true;
38442 }
38443 return false;
38444 };
38445 // For vectors - if we have a constant, then try to sign extend.
38446 // TODO: Handle AND cases.
38447 unsigned ActiveBits = DemandedBits.getActiveBits();
38448 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38449 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38450 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38451 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38452       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38453                                    VT.getVectorNumElements());
38454       SDValue NewC =
38455           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38456                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38457 SDValue NewOp =
38458 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38459 return TLO.CombineTo(Op, NewOp);
38460 }
38461 return false;
38462 }
38463
38464 // Only optimize Ands to prevent shrinking a constant that could be
38465 // matched by movzx.
38466 if (Opcode != ISD::AND)
38467 return false;
38468
38469 // Make sure the RHS really is a constant.
38470 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38471 if (!C)
38472 return false;
38473
38474 const APInt &Mask = C->getAPIntValue();
38475
38476 // Clear all non-demanded bits initially.
38477 APInt ShrunkMask = Mask & DemandedBits;
38478
38479 // Find the width of the shrunk mask.
38480 unsigned Width = ShrunkMask.getActiveBits();
38481
38482 // If the mask is all 0s there's nothing to do here.
38483 if (Width == 0)
38484 return false;
38485
38486 // Find the next power of 2 width, rounding up to a byte.
38487 Width = llvm::bit_ceil(std::max(Width, 8U));
38488 // Truncate the width to size to handle illegal types.
38489 Width = std::min(Width, EltSize);
38490
38491 // Calculate a possible zero extend mask for this constant.
38492 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38493
38494 // If we aren't changing the mask, just return true to keep it and prevent
38495 // the caller from optimizing.
38496 if (ZeroExtendMask == Mask)
38497 return true;
38498
38499 // Make sure the new mask can be represented by a combination of mask bits
38500 // and non-demanded bits.
38501 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38502 return false;
38503
38504 // Replace the constant with the zero extend mask.
38505 SDLoc DL(Op);
38506 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38507 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38508 return TLO.CombineTo(Op, NewOp);
38509}
38510
38511static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38512                                      KnownBits &Known,
38513 const APInt &DemandedElts,
38514 const SelectionDAG &DAG, unsigned Depth) {
38515 KnownBits Known2;
38516 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38517 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38518 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38519 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38520 Known = KnownBits::abdu(Known, Known2).zext(16);
38521 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38522 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38523 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38524 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38525 Known = Known.zext(64);
38526}
38527
38528static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38529                                       KnownBits &Known,
38530 const APInt &DemandedElts,
38531 const SelectionDAG &DAG,
38532 unsigned Depth) {
38533 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38534
38535 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38536 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38537 APInt DemandedLoElts =
38538 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38539 APInt DemandedHiElts =
38540 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38541 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38542 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38543 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38544 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38545 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38546 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38547 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38548}
38549
38550static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38551                                         KnownBits &Known,
38552 const APInt &DemandedElts,
38553 const SelectionDAG &DAG,
38554 unsigned Depth) {
38555 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38556
38557 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38558 // pairs.
38559 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38560 APInt DemandedLoElts =
38561 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38562 APInt DemandedHiElts =
38563 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38564 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38565 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38566 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38567 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38568 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38569 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38570 Known = KnownBits::sadd_sat(Lo, Hi);
38571}
38572
38573static KnownBits computeKnownBitsForHorizontalOperation(
38574     const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38575 const SelectionDAG &DAG,
38576 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38577 KnownBitsFunc) {
38578 APInt DemandedEltsLHS, DemandedEltsRHS;
38579 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38580 DemandedElts, DemandedEltsLHS,
38581 DemandedEltsRHS);
38582
38583 const auto ComputeForSingleOpFunc =
38584 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38585 return KnownBitsFunc(
38586 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38587 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38588 };
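  // Each horizontal result element combines an adjacent (even, odd) pair of
  // source elements, hence the second query with the demanded mask shifted by
  // one.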
38589
38590 if (DemandedEltsRHS.isZero())
38591 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38592 if (DemandedEltsLHS.isZero())
38593 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38594
38595 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38596 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38597}
38598
38599void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38600                                                      KnownBits &Known,
38601 const APInt &DemandedElts,
38602 const SelectionDAG &DAG,
38603 unsigned Depth) const {
38604 unsigned BitWidth = Known.getBitWidth();
38605 unsigned NumElts = DemandedElts.getBitWidth();
38606 unsigned Opc = Op.getOpcode();
38607 EVT VT = Op.getValueType();
38608   assert((Opc >= ISD::BUILTIN_OP_END ||
38609           Opc == ISD::INTRINSIC_WO_CHAIN ||
38610           Opc == ISD::INTRINSIC_W_CHAIN ||
38611           Opc == ISD::INTRINSIC_VOID) &&
38612          "Should use MaskedValueIsZero if you don't know whether Op"
38613 " is a target node!");
38614
38615 Known.resetAll();
38616 switch (Opc) {
38617 default: break;
38618 case X86ISD::MUL_IMM: {
38619 KnownBits Known2;
38620 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38621 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38622 Known = KnownBits::mul(Known, Known2);
38623 break;
38624 }
38625 case X86ISD::BSF: {
38627
38628 KnownBits Known2;
38629 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38630 if (Known2.isNonZero()) {
38631 // If we have a known 1, its position is our upper bound.
38632 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38633 unsigned LowBits = llvm::bit_width(PossibleTZ);
38634 Known.Zero.setBitsFrom(LowBits);
38635 } else if (!Op.getOperand(0).isUndef()) {
38636 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38637 Known = Known.intersectWith(Known2);
38638 }
38639 break;
38640 }
38641 case X86ISD::BSR: {
38642 // TODO: Bound with input known bits?
38644
38645 if (!Op.getOperand(0).isUndef() &&
38646 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38647 KnownBits Known2;
38648 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38649 Known = Known.intersectWith(Known2);
38650 }
38651 break;
38652 }
38653 case X86ISD::SETCC:
38654 Known.Zero.setBitsFrom(1);
38655 break;
38656 case X86ISD::MOVMSK: {
38657 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38658 Known.Zero.setBitsFrom(NumLoBits);
38659 break;
38660 }
38661 case X86ISD::PEXTRB:
38662 case X86ISD::PEXTRW: {
38663 SDValue Src = Op.getOperand(0);
38664 EVT SrcVT = Src.getValueType();
38665 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38666 Op.getConstantOperandVal(1));
38667 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38668 Known = Known.anyextOrTrunc(BitWidth);
38669 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38670 break;
38671 }
38672 case X86ISD::VSRAI:
38673 case X86ISD::VSHLI:
38674 case X86ISD::VSRLI: {
38675 unsigned ShAmt = Op.getConstantOperandVal(1);
38676 if (ShAmt >= VT.getScalarSizeInBits()) {
38677 // Out of range logical bit shifts are guaranteed to be zero.
38678 // Out of range arithmetic bit shifts splat the sign bit.
38679 if (Opc != X86ISD::VSRAI) {
38680 Known.setAllZero();
38681 break;
38682 }
38683
38684 ShAmt = VT.getScalarSizeInBits() - 1;
38685 }
38686
38687 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38688 if (Opc == X86ISD::VSHLI) {
38689 Known <<= ShAmt;
38690 // Low bits are known zero.
38691 Known.Zero.setLowBits(ShAmt);
38692 } else if (Opc == X86ISD::VSRLI) {
38693 Known >>= ShAmt;
38694 // High bits are known zero.
38695 Known.Zero.setHighBits(ShAmt);
38696 } else {
38697 Known.Zero.ashrInPlace(ShAmt);
38698 Known.One.ashrInPlace(ShAmt);
38699 }
38700 break;
38701 }
38702 case X86ISD::PACKUS: {
38703 // PACKUS is just a truncation if the upper half is zero.
38704 APInt DemandedLHS, DemandedRHS;
38705 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38706
38707 Known.One = APInt::getAllOnes(BitWidth * 2);
38708 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38709
38710 KnownBits Known2;
38711 if (!!DemandedLHS) {
38712 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38713 Known = Known.intersectWith(Known2);
38714 }
38715 if (!!DemandedRHS) {
38716 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38717 Known = Known.intersectWith(Known2);
38718 }
38719
38720 if (Known.countMinLeadingZeros() < BitWidth)
38721 Known.resetAll();
38722 Known = Known.trunc(BitWidth);
38723 break;
38724 }
38725 case X86ISD::PSHUFB: {
38726 SDValue Src = Op.getOperand(0);
38727 SDValue Idx = Op.getOperand(1);
38728
38729 // If the index vector is never negative (MSB is zero), then all elements
38730 // come from the source vector. This is useful for cases where
38731 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38732 // below will handle the more common constant shuffle mask case.
38733 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38734 if (KnownIdx.isNonNegative())
38735 Known = DAG.computeKnownBits(Src, Depth + 1);
38736 break;
38737 }
38738 case X86ISD::VBROADCAST: {
38739 SDValue Src = Op.getOperand(0);
38740 if (!Src.getSimpleValueType().isVector()) {
38741 Known = DAG.computeKnownBits(Src, Depth + 1);
38742 return;
38743 }
38744 break;
38745 }
38746 case X86ISD::AND: {
38747 if (Op.getResNo() == 0) {
38748 KnownBits Known2;
38749 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38750 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38751 Known &= Known2;
38752 }
38753 break;
38754 }
38755 case X86ISD::ANDNP: {
38756 KnownBits Known2;
38757 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38758 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38759
38760 // ANDNP = (~X & Y);
38761 Known.One &= Known2.Zero;
38762 Known.Zero |= Known2.One;
38763 break;
38764 }
38765 case X86ISD::FOR: {
38766 KnownBits Known2;
38767 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38768 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38769
38770 Known |= Known2;
38771 break;
38772 }
38773 case X86ISD::PSADBW: {
38774 SDValue LHS = Op.getOperand(0);
38775 SDValue RHS = Op.getOperand(1);
38776 assert(VT.getScalarType() == MVT::i64 &&
38777 LHS.getValueType() == RHS.getValueType() &&
38778 LHS.getValueType().getScalarType() == MVT::i8 &&
38779 "Unexpected PSADBW types");
38780 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38781 break;
38782 }
38783 case X86ISD::PCMPGT:
38784 case X86ISD::PCMPEQ: {
38785 KnownBits KnownLhs =
38786 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38787 KnownBits KnownRhs =
38788 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38789 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38790 ? KnownBits::eq(KnownLhs, KnownRhs)
38791 : KnownBits::sgt(KnownLhs, KnownRhs);
38792 if (Res) {
38793 if (*Res)
38794 Known.setAllOnes();
38795 else
38796 Known.setAllZero();
38797 }
38798 break;
38799 }
38800 case X86ISD::VPMADDWD: {
38801 SDValue LHS = Op.getOperand(0);
38802 SDValue RHS = Op.getOperand(1);
38803 assert(VT.getVectorElementType() == MVT::i32 &&
38804 LHS.getValueType() == RHS.getValueType() &&
38805 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38806 "Unexpected PMADDWD types");
38807 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38808 break;
38809 }
38810 case X86ISD::VPMADDUBSW: {
38811 SDValue LHS = Op.getOperand(0);
38812 SDValue RHS = Op.getOperand(1);
38813 assert(VT.getVectorElementType() == MVT::i16 &&
38814 LHS.getValueType() == RHS.getValueType() &&
38815 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38816 "Unexpected PMADDUBSW types");
38817 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38818 break;
38819 }
38820 case X86ISD::PMULUDQ: {
38821 KnownBits Known2;
38822 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38823 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38824
38825 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38826 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38827 Known = KnownBits::mul(Known, Known2);
38828 break;
38829 }
38830 case X86ISD::CMOV: {
38831 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38832 // If we don't know any bits, early out.
38833 if (Known.isUnknown())
38834 break;
38835 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38836
38837 // Only known if known in both the LHS and RHS.
38838 Known = Known.intersectWith(Known2);
38839 break;
38840 }
38841 case X86ISD::BEXTR:
38842 case X86ISD::BEXTRI: {
38843 SDValue Op0 = Op.getOperand(0);
38844 SDValue Op1 = Op.getOperand(1);
38845
38846 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38847 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38848 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38849
38850 // If the length is 0, the result is 0.
38851 if (Length == 0) {
38852 Known.setAllZero();
38853 break;
38854 }
38855
38856 if ((Shift + Length) <= BitWidth) {
38857 Known = DAG.computeKnownBits(Op0, Depth + 1);
38858 Known = Known.extractBits(Length, Shift);
38859 Known = Known.zextOrTrunc(BitWidth);
38860 }
38861 }
38862 break;
38863 }
38864 case X86ISD::PDEP: {
38865 KnownBits Known2;
38866 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38867 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38868 // Zeros are retained from the mask operand. But not ones.
38869 Known.One.clearAllBits();
38870 // The result will have at least as many trailing zeros as the non-mask
38871 // operand since bits can only map to the same or higher bit position.
38872 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38873 break;
38874 }
38875 case X86ISD::PEXT: {
38876 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38877 // The result has as many leading zeros as the number of zeroes in the mask.
38878     unsigned Count = Known.Zero.popcount();
38879     Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38880 Known.One.clearAllBits();
38881 break;
38882 }
38883 case X86ISD::VTRUNC:
38884 case X86ISD::VTRUNCS:
38885 case X86ISD::VTRUNCUS:
38886 case X86ISD::CVTSI2P:
38887 case X86ISD::CVTUI2P:
38888 case X86ISD::CVTP2SI:
38889 case X86ISD::CVTP2UI:
38890 case X86ISD::MCVTP2SI:
38891 case X86ISD::MCVTP2UI:
38892 case X86ISD::CVTTP2SI:
38893 case X86ISD::CVTTP2UI:
38894 case X86ISD::MCVTTP2SI:
38895 case X86ISD::MCVTTP2UI:
38896 case X86ISD::MCVTSI2P:
38897 case X86ISD::MCVTUI2P:
38898 case X86ISD::VFPROUND:
38899 case X86ISD::VMFPROUND:
38900 case X86ISD::CVTPS2PH:
38901 case X86ISD::MCVTPS2PH:
38902 case X86ISD::MCVTTP2SIS:
38903 case X86ISD::MCVTTP2UIS: {
38904 // Truncations/Conversions - upper elements are known zero.
38905 EVT SrcVT = Op.getOperand(0).getValueType();
38906 if (SrcVT.isVector()) {
38907 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38908 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38909 Known.setAllZero();
38910 }
38911 break;
38912 }
38913   case X86ISD::STRICT_CVTTP2SI:
38914   case X86ISD::STRICT_CVTTP2UI:
38915   case X86ISD::STRICT_CVTSI2P:
38916   case X86ISD::STRICT_CVTUI2P:
38917   case X86ISD::STRICT_VFPROUND:
38918   case X86ISD::STRICT_CVTPS2PH: {
38919     // Strict Conversions - upper elements are known zero.
38920 EVT SrcVT = Op.getOperand(1).getValueType();
38921 if (SrcVT.isVector()) {
38922 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38923 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38924 Known.setAllZero();
38925 }
38926 break;
38927 }
38928 case X86ISD::MOVQ2DQ: {
38929 // Move from MMX to XMM. Upper half of XMM should be 0.
38930 if (DemandedElts.countr_zero() >= (NumElts / 2))
38931 Known.setAllZero();
38932 break;
38933 }
38935 APInt UndefElts;
38936 SmallVector<APInt, 16> EltBits;
38937 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38938 /*AllowWholeUndefs*/ false,
38939 /*AllowPartialUndefs*/ false)) {
38940 Known.Zero.setAllBits();
38941 Known.One.setAllBits();
38942 for (unsigned I = 0; I != NumElts; ++I) {
38943 if (!DemandedElts[I])
38944 continue;
38945 if (UndefElts[I]) {
38946 Known.resetAll();
38947 break;
38948 }
38949 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38950 Known = Known.intersectWith(Known2);
38951 }
38952 return;
38953 }
38954 break;
38955 }
38956 case X86ISD::HADD:
38957 case X86ISD::HSUB: {
38958     Known = computeKnownBitsForHorizontalOperation(
38959         Op, DemandedElts, Depth, DAG,
38960 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38961           return KnownBits::computeForAddSub(
38962               /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38963 KnownLHS, KnownRHS);
38964 });
38965 break;
38966 }
38967   case ISD::INTRINSIC_WO_CHAIN: {
38968     switch (Op->getConstantOperandVal(0)) {
38969 case Intrinsic::x86_sse2_pmadd_wd:
38970 case Intrinsic::x86_avx2_pmadd_wd:
38971 case Intrinsic::x86_avx512_pmaddw_d_512: {
38972 SDValue LHS = Op.getOperand(1);
38973 SDValue RHS = Op.getOperand(2);
38974 assert(VT.getScalarType() == MVT::i32 &&
38975 LHS.getValueType() == RHS.getValueType() &&
38976 LHS.getValueType().getScalarType() == MVT::i16 &&
38977 "Unexpected PMADDWD types");
38978 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38979 break;
38980 }
38981 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38982 case Intrinsic::x86_avx2_pmadd_ub_sw:
38983 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38984 SDValue LHS = Op.getOperand(1);
38985 SDValue RHS = Op.getOperand(2);
38986 assert(VT.getScalarType() == MVT::i16 &&
38987 LHS.getValueType() == RHS.getValueType() &&
38988 LHS.getValueType().getScalarType() == MVT::i8 &&
38989 "Unexpected PMADDUBSW types");
38990 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38991 break;
38992 }
38993 case Intrinsic::x86_sse2_psad_bw:
38994 case Intrinsic::x86_avx2_psad_bw:
38995 case Intrinsic::x86_avx512_psad_bw_512: {
38996 SDValue LHS = Op.getOperand(1);
38997 SDValue RHS = Op.getOperand(2);
38998 assert(VT.getScalarType() == MVT::i64 &&
38999 LHS.getValueType() == RHS.getValueType() &&
39000 LHS.getValueType().getScalarType() == MVT::i8 &&
39001 "Unexpected PSADBW types");
39002 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39003 break;
39004 }
39005 }
39006 break;
39007 }
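// VPMADD52L/H (IFMA): multiply the low 52 bits of the 64-bit elements of
// operands 0 and 1, then add the low (L) or high (H) 52 bits of the 104-bit
// product to the 64-bit accumulator in operand 2.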
39008 case X86ISD::VPMADD52L:
39009 case X86ISD::VPMADD52H: {
39010 assert(Op.getValueType().isVector() &&
39011 Op.getValueType().getScalarType() == MVT::i64 &&
39012 "Unexpected VPMADD52 type");
39013 KnownBits K0 =
39014 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39015 KnownBits K1 =
39016 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39017 KnownBits KAcc =
39018 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39019 K0 = K0.trunc(52);
39020 K1 = K1.trunc(52);
39021 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39022 ? KnownBits::mul(K0, K1)
39023 : KnownBits::mulhu(K0, K1);
39024 KnownMul = KnownMul.zext(64);
39025 Known = KnownBits::add(KAcc, KnownMul);
39026 return;
39027 }
39028 }
39029
39030 // Handle target shuffles.
39031 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39032 if (isTargetShuffle(Opc)) {
39033 SmallVector<int, 64> Mask;
39034 SmallVector<SDValue, 2> Ops;
39035 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39036 unsigned NumOps = Ops.size();
39037 unsigned NumElts = VT.getVectorNumElements();
39038 if (Mask.size() == NumElts) {
39039 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39040 Known.Zero.setAllBits(); Known.One.setAllBits();
39041 for (unsigned i = 0; i != NumElts; ++i) {
39042 if (!DemandedElts[i])
39043 continue;
39044 int M = Mask[i];
39045 if (M == SM_SentinelUndef) {
39046 // For UNDEF elements, we don't know anything about the common state
39047 // of the shuffle result.
39048 Known.resetAll();
39049 break;
39050 }
39051 if (M == SM_SentinelZero) {
39052 Known.One.clearAllBits();
39053 continue;
39054 }
39055 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39056 "Shuffle index out of range");
39057
39058 unsigned OpIdx = (unsigned)M / NumElts;
39059 unsigned EltIdx = (unsigned)M % NumElts;
39060 if (Ops[OpIdx].getValueType() != VT) {
39061 // TODO - handle target shuffle ops with different value types.
39062 Known.resetAll();
39063 break;
39064 }
39065 DemandedOps[OpIdx].setBit(EltIdx);
39066 }
39067 // Known bits are the values that are shared by every demanded element.
39068 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39069 if (!DemandedOps[i])
39070 continue;
39071 KnownBits Known2 =
39072 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39073 Known = Known.intersectWith(Known2);
39074 }
39075 }
39076 }
39077 }
39078}
39079
39080 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39081 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39082 unsigned Depth) const {
39083 EVT VT = Op.getValueType();
39084 unsigned VTBits = VT.getScalarSizeInBits();
39085 unsigned Opcode = Op.getOpcode();
39086 switch (Opcode) {
39087 case X86ISD::SETCC_CARRY:
39088 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39089 return VTBits;
39090
39091 case X86ISD::VTRUNC: {
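// Truncation keeps whatever sign bits survive below the truncation point:
// the source's sign-bit count minus the number of bits removed.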
39092 SDValue Src = Op.getOperand(0);
39093 MVT SrcVT = Src.getSimpleValueType();
39094 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39095 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39096 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39097 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39098 if (Tmp > (NumSrcBits - VTBits))
39099 return Tmp - (NumSrcBits - VTBits);
39100 return 1;
39101 }
39102
39103 case X86ISD::PACKSS: {
39104 // PACKSS is just a truncation if the sign bits extend to the packed size.
39105 APInt DemandedLHS, DemandedRHS;
39106 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39107 DemandedRHS);
39108
39109 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39110 // patterns often used to compact vXi64 allsignbit patterns.
39111 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39112 SDValue BC = peekThroughBitcasts(V);
39113 if (BC.getOpcode() == X86ISD::PACKSS &&
39114 BC.getScalarValueSizeInBits() == 16 &&
39115 V.getScalarValueSizeInBits() == 32) {
39116 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39117 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39118 if (BC0.getScalarValueSizeInBits() == 64 &&
39119 BC1.getScalarValueSizeInBits() == 64 &&
39120 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39121 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39122 return 32;
39123 }
39124 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39125 };
39126
39127 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39128 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39129 if (!!DemandedLHS)
39130 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39131 if (!!DemandedRHS)
39132 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39133 unsigned Tmp = std::min(Tmp0, Tmp1);
39134 if (Tmp > (SrcBits - VTBits))
39135 return Tmp - (SrcBits - VTBits);
39136 return 1;
39137 }
39138
39139 case X86ISD::VBROADCAST: {
39140 SDValue Src = Op.getOperand(0);
39141 if (!Src.getSimpleValueType().isVector())
39142 return DAG.ComputeNumSignBits(Src, Depth + 1);
39143 break;
39144 }
39145
39146 case X86ISD::VSHLI: {
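// A constant left shift discards ShiftVal of the known sign bits; shifting
// everything out yields zero, which has VTBits sign bits.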
39147 SDValue Src = Op.getOperand(0);
39148 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39149 if (ShiftVal.uge(VTBits))
39150 return VTBits; // Shifted all bits out --> zero.
39151 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39152 if (ShiftVal.uge(Tmp))
39153 return 1; // Shifted all sign bits out --> unknown.
39154 return Tmp - ShiftVal.getZExtValue();
39155 }
39156
39157 case X86ISD::VSRAI: {
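// An arithmetic right shift by a constant adds ShiftVal sign bits, clamped
// to the element width.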
39158 SDValue Src = Op.getOperand(0);
39159 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39160 if (ShiftVal.uge(VTBits - 1))
39161 return VTBits; // Sign splat.
39162 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39163 ShiftVal += Tmp;
39164 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39165 }
39166
39167 case X86ISD::FSETCC:
39168 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39169 if (VT == MVT::f32 || VT == MVT::f64 ||
39170 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39171 return VTBits;
39172 break;
39173
39174 case X86ISD::PCMPGT:
39175 case X86ISD::PCMPEQ:
39176 case X86ISD::CMPP:
39177 case X86ISD::VPCOM:
39178 case X86ISD::VPCOMU:
39179 // Vector compares return zero/all-bits result values.
39180 return VTBits;
39181
39182 case X86ISD::ANDNP: {
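// ANDNP = (~Op0 & Op1). Inversion preserves the sign-bit count, so the
// result has at least the smaller of the two operands' sign-bit counts.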
39183 unsigned Tmp0 =
39184 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39185 if (Tmp0 == 1) return 1; // Early out.
39186 unsigned Tmp1 =
39187 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39188 return std::min(Tmp0, Tmp1);
39189 }
39190
39191 case X86ISD::CMOV: {
39192 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39193 if (Tmp0 == 1) return 1; // Early out.
39194 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39195 return std::min(Tmp0, Tmp1);
39196 }
39197 }
39198
39199 // Handle target shuffles.
39200 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39201 if (isTargetShuffle(Opcode)) {
39202 SmallVector<int, 64> Mask;
39203 SmallVector<SDValue, 2> Ops;
39204 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39205 unsigned NumOps = Ops.size();
39206 unsigned NumElts = VT.getVectorNumElements();
39207 if (Mask.size() == NumElts) {
39208 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39209 for (unsigned i = 0; i != NumElts; ++i) {
39210 if (!DemandedElts[i])
39211 continue;
39212 int M = Mask[i];
39213 if (M == SM_SentinelUndef) {
39214 // For UNDEF elements, we don't know anything about the common state
39215 // of the shuffle result.
39216 return 1;
39217 } else if (M == SM_SentinelZero) {
39218 // Zero = all sign bits.
39219 continue;
39220 }
39221 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39222 "Shuffle index out of range");
39223
39224 unsigned OpIdx = (unsigned)M / NumElts;
39225 unsigned EltIdx = (unsigned)M % NumElts;
39226 if (Ops[OpIdx].getValueType() != VT) {
39227 // TODO - handle target shuffle ops with different value types.
39228 return 1;
39229 }
39230 DemandedOps[OpIdx].setBit(EltIdx);
39231 }
39232 unsigned Tmp0 = VTBits;
39233 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39234 if (!DemandedOps[i])
39235 continue;
39236 unsigned Tmp1 =
39237 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39238 Tmp0 = std::min(Tmp0, Tmp1);
39239 }
39240 return Tmp0;
39241 }
39242 }
39243 }
39244
39245 // Fallback case.
39246 return 1;
39247}
39248
39249 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39250 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39251 return N->getOperand(0);
39252 return N;
39253}
39254
39255// Helper to look for a normal load that can be narrowed into a vzload with the
39256// specified VT and memory VT. Returns SDValue() on failure.
39257 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39258 SelectionDAG &DAG) {
39259 // Can't if the load is volatile or atomic.
39260 if (!LN->isSimple())
39261 return SDValue();
39262
39263 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39264 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39265 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39266 LN->getPointerInfo(), LN->getBaseAlign(),
39267 LN->getMemOperand()->getFlags());
39268}
39269
39270// Attempt to match a combined shuffle mask against supported unary shuffle
39271// instructions.
39272// TODO: Investigate sharing more of this with shuffle lowering.
39273static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39274 bool AllowFloatDomain, bool AllowIntDomain,
39275 SDValue V1, const SelectionDAG &DAG,
39276 const X86Subtarget &Subtarget, unsigned &Shuffle,
39277 MVT &SrcVT, MVT &DstVT) {
39278 unsigned NumMaskElts = Mask.size();
39279 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39280
39281 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39282 if (Mask[0] == 0 &&
39283 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39284 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39285 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39286 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39287 Shuffle = X86ISD::VZEXT_MOVL;
39288 if (MaskEltSize == 16)
39289 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39290 else
39291 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39292 return true;
39293 }
39294 }
39295
39296 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39297 if (AllowIntDomain &&
39298 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39299 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39300 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39301 unsigned MaxScale = 64 / MaskEltSize;
39302 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39303 DAG.ComputeNumSignBits(V1) == MaskEltSize;
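// For each extension scale, result element i must take source element i;
// the remaining Scale-1 slots decide between ANY (undef), ZERO (undef/zero)
// and SIGN (copies of element i) extension.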
39304 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39305 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39306 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39307 continue;
39308 bool MatchAny = true;
39309 bool MatchZero = true;
39310 bool MatchSign = UseSign;
39311 unsigned NumDstElts = NumMaskElts / Scale;
39312 for (unsigned i = 0;
39313 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39314 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39315 MatchAny = MatchSign = MatchZero = false;
39316 break;
39317 }
39318 unsigned Pos = (i * Scale) + 1;
39319 unsigned Len = Scale - 1;
39320 MatchAny &= isUndefInRange(Mask, Pos, Len);
39321 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39322 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39323 }
39324 if (MatchAny || MatchSign || MatchZero) {
39325 assert((MatchSign || MatchZero) &&
39326 "Failed to match sext/zext but matched aext?");
39327 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39328 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39329 : MVT::getIntegerVT(MaskEltSize);
39330 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39331
39332 Shuffle = unsigned(
39333 MatchAny ? ISD::ANY_EXTEND
39334 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39335 if (SrcVT.getVectorNumElements() != NumDstElts)
39336 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39337
39338 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39339 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39340 return true;
39341 }
39342 }
39343 }
39344
39345 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39346 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39347 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39348 isUndefOrEqual(Mask[0], 0) &&
39349 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39350 Shuffle = X86ISD::VZEXT_MOVL;
39351 if (MaskEltSize == 16)
39352 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39353 else
39354 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39355 return true;
39356 }
39357
39358 // Check if we have SSE3, which lets us use MOVDDUP etc. These
39359 // instructions are no slower than UNPCKLPD but have the option to
39360 // fold the input operand into even an unaligned memory load.
39361 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39362 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39363 Shuffle = X86ISD::MOVDDUP;
39364 SrcVT = DstVT = MVT::v2f64;
39365 return true;
39366 }
39367 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39368 Shuffle = X86ISD::MOVSLDUP;
39369 SrcVT = DstVT = MVT::v4f32;
39370 return true;
39371 }
39372 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39373 Shuffle = X86ISD::MOVSHDUP;
39374 SrcVT = DstVT = MVT::v4f32;
39375 return true;
39376 }
39377 }
39378
39379 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39380 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39381 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39382 Shuffle = X86ISD::MOVDDUP;
39383 SrcVT = DstVT = MVT::v4f64;
39384 return true;
39385 }
39386 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39387 V1)) {
39388 Shuffle = X86ISD::MOVSLDUP;
39389 SrcVT = DstVT = MVT::v8f32;
39390 return true;
39391 }
39392 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39393 V1)) {
39394 Shuffle = X86ISD::MOVSHDUP;
39395 SrcVT = DstVT = MVT::v8f32;
39396 return true;
39397 }
39398 }
39399
39400 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39401 assert(Subtarget.hasAVX512() &&
39402 "AVX512 required for 512-bit vector shuffles");
39403 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39404 V1)) {
39405 Shuffle = X86ISD::MOVDDUP;
39406 SrcVT = DstVT = MVT::v8f64;
39407 return true;
39408 }
39409 if (isTargetShuffleEquivalent(
39410 MaskVT, Mask,
39411 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39412 Shuffle = X86ISD::MOVSLDUP;
39413 SrcVT = DstVT = MVT::v16f32;
39414 return true;
39415 }
39416 if (isTargetShuffleEquivalent(
39417 MaskVT, Mask,
39418 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39419 Shuffle = X86ISD::MOVSHDUP;
39420 SrcVT = DstVT = MVT::v16f32;
39421 return true;
39422 }
39423 }
39424
39425 return false;
39426}
39427
39428// Attempt to match a combined shuffle mask against supported unary immediate
39429// permute instructions.
39430// TODO: Investigate sharing more of this with shuffle lowering.
39431 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39432 const APInt &Zeroable,
39433 bool AllowFloatDomain, bool AllowIntDomain,
39434 const SelectionDAG &DAG,
39435 const X86Subtarget &Subtarget,
39436 unsigned &Shuffle, MVT &ShuffleVT,
39437 unsigned &PermuteImm) {
39438 unsigned NumMaskElts = Mask.size();
39439 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39440 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39441 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39442 bool ContainsZeros = isAnyZero(Mask);
39443
39444 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39445 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39446 // Check for lane crossing permutes.
39447 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39448 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39449 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39450 Shuffle = X86ISD::VPERMI;
39451 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39452 PermuteImm = getV4X86ShuffleImm(Mask);
39453 return true;
39454 }
39455 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39456 SmallVector<int, 4> RepeatedMask;
39457 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39458 Shuffle = X86ISD::VPERMI;
39459 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39460 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39461 return true;
39462 }
39463 }
39464 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39465 // VPERMILPD can permute with a non-repeating shuffle.
39466 Shuffle = X86ISD::VPERMILPI;
39467 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39468 PermuteImm = 0;
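// Immediate bit i selects the low (0) or high (1) f64 within element i's
// 128-bit lane.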
39469 for (int i = 0, e = Mask.size(); i != e; ++i) {
39470 int M = Mask[i];
39471 if (M == SM_SentinelUndef)
39472 continue;
39473 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39474 PermuteImm |= (M & 1) << i;
39475 }
39476 return true;
39477 }
39478 }
39479
39480 // We check for both a shuffle match and a shift match. Loop twice so we can
39481 // choose which to try to match first, depending on target preference.
39482 for (unsigned Order = 0; Order < 2; ++Order) {
39483 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39484 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39485 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39486 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39487 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39488 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39489 SmallVector<int, 4> RepeatedMask;
39490 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39491 // Narrow the repeated mask to create 32-bit element permutes.
39492 SmallVector<int, 4> WordMask = RepeatedMask;
39493 if (MaskScalarSizeInBits == 64)
39494 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39495
39496 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39497 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39498 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39499 PermuteImm = getV4X86ShuffleImm(WordMask);
39500 return true;
39501 }
39502 }
39503
39504 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39505 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39506 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39507 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39508 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39509 SmallVector<int, 4> RepeatedMask;
39510 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39511 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39512 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39513
39514 // PSHUFLW: permute lower 4 elements only.
39515 if (isUndefOrInRange(LoMask, 0, 4) &&
39516 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39517 Shuffle = X86ISD::PSHUFLW;
39518 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39519 PermuteImm = getV4X86ShuffleImm(LoMask);
39520 return true;
39521 }
39522
39523 // PSHUFHW: permute upper 4 elements only.
39524 if (isUndefOrInRange(HiMask, 4, 8) &&
39525 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39526 // Offset the HiMask so that we can create the shuffle immediate.
39527 int OffsetHiMask[4];
39528 for (int i = 0; i != 4; ++i)
39529 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39530
39531 Shuffle = X86ISD::PSHUFHW;
39532 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39533 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39534 return true;
39535 }
39536 }
39537 }
39538 } else {
39539 // Attempt to match against bit rotates.
39540 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39541 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39542 Subtarget.hasAVX512())) {
39543 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39544 Subtarget, Mask);
39545 if (0 < RotateAmt) {
39546 Shuffle = X86ISD::VROTLI;
39547 PermuteImm = (unsigned)RotateAmt;
39548 return true;
39549 }
39550 }
39551 }
39552 // Attempt to match against byte/bit shifts.
39553 if (AllowIntDomain &&
39554 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39555 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39556 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39557 int ShiftAmt =
39558 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39559 Zeroable, Subtarget);
39560 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39561 32 <= ShuffleVT.getScalarSizeInBits())) {
39562 // Byte shifts can be slower so only match them on second attempt.
39563 if (Order == 0 &&
39564 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39565 continue;
39566
39567 PermuteImm = (unsigned)ShiftAmt;
39568 return true;
39569 }
39570
39571 }
39572 }
39573
39574 return false;
39575}
39576
39577// Attempt to match a combined unary shuffle mask against supported binary
39578// shuffle instructions.
39579// TODO: Investigate sharing more of this with shuffle lowering.
39580static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39581 bool AllowFloatDomain, bool AllowIntDomain,
39582 SDValue &V1, SDValue &V2, const SDLoc &DL,
39583 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39584 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39585 bool IsUnary) {
39586 unsigned NumMaskElts = Mask.size();
39587 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39588 unsigned SizeInBits = MaskVT.getSizeInBits();
39589
39590 if (MaskVT.is128BitVector()) {
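// Match the classic 128-bit two-input moves: {0,0} -> MOVLHPS/UNPCKLPD,
// {1,1} -> MOVHLPS/UNPCKHPD, {0,3} -> MOVSD, {4,1,2,3} -> MOVSS and
// (FP16) {8,1,2,3,4,5,6,7} -> MOVSH.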
39591 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39592 AllowFloatDomain) {
39593 V2 = V1;
39594 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39595 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39596 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39597 return true;
39598 }
39599 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39600 AllowFloatDomain) {
39601 V2 = V1;
39602 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39603 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39604 return true;
39605 }
39606 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39607 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39608 std::swap(V1, V2);
39609 Shuffle = X86ISD::MOVSD;
39610 SrcVT = DstVT = MVT::v2f64;
39611 return true;
39612 }
39613 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39614 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39615 Shuffle = X86ISD::MOVSS;
39616 SrcVT = DstVT = MVT::v4f32;
39617 return true;
39618 }
39619 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39620 DAG) &&
39621 Subtarget.hasFP16()) {
39622 Shuffle = X86ISD::MOVSH;
39623 SrcVT = DstVT = MVT::v8f16;
39624 return true;
39625 }
39626 }
39627
39628 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39629 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39630 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39631 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39632 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39633 Subtarget)) {
39634 DstVT = MaskVT;
39635 return true;
39636 }
39637 }
39638 // TODO: Can we handle this inside matchShuffleWithPACK?
39639 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39640 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39641 V1.getScalarValueSizeInBits() == 64 &&
39642 V2.getScalarValueSizeInBits() == 64) {
39643 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
39644 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39645 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39646 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39647 SrcVT = MVT::v4i32;
39648 DstVT = MVT::v8i16;
39649 Shuffle = X86ISD::PACKUS;
39650 return true;
39651 }
39652 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
39653 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39654 SrcVT = MVT::v8i16;
39655 DstVT = MVT::v16i8;
39656 Shuffle = X86ISD::PACKUS;
39657 return true;
39658 }
39659 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39660 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39661 SrcVT = MVT::v4i32;
39662 DstVT = MVT::v8i16;
39663 Shuffle = X86ISD::PACKSS;
39664 return true;
39665 }
39666 }
39667
39668 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39669 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39670 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39671 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39672 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39673 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39674 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39675 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39676 Subtarget)) {
39677 SrcVT = DstVT = MaskVT;
39678 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39679 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39680 return true;
39681 }
39682 }
39683
39684 // Attempt to match against an OR if we're performing a blend shuffle and the
39685 // non-blended source element is zero in each case.
39686 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39687 if (SizeInBits == V1.getValueSizeInBits() &&
39688 SizeInBits == V2.getValueSizeInBits() &&
39689 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39690 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39691 bool IsBlend = true;
39692 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39693 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39694 unsigned Scale1 = NumV1Elts / NumMaskElts;
39695 unsigned Scale2 = NumV2Elts / NumMaskElts;
39696 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39697 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39698 for (unsigned i = 0; i != NumMaskElts; ++i) {
39699 int M = Mask[i];
39700 if (M == SM_SentinelUndef)
39701 continue;
39702 if (M == SM_SentinelZero) {
39703 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39704 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39705 continue;
39706 }
39707 if (M == (int)i) {
39708 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39709 continue;
39710 }
39711 if (M == (int)(i + NumMaskElts)) {
39712 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39713 continue;
39714 }
39715 IsBlend = false;
39716 break;
39717 }
39718 if (IsBlend) {
39719 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39720 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39721 Shuffle = ISD::OR;
39722 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39723 return true;
39724 }
39725 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39726 // FIXME: handle mismatched sizes?
39727 // TODO: investigate if `ISD::OR` handling in
39728 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
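// Conservatively summarize each vector element as known-all-zeros or
// known-all-ones so the per-element blend check below can use it.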
39729 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39730 unsigned NumElts = V.getValueType().getVectorNumElements();
39731 KnownBits Known(NumElts);
39732 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39733 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39734 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39735 if (PeepholeKnown.isZero())
39736 Known.Zero.setBit(EltIdx);
39737 if (PeepholeKnown.isAllOnes())
39738 Known.One.setBit(EltIdx);
39739 }
39740 return Known;
39741 };
39742
39743 KnownBits V1Known = computeKnownBitsElementWise(V1);
39744 KnownBits V2Known = computeKnownBitsElementWise(V2);
39745
39746 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39747 int M = Mask[i];
39748 if (M == SM_SentinelUndef)
39749 continue;
39750 if (M == SM_SentinelZero) {
39751 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39752 continue;
39753 }
39754 if (M == (int)i) {
39755 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39756 continue;
39757 }
39758 if (M == (int)(i + NumMaskElts)) {
39759 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39760 continue;
39761 }
39762 llvm_unreachable("will not get here.");
39763 }
39764 if (IsBlend) {
39765 Shuffle = ISD::OR;
39766 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39767 return true;
39768 }
39769 }
39770 }
39771 }
39772
39773 return false;
39774}
39775
39776 static bool matchBinaryPermuteShuffle(
39777 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39778 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39779 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39780 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39781 unsigned NumMaskElts = Mask.size();
39782 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39783
39784 // Attempt to match against VALIGND/VALIGNQ rotate.
39785 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39786 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39787 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39788 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39789 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39790 MaskVT.getSizeInBits() / EltSizeInBits);
39791 if (!isAnyZero(Mask)) {
39792 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39793 if (0 < Rotation) {
39794 Shuffle = X86ISD::VALIGN;
39795 ShuffleVT = AlignVT;
39796 PermuteImm = Rotation;
39797 return true;
39798 }
39799 }
39800 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39801 unsigned ZeroLo = Zeroable.countr_one();
39802 unsigned ZeroHi = Zeroable.countl_one();
39803 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39804 if (ZeroLo) {
39805 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39806 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39807 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39808 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39809 Shuffle = X86ISD::VALIGN;
39810 ShuffleVT = AlignVT;
39811 PermuteImm = NumMaskElts - ZeroLo;
39812 return true;
39813 }
39814 }
39815 if (ZeroHi) {
39816 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39817 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39818 ZeroHi);
39819 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39820 V2 = V1;
39821 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39822 Shuffle = X86ISD::VALIGN;
39823 ShuffleVT = AlignVT;
39824 PermuteImm = ZeroHi;
39825 return true;
39826 }
39827 }
39828 }
39829
39830 // Attempt to match against PALIGNR byte rotate.
39831 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39832 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39833 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39834 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39835 if (0 < ByteRotation) {
39836 Shuffle = X86ISD::PALIGNR;
39837 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39838 PermuteImm = ByteRotation;
39839 return true;
39840 }
39841 }
39842
39843 // Attempt to combine to X86ISD::BLENDI.
39844 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39845 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39846 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39847 uint64_t BlendMask = 0;
39848 bool ForceV1Zero = false, ForceV2Zero = false;
39849 SmallVector<int, 8> TargetMask(Mask);
39850 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39851 ForceV2Zero, BlendMask)) {
39852 if (MaskVT == MVT::v16i16) {
39853 // We can only use v16i16 PBLENDW if the lanes are repeated.
39854 SmallVector<int, 8> RepeatedMask;
39855 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39856 RepeatedMask)) {
39857 assert(RepeatedMask.size() == 8 &&
39858 "Repeated mask size doesn't match!");
39859 PermuteImm = 0;
39860 for (int i = 0; i < 8; ++i)
39861 if (RepeatedMask[i] >= 8)
39862 PermuteImm |= 1 << i;
39863 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39864 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39865 Shuffle = X86ISD::BLENDI;
39866 ShuffleVT = MaskVT;
39867 return true;
39868 }
39869 } else {
39870 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39871 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39872 PermuteImm = (unsigned)BlendMask;
39873 Shuffle = X86ISD::BLENDI;
39874 ShuffleVT = MaskVT;
39875 return true;
39876 }
39877 }
39878 }
39879
39880 // Attempt to combine to INSERTPS, but only if it has elements that need to
39881 // be set to zero.
39882 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39883 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39884 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39885 Shuffle = X86ISD::INSERTPS;
39886 ShuffleVT = MVT::v4f32;
39887 return true;
39888 }
39889
39890 // Attempt to combine to SHUFPD.
39891 if (AllowFloatDomain && EltSizeInBits == 64 &&
39892 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39893 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39894 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39895 bool ForceV1Zero = false, ForceV2Zero = false;
39896 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39897 PermuteImm, Mask, Zeroable)) {
39898 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39899 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39900 Shuffle = X86ISD::SHUFP;
39901 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39902 return true;
39903 }
39904 }
39905
39906 // Attempt to combine to SHUFPS.
39907 if (AllowFloatDomain && EltSizeInBits == 32 &&
39908 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39909 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39910 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39911 SmallVector<int, 4> RepeatedMask;
39912 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39913 // Match each half of the repeated mask to determine whether it just
39914 // references one of the vectors, is zeroable, or is entirely undef.
39915 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39916 int M0 = RepeatedMask[Offset];
39917 int M1 = RepeatedMask[Offset + 1];
39918
39919 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39920 return DAG.getUNDEF(MaskVT);
39921 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39922 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39923 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39924 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39925 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39926 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39927 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39928 return V1;
39929 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39930 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39931 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39932 return V2;
39933 }
39934
39935 return SDValue();
39936 };
39937
39938 int ShufMask[4] = {-1, -1, -1, -1};
39939 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39940 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39941
39942 if (Lo && Hi) {
39943 V1 = Lo;
39944 V2 = Hi;
39945 Shuffle = X86ISD::SHUFP;
39946 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39947 PermuteImm = getV4X86ShuffleImm(ShufMask);
39948 return true;
39949 }
39950 }
39951 }
39952
39953 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39954 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39955 MaskVT.is128BitVector() &&
39956 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39957 Shuffle = X86ISD::INSERTPS;
39958 ShuffleVT = MVT::v4f32;
39959 return true;
39960 }
39961
39962 return false;
39963}
39964
39965 static SDValue combineX86ShuffleChainWithExtract(
39966 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39967 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39968 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39969 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39970 const X86Subtarget &Subtarget);
39971
39972/// Combine an arbitrary chain of shuffles into a single instruction if
39973/// possible.
39974///
39975/// This is the leaf of the recursive combine below. When we have found some
39976/// chain of single-use x86 shuffle instructions and accumulated the combined
39977/// shuffle mask represented by them, this will try to pattern match that mask
39978/// into either a single instruction if there is a special purpose instruction
39979/// for this operation, or into a PSHUFB instruction which is a fully general
39980/// instruction but should only be used to replace chains over a certain depth.
39981 static SDValue combineX86ShuffleChain(
39982 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39983 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39984 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39985 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39986 const X86Subtarget &Subtarget) {
39987 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39988 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39989 "Unexpected number of shuffle inputs!");
39990 unsigned RootSizeInBits = RootVT.getSizeInBits();
39991 unsigned NumRootElts = RootVT.getVectorNumElements();
39992
39993 // Canonicalize shuffle input op to the requested type.
39994 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39995 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39996 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39997 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39998 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39999 return DAG.getBitcast(VT, Op);
40000 };
40001
40002 // Find the inputs that enter the chain. Note that multiple uses are OK
40003 // here, we're not going to remove the operands we find.
40004 bool UnaryShuffle = (Inputs.size() == 1);
40005 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40006 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40007 : peekThroughBitcasts(Inputs[1]));
40008
40009 MVT VT1 = V1.getSimpleValueType();
40010 MVT VT2 = V2.getSimpleValueType();
40011 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40012 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40013
40014 SDValue Res;
40015
40016 unsigned NumBaseMaskElts = BaseMask.size();
40017 if (NumBaseMaskElts == 1) {
40018 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40019 return CanonicalizeShuffleInput(RootVT, V1);
40020 }
40021
40022 bool OptForSize = DAG.shouldOptForSize();
40023 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40024 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40025 (RootVT.isFloatingPoint() && Depth >= 1) ||
40026 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40027
40028 // If we are shuffling a splat (and not introducing zeros) then we can just
40029 // use it directly. This works for smaller elements as well, as they already
40030 // repeat across each mask element.
40031 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40032 V1.getValueSizeInBits() >= RootSizeInBits &&
40033 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40034 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40035 return CanonicalizeShuffleInput(RootVT, V1);
40036 }
40037
40038 SmallVector<int, 64> Mask(BaseMask);
40039
40040 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40041 // etc. can be simplified.
40042 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40043 SmallVector<int> ScaledMask, IdentityMask;
40044 unsigned NumElts = VT1.getVectorNumElements();
40045 if (Mask.size() <= NumElts &&
40046 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40047 for (unsigned i = 0; i != NumElts; ++i)
40048 IdentityMask.push_back(i);
40049 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40050 V2))
40051 return CanonicalizeShuffleInput(RootVT, V1);
40052 }
40053 }
40054
40055 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40056 if (RootVT.is512BitVector() &&
40057 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40058 // If the upper subvectors are zeroable, then an extract+insert is cheaper
40059 // than using X86ISD::SHUF128. The insertion is free, even if it has to
40060 // zero the upper subvectors.
40061 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40062 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40063 return SDValue(); // Nothing to do!
40064 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40065 "Unexpected lane shuffle");
40066 Res = CanonicalizeShuffleInput(RootVT, V1);
40067 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40068 bool UseZero = isAnyZero(Mask);
40069 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40070 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40071 }
40072
40073 // Narrow shuffle mask to v4x128.
40074 SmallVector<int, 4> ScaledMask;
40075 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40076 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40077
40078 // Try to lower to vshuf64x2/vshuf32x4.
40079 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40080 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40081 SelectionDAG &DAG) {
40082 int PermMask[4] = {-1, -1, -1, -1};
40083 // Ensure elements came from the same Op.
40084 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40085 for (int i = 0; i < 4; ++i) {
40086 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40087 if (ScaledMask[i] < 0)
40088 continue;
40089
40090 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40091 unsigned OpIndex = i / 2;
40092 if (Ops[OpIndex].isUndef())
40093 Ops[OpIndex] = Op;
40094 else if (Ops[OpIndex] != Op)
40095 return SDValue();
40096
40097 PermMask[i] = ScaledMask[i] % 4;
40098 }
40099
40100 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40101 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40102 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40103 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40104 };
40105
40106 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40107 // doesn't work because our mask is for 128 bits and we don't have an MVT
40108 // to match that.
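// PreferPERMQ: unary shuffles where each 128-bit lane stays within its own
// 256-bit half and both halves use the same lane pattern are better served
// by a repeated-lane permute (VPERMQ/VPERMPD) than by SHUF128.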
40109 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40110 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40111 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40112 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40113 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40114 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40115 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40116 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40117 ScaledMask[1] == (ScaledMask[3] % 2));
40118
40119 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40120 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40121 return SDValue(); // Nothing to do!
40122 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40123 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40124 return DAG.getBitcast(RootVT, V);
40125 }
40126 }
40127
40128 // Handle 128-bit lane shuffles of 256-bit vectors.
40129 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40130 // If the upper half is zeroable, then an extract+insert is cheaper than
40131 // using X86ISD::VPERM2X128. The insertion is free, even if it has to
40132 // zero the upper half.
40133 if (isUndefOrZero(Mask[1])) {
40134 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40135 return SDValue(); // Nothing to do!
40136 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40137 Res = CanonicalizeShuffleInput(RootVT, V1);
40138 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40139 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40140 256);
40141 }
40142
40143 // If we're inserting the low subvector, an insert-subvector 'concat'
40144 // pattern is quicker than VPERM2X128.
40145 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40146 !Subtarget.hasAVX2()) {
40147 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40148 return SDValue(); // Nothing to do!
40149 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40150 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40151 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40152 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40153 }
40154
40155 // Don't lower to VPERM2X128 here if we have AVX2+; prefer to use
40156 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40157 // feature.
40158 // Prefer blends for sequential shuffles unless we are optimizing for size.
40159 if (UnaryShuffle &&
40160 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40161 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40162 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40163 return SDValue(); // Nothing to do!
40164 unsigned PermMask = 0;
40165 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40166 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40167 return DAG.getNode(
40168 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40169 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40170 }
40171
40172 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40173 return SDValue(); // Nothing to do!
40174
40175 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40176 if (!UnaryShuffle && !IsMaskedShuffle) {
40177 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40178 "Unexpected shuffle sentinel value");
40179 // Prefer blends to X86ISD::VPERM2X128.
40180 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40181 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40182 return SDValue(); // Nothing to do!
40183 unsigned PermMask = 0;
40184 PermMask |= ((Mask[0] & 3) << 0);
40185 PermMask |= ((Mask[1] & 3) << 4);
40186 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40187 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40188 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40189 CanonicalizeShuffleInput(RootVT, LHS),
40190 CanonicalizeShuffleInput(RootVT, RHS),
40191 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40192 }
40193 }
40194 }
40195
40196 // For masks that have been widened to 128-bit elements or more,
40197 // narrow back down to 64-bit elements.
40198 if (BaseMaskEltSizeInBits > 64) {
40199 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40200 int MaskScale = BaseMaskEltSizeInBits / 64;
40201 SmallVector<int, 64> ScaledMask;
40202 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40203 Mask = std::move(ScaledMask);
40204 }
40205
40206 // For masked shuffles, we're trying to match the root width for better
40207 // writemask folding; attempt to scale the mask accordingly.
40208 // TODO - variable shuffles might need this to be widened again.
40209 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40210 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40211 int MaskScale = NumRootElts / Mask.size();
40212 SmallVector<int, 64> ScaledMask;
40213 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40214 Mask = std::move(ScaledMask);
40215 }
40216
40217 unsigned NumMaskElts = Mask.size();
40218 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40219 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40220
40221 // Determine the effective mask value type.
40222 FloatDomain &= (32 <= MaskEltSizeInBits);
40223 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40224 : MVT::getIntegerVT(MaskEltSizeInBits);
40225 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40226
40227 // Only allow legal mask types.
40228 if (!TLI.isTypeLegal(MaskVT))
40229 return SDValue();
40230
40231 // Attempt to match the mask against known shuffle patterns.
40232 MVT ShuffleSrcVT, ShuffleVT;
40233 unsigned Shuffle, PermuteImm;
40234
40235 // Which shuffle domains are permitted?
40236 // Permit domain crossing at higher combine depths.
40237 // TODO: Should we indicate which domain is preferred if both are allowed?
40238 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40239 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40240 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40241
40242 // Determine zeroable mask elements.
40243 APInt KnownUndef, KnownZero;
40244 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40245 APInt Zeroable = KnownUndef | KnownZero;
40246
40247 if (UnaryShuffle) {
40248 // Attempt to match against broadcast-from-vector.
40249 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40250 if ((Subtarget.hasAVX2() ||
40251 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40252 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40253 if (isUndefOrEqual(Mask, 0)) {
40254 if (V1.getValueType() == MaskVT &&
40255 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40256 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40257 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40258 return SDValue(); // Nothing to do!
40259 Res = V1.getOperand(0);
40260 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40261 return DAG.getBitcast(RootVT, Res);
40262 }
40263 if (Subtarget.hasAVX2()) {
40264 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40265 return SDValue(); // Nothing to do!
40266 Res = CanonicalizeShuffleInput(MaskVT, V1);
40267 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40268 return DAG.getBitcast(RootVT, Res);
40269 }
40270 }
40271 }
40272
40273 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40274 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40275 (!IsMaskedShuffle ||
40276 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40277 if (Depth == 0 && RootOpc == Shuffle)
40278 return SDValue(); // Nothing to do!
40279 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40280 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40281 return DAG.getBitcast(RootVT, Res);
40282 }
40283
40284 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40285 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40286 PermuteImm) &&
40287 (!IsMaskedShuffle ||
40288 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40289 if (Depth == 0 && RootOpc == Shuffle)
40290 return SDValue(); // Nothing to do!
40291 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40292 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40293 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40294 return DAG.getBitcast(RootVT, Res);
40295 }
40296 }
40297
40298 // Attempt to combine to INSERTPS, but only if the inserted element has come
40299 // from a scalar.
40300 // TODO: Handle other insertions here as well?
40301 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40302 Subtarget.hasSSE41() &&
40303 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40304 if (MaskEltSizeInBits == 32) {
40305 SDValue SrcV1 = V1, SrcV2 = V2;
40306 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40307 DAG) &&
40308 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40309 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40310 return SDValue(); // Nothing to do!
40311 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40312 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40313 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40314 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40315 return DAG.getBitcast(RootVT, Res);
40316 }
40317 }
40318 if (MaskEltSizeInBits == 64 &&
40319 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40320 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40321 V2.getScalarValueSizeInBits() <= 32) {
40322 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40323 return SDValue(); // Nothing to do!
40324 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40325 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40326 CanonicalizeShuffleInput(MVT::v4f32, V1),
40327 CanonicalizeShuffleInput(MVT::v4f32, V2),
40328 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40329 return DAG.getBitcast(RootVT, Res);
40330 }
40331 }
40332
40333 SDValue NewV1 = V1; // Save operands in case early exit happens.
40334 SDValue NewV2 = V2;
40335 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40336 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40337 ShuffleVT, UnaryShuffle) &&
40338 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40339 if (Depth == 0 && RootOpc == Shuffle)
40340 return SDValue(); // Nothing to do!
40341 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40342 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40343 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40344 return DAG.getBitcast(RootVT, Res);
40345 }
40346
40347 NewV1 = V1; // Save operands in case early exit happens.
40348 NewV2 = V2;
40349 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40350 AllowIntDomain, NewV1, NewV2, DL, DAG,
40351 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40352 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40353 if (Depth == 0 && RootOpc == Shuffle)
40354 return SDValue(); // Nothing to do!
40355 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40356 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40357 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40358 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40359 return DAG.getBitcast(RootVT, Res);
40360 }
40361
40362 // Typically from here on, we need an integer version of MaskVT.
40363 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40364 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40365
40366 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40367 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40368 uint64_t BitLen, BitIdx;
40369 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40370 Zeroable)) {
40371 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40372 return SDValue(); // Nothing to do!
40373 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40374 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40375 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40376 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40377 return DAG.getBitcast(RootVT, Res);
40378 }
40379
40380 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40381 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40382 return SDValue(); // Nothing to do!
40383 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40384 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40385 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40386 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40387 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40388 return DAG.getBitcast(RootVT, Res);
40389 }
40390 }
40391
40392 // Match shuffle against TRUNCATE patterns.
40393 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40394 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40395 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40396 Subtarget)) {
40397 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40398 ShuffleSrcVT.getVectorNumElements();
40399 unsigned Opc =
40400 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40401 if (Depth == 0 && RootOpc == Opc)
40402 return SDValue(); // Nothing to do!
40403 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40404 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40405 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40406 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40407 return DAG.getBitcast(RootVT, Res);
40408 }
40409
40410 // Do we need a more general binary truncation pattern?
40411 if (RootSizeInBits < 512 &&
40412 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40413 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40414 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40415 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40416 // Bail if this was already a truncation or PACK node.
40417 // We sometimes fail to match PACK if we demand known undef elements.
40418 if (Depth == 0 &&
40419 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40420 RootOpc == X86ISD::PACKUS))
40421 return SDValue(); // Nothing to do!
40422 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40423 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40424 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40425 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40426 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40427 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40428 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40429 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40430 return DAG.getBitcast(RootVT, Res);
40431 }
40432 }
40433
40434 // Don't try to re-form single instruction chains under any circumstances now
40435 // that we've done encoding canonicalization for them.
40436 if (Depth < 1)
40437 return SDValue();
40438
40439 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40440 return isTargetShuffleVariableMask(N->getOpcode());
40441 });
40442 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40443 return (N->getOpcode() == X86ISD::VPERMV3 ||
40444 N->getOpcode() == X86ISD::VPERMV);
40445 });
40446
40447 // Depth threshold above which we can efficiently use variable mask shuffles.
40448 int VariableCrossLaneShuffleDepth =
40449 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40450 int VariablePerLaneShuffleDepth =
40451 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40452 AllowVariableCrossLaneMask &=
40453 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40454 AllowVariablePerLaneMask &=
40455 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40456 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40457 // higher depth before combining them.
40458 int BWIVPERMV3ShuffleDepth =
40459 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40460 bool AllowBWIVPERMV3 =
40461 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40462
40463 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40464 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40465 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40466
40467 bool MaskContainsZeros = isAnyZero(Mask);
40468
40469 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40470 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40471 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40472 if (Subtarget.hasAVX2() &&
40473 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40474 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40475 Res = CanonicalizeShuffleInput(MaskVT, V1);
40476 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40477 return DAG.getBitcast(RootVT, Res);
40478 }
40479 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40480 if ((Subtarget.hasAVX512() &&
40481 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40482 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40483 (Subtarget.hasBWI() &&
40484 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40485 (Subtarget.hasVBMI() &&
40486 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40487 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40488 V2 = DAG.getUNDEF(MaskVT);
40489 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40490 return DAG.getBitcast(RootVT, Res);
40491 }
40492 }
40493
40494 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40495 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40496 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40497 ((Subtarget.hasAVX512() &&
40498 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40499 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40500 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40501 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40502 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40503 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40504 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40505 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40506 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40507 for (unsigned i = 0; i != NumMaskElts; ++i)
40508 if (Mask[i] == SM_SentinelZero)
40509 Mask[i] = NumMaskElts + i;
40510 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40511 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40512 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40513 return DAG.getBitcast(RootVT, Res);
40514 }
40515
40516 // If that failed and either input is extracted then try to combine as a
40517 // shuffle with the larger type.
40518 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40519 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40520 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40521 IsMaskedShuffle, DAG, DL, Subtarget))
40522 return WideShuffle;
40523
40524 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40525 // (non-VLX will pad to 512-bit shuffles).
40526 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40527 ((Subtarget.hasAVX512() &&
40528 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40529 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40530 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40531 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40532 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40533 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40534 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40535 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40536 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40537 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40538 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40539 return DAG.getBitcast(RootVT, Res);
40540 }
40541 return SDValue();
40542 }
40543
40544 // See if we can combine a single input shuffle with zeros to a bit-mask,
40545 // which is much simpler than any shuffle.
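 // For example, the in-place mask <0,Z,2,3> only zeroes element 1, so it is
 // equivalent to an AND with the constant vector <-1,0,-1,-1>.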
40546 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40547 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40548 TLI.isTypeLegal(MaskVT)) {
40549 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40550 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40551 APInt UndefElts(NumMaskElts, 0);
40552 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40553 for (unsigned i = 0; i != NumMaskElts; ++i) {
40554 int M = Mask[i];
40555 if (M == SM_SentinelUndef) {
40556 UndefElts.setBit(i);
40557 continue;
40558 }
40559 if (M == SM_SentinelZero)
40560 continue;
40561 EltBits[i] = AllOnes;
40562 }
40563 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40564 Res = CanonicalizeShuffleInput(MaskVT, V1);
40565 unsigned AndOpcode =
40566 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40567 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40568 return DAG.getBitcast(RootVT, Res);
40569 }
40570
40571 // If we have a single input shuffle with different shuffle patterns in
40572 // the 128-bit lanes, use the variable mask to VPERMILPS.
40573 // TODO: Combine other mask types at higher depths.
40574 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40575 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40576 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40577 SmallVector<SDValue, 16> VPermIdx;
40578 for (int M : Mask) {
40579 SDValue Idx =
40580 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40581 VPermIdx.push_back(Idx);
40582 }
40583 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40584 Res = CanonicalizeShuffleInput(MaskVT, V1);
40585 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40586 return DAG.getBitcast(RootVT, Res);
40587 }
40588
40589 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40590 // to VPERMIL2PD/VPERMIL2PS.
40591 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40592 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40593 MaskVT == MVT::v8f32)) {
40594 // VPERMIL2 Operation.
40595 // Bits[3] - Match Bit.
40596 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40597 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
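 // Roughly: each selector picks a lane-relative element, the source-select
 // bit chooses between the two inputs, and pushing index 8 with M2Z=2 below
 // forces that element to zero.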
40598 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40599 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40600 SmallVector<int, 8> VPerm2Idx;
40601 unsigned M2ZImm = 0;
40602 for (int M : Mask) {
40603 if (M == SM_SentinelUndef) {
40604 VPerm2Idx.push_back(-1);
40605 continue;
40606 }
40607 if (M == SM_SentinelZero) {
40608 M2ZImm = 2;
40609 VPerm2Idx.push_back(8);
40610 continue;
40611 }
40612 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40613 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40614 VPerm2Idx.push_back(Index);
40615 }
40616 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40617 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40618 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40619 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40620 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40621 return DAG.getBitcast(RootVT, Res);
40622 }
40623
40624 // If we have 3 or more shuffle instructions or a chain involving a variable
40625 // mask, we can replace them with a single PSHUFB instruction profitably.
40626 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40627 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40628 // more aggressive.
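 // For example, a v4i32 mask <1,u,Z,3> becomes the byte mask
 // <4,5,6,7, u,u,u,u, 0x80,0x80,0x80,0x80, 12,13,14,15>, where 0x80 zeroes
 // the destination byte.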
40629 if (UnaryShuffle && AllowVariablePerLaneMask &&
40630 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40631 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40632 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40633 SmallVector<SDValue, 16> PSHUFBMask;
40634 int NumBytes = RootVT.getSizeInBits() / 8;
40635 int Ratio = NumBytes / NumMaskElts;
40636 for (int i = 0; i < NumBytes; ++i) {
40637 int M = Mask[i / Ratio];
40638 if (M == SM_SentinelUndef) {
40639 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40640 continue;
40641 }
40642 if (M == SM_SentinelZero) {
40643 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40644 continue;
40645 }
40646 M = Ratio * M + i % Ratio;
40647 assert((M / 16) == (i / 16) && "Lane crossing detected");
40648 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40649 }
40650 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40651 Res = CanonicalizeShuffleInput(ByteVT, V1);
40652 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40653 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40654 return DAG.getBitcast(RootVT, Res);
40655 }
40656
40657 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40658 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40659 // slower than PSHUFB on targets that support both.
40660 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40661 Subtarget.hasXOP()) {
40662 // VPPERM Mask Operation
40663 // Bits[4:0] - Byte Index (0 - 31)
40664 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
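 // For example, selector 0x03 copies byte 3 of the first source, 0x13 copies
 // byte 3 of the second source, and 0x80 (operation field 4) writes a zero.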
40665 SmallVector<SDValue, 16> VPPERMMask;
40666 int NumBytes = 16;
40667 int Ratio = NumBytes / NumMaskElts;
40668 for (int i = 0; i < NumBytes; ++i) {
40669 int M = Mask[i / Ratio];
40670 if (M == SM_SentinelUndef) {
40671 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40672 continue;
40673 }
40674 if (M == SM_SentinelZero) {
40675 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40676 continue;
40677 }
40678 M = Ratio * M + i % Ratio;
40679 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40680 }
40681 MVT ByteVT = MVT::v16i8;
40682 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40683 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40684 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40685 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40686 return DAG.getBitcast(RootVT, Res);
40687 }
40688
40689 // If that failed and either input is extracted then try to combine as a
40690 // shuffle with the larger type.
40691 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40692 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40693 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40694 DAG, DL, Subtarget))
40695 return WideShuffle;
40696
40697 // If we have a dual input shuffle then lower to VPERMV3,
40698 // (non-VLX will pad to 512-bit shuffles)
40699 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40700 ((Subtarget.hasAVX512() &&
40701 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40702 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40703 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40704 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40705 MaskVT == MVT::v16i32)) ||
40706 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40707 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40708 MaskVT == MVT::v32i16)) ||
40709 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40710 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40711 MaskVT == MVT::v64i8)))) {
40712 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40713 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40714 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40715 return DAG.getBitcast(RootVT, Res);
40716 }
40717
40718 // Failed to find any combines.
40719 return SDValue();
40720}
40721
40722// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40723// instruction if possible.
40724//
40725// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40726// type size to attempt to combine:
40727// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40728// -->
40729// extract_subvector(shuffle(x,y,m2),0)
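 // For example, a shuffle of two v4f32 halves extracted from the same v8f32
 // value can usually be rewritten as a single v8f32 shuffle of that value
 // followed by an extract of the low 128 bits.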
40730static SDValue combineX86ShuffleChainWithExtract(
40731 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40732 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40733 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40734 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40735 const X86Subtarget &Subtarget) {
40736 unsigned NumMaskElts = BaseMask.size();
40737 unsigned NumInputs = Inputs.size();
40738 if (NumInputs == 0)
40739 return SDValue();
40740
40741 unsigned RootSizeInBits = RootVT.getSizeInBits();
40742 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40743 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40744
40745 // Peek through subvectors to find widest legal vector.
40746 // TODO: Handle ISD::TRUNCATE
40747 unsigned WideSizeInBits = RootSizeInBits;
40748 for (SDValue Input : Inputs) {
40749 Input = peekThroughBitcasts(Input);
40750 while (1) {
40751 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40752 Input = peekThroughBitcasts(Input.getOperand(0));
40753 continue;
40754 }
40755 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40756 Input.getOperand(0).isUndef() &&
40757 isNullConstant(Input.getOperand(2))) {
40758 Input = peekThroughBitcasts(Input.getOperand(1));
40759 continue;
40760 }
40761 break;
40762 }
40763 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40764 WideSizeInBits < Input.getValueSizeInBits())
40765 WideSizeInBits = Input.getValueSizeInBits();
40766 }
40767
40768 // Bail if we fail to find a source larger than the existing root.
40769 if (WideSizeInBits <= RootSizeInBits ||
40770 (WideSizeInBits % RootSizeInBits) != 0)
40771 return SDValue();
40772
40773 // Create new mask for larger type.
40774 SmallVector<int, 64> WideMask;
40775 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40776
40777 // Attempt to peek through inputs and adjust mask when we extract from an
40778 // upper subvector.
40779 int AdjustedMasks = 0;
40780 SmallVector<SDValue, 4> WideInputs(Inputs);
40781 for (unsigned I = 0; I != NumInputs; ++I) {
40782 SDValue &Input = WideInputs[I];
40783 Input = peekThroughBitcasts(Input);
40784 while (1) {
40785 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40786 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40787 uint64_t Idx = Input.getConstantOperandVal(1);
40788 if (Idx != 0) {
40789 ++AdjustedMasks;
40790 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40791 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40792
40793 int lo = I * WideMask.size();
40794 int hi = (I + 1) * WideMask.size();
40795 for (int &M : WideMask)
40796 if (lo <= M && M < hi)
40797 M += Idx;
40798 }
40799 Input = peekThroughBitcasts(Input.getOperand(0));
40800 continue;
40801 }
40802 // TODO: Handle insertions into upper subvectors.
40803 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40804 Input.getOperand(0).isUndef() &&
40805 isNullConstant(Input.getOperand(2))) {
40806 Input = peekThroughBitcasts(Input.getOperand(1));
40807 continue;
40808 }
40809 break;
40810 }
40811 }
40812
40813 // Remove unused/repeated shuffle source ops.
40814 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40815 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40816
40817 // Bail if we're always extracting from the lowest subvectors (in which case
40818 // combineX86ShuffleChain should match this for the current width), or if the
40819 // shuffle still references too many inputs.
40820 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40821 return SDValue();
40822
40823 // Minor canonicalization of the accumulated shuffle mask to make it easier
40824 // to match below. All this does is detect masks with sequential pairs of
40825 // elements, and shrink them to the half-width mask. It does this in a loop
40826 // so it will reduce the size of the mask to the minimal width mask which
40827 // performs an equivalent shuffle.
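 // For example, the v4 mask <2,3,0,1> consists of sequential pairs and is
 // shrunk to the equivalent v2 mask <1,0>.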
40828 while (WideMask.size() > 1) {
40829 SmallVector<int, 64> WidenedMask;
40830 if (!canWidenShuffleElements(WideMask, WidenedMask))
40831 break;
40832 WideMask = std::move(WidenedMask);
40833 }
40834
40835 // Canonicalization of binary shuffle masks to improve pattern matching by
40836 // commuting the inputs.
40837 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40838 ShuffleVectorSDNode::commuteMask(WideMask);
40839 std::swap(WideInputs[0], WideInputs[1]);
40840 }
40841
40842 // Increase depth for every upper subvector we've peeked through.
40843 Depth += AdjustedMasks;
40844
40845 // Attempt to combine wider chain.
40846 // TODO: Can we use a better Root?
40847 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40848 WideInputs.back().getValueSizeInBits()
40849 ? WideInputs.front()
40850 : WideInputs.back();
40851 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40852 "WideRootSize mismatch");
40853
40854 if (SDValue WideShuffle = combineX86ShuffleChain(
40855 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40856 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40857 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40858 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40859 return DAG.getBitcast(RootVT, WideShuffle);
40860 }
40861
40862 return SDValue();
40863}
40864
40865// Canonicalize the combined shuffle mask chain with horizontal ops.
40866// NOTE: This may update the Ops and Mask.
40867static SDValue canonicalizeShuffleMaskWithHorizOp(
40868 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40869 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40870 const X86Subtarget &Subtarget) {
40871 if (Mask.empty() || Ops.empty())
40872 return SDValue();
40873
40874 SmallVector<SDValue> BC;
40875 for (SDValue Op : Ops)
40876 BC.push_back(peekThroughBitcasts(Op));
40877
40878 // All ops must be the same horizop + type.
40879 SDValue BC0 = BC[0];
40880 EVT VT0 = BC0.getValueType();
40881 unsigned Opcode0 = BC0.getOpcode();
40882 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40883 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40884 }))
40885 return SDValue();
40886
40887 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40888 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40889 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40890 if (!isHoriz && !isPack)
40891 return SDValue();
40892
40893 // Do all ops have a single use?
40894 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40895 return Op.hasOneUse() &&
40896 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40897 });
40898
40899 int NumElts = VT0.getVectorNumElements();
40900 int NumLanes = VT0.getSizeInBits() / 128;
40901 int NumEltsPerLane = NumElts / NumLanes;
40902 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40903 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40904 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40905
40906 if (NumEltsPerLane >= 4 &&
40907 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40908 SmallVector<int> LaneMask, ScaledMask;
40909 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40910 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40911 // See if we can remove the shuffle by resorting the HOP chain so that
40912 // the HOP args are pre-shuffled.
40913 // TODO: Generalize to any sized/depth chain.
40914 // TODO: Add support for PACKSS/PACKUS.
40915 if (isHoriz) {
40916 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40917 auto GetHOpSrc = [&](int M) {
40918 if (M == SM_SentinelUndef)
40919 return DAG.getUNDEF(VT0);
40920 if (M == SM_SentinelZero)
40921 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40922 SDValue Src0 = BC[M / 4];
40923 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40924 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40925 return Src1.getOperand(M % 2);
40926 return SDValue();
40927 };
40928 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40929 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40930 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40931 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40932 if (M0 && M1 && M2 && M3) {
40933 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40934 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40935 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40936 }
40937 }
40938 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40939 if (Ops.size() >= 2) {
40940 SDValue LHS, RHS;
40941 auto GetHOpSrc = [&](int M, int &OutM) {
40942 // TODO: Support SM_SentinelZero
40943 if (M < 0)
40944 return M == SM_SentinelUndef;
40945 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40946 if (!LHS || LHS == Src) {
40947 LHS = Src;
40948 OutM = (M % 2);
40949 return true;
40950 }
40951 if (!RHS || RHS == Src) {
40952 RHS = Src;
40953 OutM = (M % 2) + 2;
40954 return true;
40955 }
40956 return false;
40957 };
40958 int PostMask[4] = {-1, -1, -1, -1};
40959 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40960 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40961 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40962 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40963 LHS = DAG.getBitcast(SrcVT, LHS);
40964 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40965 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40966 // Use SHUFPS for the permute so this will work on SSE2 targets,
40967 // shuffle combining and domain handling will simplify this later on.
40968 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40969 Res = DAG.getBitcast(ShuffleVT, Res);
40970 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40971 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40972 }
40973 }
40974 }
40975 }
40976
40977 if (2 < Ops.size())
40978 return SDValue();
40979
40980 SDValue BC1 = BC[BC.size() - 1];
40981 if (Mask.size() == VT0.getVectorNumElements()) {
40982 // Canonicalize binary shuffles of horizontal ops that use the
40983 // same sources to a unary shuffle.
40984 // TODO: Try to perform this fold even if the shuffle remains.
40985 if (Ops.size() == 2) {
40986 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40987 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40988 };
40989 // Commute if all BC0's ops are contained in BC1.
40990 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40991 ContainsOps(BC1, BC0.getOperand(1))) {
40992 ShuffleVectorSDNode::commuteMask(Mask);
40993 std::swap(Ops[0], Ops[1]);
40994 std::swap(BC0, BC1);
40995 }
40996
40997 // If BC1 can be represented by BC0, then convert to unary shuffle.
40998 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40999 ContainsOps(BC0, BC1.getOperand(1))) {
41000 for (int &M : Mask) {
41001 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41002 continue;
41003 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41004 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41005 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41006 M += NumHalfEltsPerLane;
41007 }
41008 }
41009 }
41010
41011 // Canonicalize unary horizontal ops to only refer to lower halves.
41012 for (int i = 0; i != NumElts; ++i) {
41013 int &M = Mask[i];
41014 if (isUndefOrZero(M))
41015 continue;
41016 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41017 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41018 M -= NumHalfEltsPerLane;
41019 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41020 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41021 M -= NumHalfEltsPerLane;
41022 }
41023 }
41024
41025 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41026 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41027 // represents the LHS/RHS inputs for the lower/upper halves.
41028 SmallVector<int, 16> TargetMask128, WideMask128;
41029 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41030 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41031 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41032 bool SingleOp = (Ops.size() == 1);
41033 if (isPack || OneUseOps ||
41034 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41035 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41036 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41037 Lo = Lo.getOperand(WideMask128[0] & 1);
41038 Hi = Hi.getOperand(WideMask128[1] & 1);
41039 if (SingleOp) {
41040 SDValue Undef = DAG.getUNDEF(SrcVT);
41041 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41042 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41043 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41044 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41045 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41046 }
41047 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41048 }
41049 }
41050
41051 // If we are post-shuffling a 256-bit hop and not requiring the upper
41052 // elements, then try to narrow to a 128-bit hop directly.
41053 SmallVector<int, 16> WideMask64;
41054 if (Ops.size() == 1 && NumLanes == 2 &&
41055 scaleShuffleElements(Mask, 4, WideMask64) &&
41056 isUndefInRange(WideMask64, 2, 2)) {
41057 int M0 = WideMask64[0];
41058 int M1 = WideMask64[1];
41059 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41060 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41061 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41062 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41063 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41064 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41065 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41066 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41067 }
41068 }
41069
41070 return SDValue();
41071}
41072
41073// Attempt to constant fold all of the constant source ops.
41074// Returns true if the entire shuffle is folded to a constant.
41075// TODO: Extend this to merge multiple constant Ops and update the mask.
41076static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41077 ArrayRef<int> Mask,
41078 ArrayRef<const SDNode *> SrcNodes,
41079 SelectionDAG &DAG, const SDLoc &DL,
41080 const X86Subtarget &Subtarget) {
41081 unsigned SizeInBits = VT.getSizeInBits();
41082 unsigned NumMaskElts = Mask.size();
41083 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41084 unsigned NumOps = Ops.size();
41085
41086 // Extract constant bits from each source op.
41087 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41088 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41089 for (unsigned I = 0; I != NumOps; ++I)
41090 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41091 RawBitsOps[I],
41092 /*AllowWholeUndefs*/ true,
41093 /*AllowPartialUndefs*/ true))
41094 return SDValue();
41095
41096 // If we're optimizing for size, only fold if at least one of the constants
41097 // is only used once or the combined shuffle has included a variable mask
41098 // shuffle; this is to avoid constant pool bloat.
41099 bool IsOptimizingSize = DAG.shouldOptForSize();
41100 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41101 return isTargetShuffleVariableMask(N->getOpcode());
41102 });
41103 if (IsOptimizingSize && !HasVariableMask &&
41104 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41105 return SDValue();
41106
41107 // Shuffle the constant bits according to the mask.
41108 APInt UndefElts(NumMaskElts, 0);
41109 APInt ZeroElts(NumMaskElts, 0);
41110 APInt ConstantElts(NumMaskElts, 0);
41111 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41112 APInt::getZero(MaskSizeInBits));
41113 for (unsigned i = 0; i != NumMaskElts; ++i) {
41114 int M = Mask[i];
41115 if (M == SM_SentinelUndef) {
41116 UndefElts.setBit(i);
41117 continue;
41118 } else if (M == SM_SentinelZero) {
41119 ZeroElts.setBit(i);
41120 continue;
41121 }
41122 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41123
41124 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41125 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41126
41127 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41128 if (SrcUndefElts[SrcMaskIdx]) {
41129 UndefElts.setBit(i);
41130 continue;
41131 }
41132
41133 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41134 APInt &Bits = SrcEltBits[SrcMaskIdx];
41135 if (!Bits) {
41136 ZeroElts.setBit(i);
41137 continue;
41138 }
41139
41140 ConstantElts.setBit(i);
41141 ConstantBitData[i] = Bits;
41142 }
41143 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41144
41145 // Attempt to create a zero vector.
41146 if ((UndefElts | ZeroElts).isAllOnes())
41147 return getZeroVector(VT, Subtarget, DAG, DL);
41148
41149 // Create the constant data.
41150 MVT MaskSVT;
41151 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41152 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41153 else
41154 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41155
41156 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41157 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41158 return SDValue();
41159
41160 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41161 return DAG.getBitcast(VT, CstOp);
41162}
41163
41164namespace llvm {
41165 namespace X86 {
41166 enum {
41167 MaxShuffleCombineDepth = 8
41168 };
41169 } // namespace X86
41170} // namespace llvm
41171
41172/// Fully generic combining of x86 shuffle instructions.
41173///
41174/// This should be the last combine run over the x86 shuffle instructions. Once
41175/// they have been fully optimized, this will recursively consider all chains
41176/// of single-use shuffle instructions, build a generic model of the cumulative
41177/// shuffle operation, and check for simpler instructions which implement this
41178/// operation. We use this primarily for two purposes:
41179///
41180/// 1) Collapse generic shuffles to specialized single instructions when
41181/// equivalent. In most cases, this is just an encoding size win, but
41182/// sometimes we will collapse multiple generic shuffles into a single
41183/// special-purpose shuffle.
41184/// 2) Look for sequences of shuffle instructions with 3 or more total
41185/// instructions, and replace them with the slightly more expensive SSSE3
41186/// PSHUFB instruction if available. We do this as the last combining step
41187/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41188/// a suitable short sequence of other instructions. The PSHUFB will either
41189/// use a register or have to read from memory and so is slightly (but only
41190/// slightly) more expensive than the other shuffle instructions.
41191///
41192/// Because this is inherently a quadratic operation (for each shuffle in
41193/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41194/// This should never be an issue in practice as the shuffle lowering doesn't
41195/// produce sequences of more than 8 instructions.
41196///
41197/// FIXME: We will currently miss some cases where the redundant shuffling
41198/// would simplify under the threshold for PSHUFB formation because of
41199/// combine-ordering. To fix this, we should do the redundant instruction
41200/// combining in this recursive walk.
41202 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41203 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41204 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41205 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41206 const SDLoc &DL, const X86Subtarget &Subtarget) {
41207 assert(!RootMask.empty() &&
41208 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41209 "Illegal shuffle root mask");
41210 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41211 unsigned RootSizeInBits = RootVT.getSizeInBits();
41212 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41213
41214 // Bound the depth of our recursive combine because this is ultimately
41215 // quadratic in nature.
41216 if (Depth >= MaxDepth)
41217 return SDValue();
41218
41219 // Directly rip through bitcasts to find the underlying operand.
41220 SDValue Op = SrcOps[SrcOpIndex];
41221 Op = peekThroughOneUseBitcasts(Op);
41222
41223 EVT VT = Op.getValueType();
41224 if (!VT.isVector() || !VT.isSimple())
41225 return SDValue(); // Bail if we hit a non-simple non-vector.
41226
41227 // FIXME: Just bail on f16 for now.
41228 if (VT.getVectorElementType() == MVT::f16)
41229 return SDValue();
41230
41231 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41232 "Can only combine shuffles upto size of the root op.");
41233
41234 // Create a demanded elts mask from the referenced elements of Op.
41235 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41236 for (int M : RootMask) {
41237 int BaseIdx = RootMask.size() * SrcOpIndex;
41238 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41239 OpDemandedElts.setBit(M - BaseIdx);
41240 }
41241 if (RootSizeInBits != VT.getSizeInBits()) {
41242 // Op is smaller than Root - extract the demanded elts for the subvector.
41243 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41244 unsigned NumOpMaskElts = RootMask.size() / Scale;
41245 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41246 assert(OpDemandedElts
41247 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41248 .isZero() &&
41249 "Out of range elements referenced in root mask");
41250 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41251 }
41252 OpDemandedElts =
41253 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41254
41255 // Extract target shuffle mask and resolve sentinels and inputs.
41256 SmallVector<int, 64> OpMask;
41257 SmallVector<SDValue, 2> OpInputs;
41258 APInt OpUndef, OpZero;
41259 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41260 OpZero, DAG, Depth, false)) {
41261 // Shuffle inputs must not be larger than the shuffle result.
41262 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41263 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41264 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41265 }))
41266 return SDValue();
41267 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41268 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41269 !isNullConstant(Op.getOperand(1))) {
41270 SDValue SrcVec = Op.getOperand(0);
41271 int ExtractIdx = Op.getConstantOperandVal(1);
41272 unsigned NumElts = VT.getVectorNumElements();
41273 OpInputs.assign({SrcVec});
41274 OpMask.assign(NumElts, SM_SentinelUndef);
41275 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41276 OpZero = OpUndef = APInt::getZero(NumElts);
41277 } else {
41278 return SDValue();
41279 }
41280
41281 // If the shuffle result was smaller than the root, we need to adjust the
41282 // mask indices and pad the mask with undefs.
41283 if (RootSizeInBits > VT.getSizeInBits()) {
41284 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41285 unsigned OpMaskSize = OpMask.size();
41286 if (OpInputs.size() > 1) {
41287 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41288 for (int &M : OpMask) {
41289 if (M < 0)
41290 continue;
41291 int EltIdx = M % OpMaskSize;
41292 int OpIdx = M / OpMaskSize;
41293 M = (PaddedMaskSize * OpIdx) + EltIdx;
41294 }
41295 }
41296 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41297 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41298 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41299 }
41300
41301 SmallVector<int, 64> Mask;
41302 SmallVector<SDValue, 16> Ops;
41303
41304 // We don't need to merge masks if the root is empty.
41305 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41306 if (EmptyRoot) {
41307 // Only resolve zeros if it will remove an input, otherwise we might end
41308 // up in an infinite loop.
41309 bool ResolveKnownZeros = true;
41310 if (!OpZero.isZero()) {
41311 APInt UsedInputs = APInt::getZero(OpInputs.size());
41312 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41313 int M = OpMask[i];
41314 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41315 continue;
41316 UsedInputs.setBit(M / OpMask.size());
41317 if (UsedInputs.isAllOnes()) {
41318 ResolveKnownZeros = false;
41319 break;
41320 }
41321 }
41322 }
41323 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41324 ResolveKnownZeros);
41325
41326 Mask = OpMask;
41327 Ops.append(OpInputs.begin(), OpInputs.end());
41328 } else {
41329 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41330
41331 // Add the inputs to the Ops list, avoiding duplicates.
41332 Ops.append(SrcOps.begin(), SrcOps.end());
41333
41334 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41335 // Attempt to find an existing match.
41336 SDValue InputBC = peekThroughBitcasts(Input);
41337 for (int i = 0, e = Ops.size(); i < e; ++i)
41338 if (InputBC == peekThroughBitcasts(Ops[i]))
41339 return i;
41340 // Match failed - should we replace an existing Op?
41341 if (InsertionPoint >= 0) {
41342 Ops[InsertionPoint] = Input;
41343 return InsertionPoint;
41344 }
41345 // Add to the end of the Ops list.
41346 Ops.push_back(Input);
41347 return Ops.size() - 1;
41348 };
41349
41350 SmallVector<int, 2> OpInputIdx;
41351 for (SDValue OpInput : OpInputs)
41352 OpInputIdx.push_back(
41353 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41354
41355 assert(((RootMask.size() > OpMask.size() &&
41356 RootMask.size() % OpMask.size() == 0) ||
41357 (OpMask.size() > RootMask.size() &&
41358 OpMask.size() % RootMask.size() == 0) ||
41359 OpMask.size() == RootMask.size()) &&
41360 "The smaller number of elements must divide the larger.");
41361
41362 // This function can be performance-critical, so we rely on the power-of-2
41363 // knowledge that we have about the mask sizes to replace div/rem ops with
41364 // bit-masks and shifts.
41366 "Non-power-of-2 shuffle mask sizes");
41368 "Non-power-of-2 shuffle mask sizes");
41369 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41370 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41371
41372 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41373 unsigned RootRatio =
41374 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41375 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41376 assert((RootRatio == 1 || OpRatio == 1) &&
41377 "Must not have a ratio for both incoming and op masks!");
41378
41379 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41380 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41381 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41382 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41383 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41384
41385 Mask.resize(MaskWidth, SM_SentinelUndef);
41386
41387 // Merge this shuffle operation's mask into our accumulated mask. Note that
41388 // this shuffle's mask will be the first applied to the input, followed by
41389 // the root mask to get us all the way to the root value arrangement. The
41390 // reason for this order is that we are recursing up the operation chain.
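 // For example (single input, equal widths): with RootMask <2,3,0,1> and
 // OpMask <1,0,3,2>, element i becomes OpMask[RootMask[i]], i.e. <3,2,1,0>.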
41391 for (unsigned i = 0; i < MaskWidth; ++i) {
41392 unsigned RootIdx = i >> RootRatioLog2;
41393 if (RootMask[RootIdx] < 0) {
41394 // This is a zero or undef lane, we're done.
41395 Mask[i] = RootMask[RootIdx];
41396 continue;
41397 }
41398
41399 unsigned RootMaskedIdx =
41400 RootRatio == 1
41401 ? RootMask[RootIdx]
41402 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41403
41404 // Just insert the scaled root mask value if it references an input other
41405 // than the SrcOp we're currently inserting.
41406 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41407 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41408 Mask[i] = RootMaskedIdx;
41409 continue;
41410 }
41411
41412 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41413 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41414 if (OpMask[OpIdx] < 0) {
41415 // The incoming lanes are zero or undef, it doesn't matter which ones we
41416 // are using.
41417 Mask[i] = OpMask[OpIdx];
41418 continue;
41419 }
41420
41421 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41422 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41423 : (OpMask[OpIdx] << OpRatioLog2) +
41424 (RootMaskedIdx & (OpRatio - 1));
41425
41426 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41427 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41428 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41429 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41430
41431 Mask[i] = OpMaskedIdx;
41432 }
41433 }
41434
41435 // Peek through any free bitcasts to insert_subvector vector widenings or
41436 // extract_subvector nodes back to root size.
41437 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41438 for (auto [I, Op] : enumerate(Ops)) {
41439 SDValue BC = Op;
41440 while (1) {
41441 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41442 BC = BC.getOperand(0);
41443 continue;
41444 }
41445 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41446 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41447 // Set out of bounds mask indices to undef.
41448 Op = BC = BC.getOperand(1);
41449 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41450 int Lo = I * Mask.size();
41451 int Hi = (I + 1) * Mask.size();
41452 int NewHi = Lo + (Mask.size() / Scale);
41453 for (int &M : Mask) {
41454 if (Lo <= M && NewHi <= M && M < Hi)
41455 M = SM_SentinelUndef;
41456 }
41457 continue;
41458 }
41459 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41460 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41461 isNullConstant(BC.getOperand(1))) {
41462 Op = BC = BC.getOperand(0);
41463 continue;
41464 }
41465 break;
41466 }
41467 }
41468
41469 // Remove unused/repeated shuffle source ops.
41470 resolveTargetShuffleInputsAndMask(Ops, Mask);
41471
41472 // Handle the all undef/zero/ones cases early.
41473 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41474 return DAG.getUNDEF(RootVT);
41475 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41476 return getZeroVector(RootVT, Subtarget, DAG, DL);
41477 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41478 !llvm::is_contained(Mask, SM_SentinelZero))
41479 return getOnesVector(RootVT, DAG, DL);
41480
41481 assert(!Ops.empty() && "Shuffle with no inputs detected");
41482
41483 // Update the list of shuffle nodes that have been combined so far.
41484 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41485 CombinedNodes.push_back(Op.getNode());
41486
41487 // See if we can recurse into each shuffle source op (if it's a target
41488 // shuffle). The source op should only be generally combined if it either has
41489 // a single use (i.e. current Op) or all its users have already been combined;
41490 // if not, we can still combine but should prevent generation of variable
41491 // shuffles to avoid constant pool bloat.
41492 // Don't recurse if we already have more source ops than we can combine in
41493 // the remaining recursion depth.
41494 if (Ops.size() < (MaxDepth - Depth)) {
41495 for (int i = 0, e = Ops.size(); i < e; ++i) {
41496 // For empty roots, we need to resolve zeroable elements before combining
41497 // them with other shuffles.
41498 SmallVector<int, 64> ResolvedMask = Mask;
41499 if (EmptyRoot)
41500 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41501 bool AllowCrossLaneVar = false;
41502 bool AllowPerLaneVar = false;
41503 if (Ops[i].getNode()->hasOneUse() ||
41504 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41505 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41506 AllowPerLaneVar = AllowVariablePerLaneMask;
41507 }
41508 if (SDValue Res = combineX86ShufflesRecursively(
41509 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41510 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41511 DAG, DL, Subtarget))
41512 return Res;
41513 }
41514 }
41515
41516 // Attempt to constant fold all of the constant source ops.
41517 if (SDValue Cst = combineX86ShufflesConstants(
41518 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41519 return Cst;
41520
41521 // If constant fold failed and we only have constants - then we have
41522 // multiple uses by a single non-variable shuffle - just bail.
41523 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41524 APInt UndefElts;
41525 SmallVector<APInt> RawBits;
41526 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41527 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41528 RawBits,
41529 /*AllowWholeUndefs*/ true,
41530 /*AllowPartialUndefs*/ true);
41531 })) {
41532 return SDValue();
41533 }
41534
41535 // Canonicalize the combined shuffle mask chain with horizontal ops.
41536 // NOTE: This will update the Ops and Mask.
41537 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41538 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41539 return DAG.getBitcast(RootVT, HOp);
41540
41541 // Try to refine our inputs given our knowledge of target shuffle mask.
41542 for (auto I : enumerate(Ops)) {
41543 int OpIdx = I.index();
41544 SDValue &Op = I.value();
41545
41546 // What range of shuffle mask element values results in picking from Op?
41547 int Lo = OpIdx * Mask.size();
41548 int Hi = Lo + Mask.size();
41549
41550 // Which elements of Op do we demand, given the mask's granularity?
41551 APInt OpDemandedElts(Mask.size(), 0);
41552 for (int MaskElt : Mask) {
41553 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41554 int OpEltIdx = MaskElt - Lo;
41555 OpDemandedElts.setBit(OpEltIdx);
41556 }
41557 }
41558
41559 // Is the shuffle result smaller than the root?
41560 if (Op.getValueSizeInBits() < RootSizeInBits) {
41561 // We padded the mask with undefs. But we now need to undo that.
41562 unsigned NumExpectedVectorElts = Mask.size();
41563 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41564 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41565 assert(!OpDemandedElts.extractBits(
41566 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41567 "Demanding the virtual undef widening padding?");
41568 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41569 }
41570
41571 // The Op itself may be of different VT, so we need to scale the mask.
41572 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41573 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41574
41575 // Can this operand be simplified any further, given its demanded elements?
41576 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41577 Op, OpScaledDemandedElts, DAG))
41578 Op = NewOp;
41579 }
41580 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41581
41582 // Widen any subvector shuffle inputs we've collected.
41583 // TODO: Remove this to avoid generating temporary nodes; we should only
41584 // widen once combineX86ShuffleChain has found a match.
41585 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41586 return Op.getValueSizeInBits() < RootSizeInBits;
41587 })) {
41588 for (SDValue &Op : Ops)
41589 if (Op.getValueSizeInBits() < RootSizeInBits)
41590 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41591 RootSizeInBits);
41592 // Reresolve - we might have repeated subvector sources.
41593 resolveTargetShuffleInputsAndMask(Ops, Mask);
41594 }
41595
41596 // Handle the all undef/zero/ones cases.
41597 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41598 return DAG.getUNDEF(RootVT);
41599 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41600 return getZeroVector(RootVT, Subtarget, DAG, DL);
41601 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41602 !llvm::is_contained(Mask, SM_SentinelZero))
41603 return getOnesVector(RootVT, DAG, DL);
41604
41605 assert(!Ops.empty() && "Shuffle with no inputs detected");
41606
41607 // We can only combine unary and binary shuffle mask cases.
41608 if (Ops.size() <= 2) {
41609 // Minor canonicalization of the accumulated shuffle mask to make it easier
41610 // to match below. All this does is detect masks with sequential pairs of
41611 // elements, and shrink them to the half-width mask. It does this in a loop
41612 // so it will reduce the size of the mask to the minimal width mask which
41613 // performs an equivalent shuffle.
41614 while (Mask.size() > 1) {
41615 SmallVector<int, 64> WidenedMask;
41616 if (!canWidenShuffleElements(Mask, WidenedMask))
41617 break;
41618 Mask = std::move(WidenedMask);
41619 }
41620
41621 // Canonicalization of binary shuffle masks to improve pattern matching by
41622 // commuting the inputs.
41623 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41625 std::swap(Ops[0], Ops[1]);
41626 }
41627
41628 // Try to combine into a single shuffle instruction.
41629 if (SDValue Shuffle = combineX86ShuffleChain(
41630 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41631 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41632 IsMaskedShuffle, DAG, DL, Subtarget))
41633 return Shuffle;
41634
41635 // If all the operands come from the same larger vector, fallthrough and try
41636 // to use combineX86ShuffleChainWithExtract.
41637 SDValue LHS = peekThroughBitcasts(Ops.front());
41638 SDValue RHS = peekThroughBitcasts(Ops.back());
41639 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41640 (RootSizeInBits / Mask.size()) != 64 ||
41641 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41642 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41643 LHS.getOperand(0) != RHS.getOperand(0))
41644 return SDValue();
41645 }
41646
41647 // If that failed and any input is extracted then try to combine as a
41648 // shuffle with the larger type.
41649 return combineX86ShuffleChainWithExtract(
41650 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41651 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41652 DAG, DL, Subtarget);
41653}
41654
41655/// Helper entry wrapper to combineX86ShufflesRecursively.
41656static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41657 const X86Subtarget &Subtarget) {
41658 return combineX86ShufflesRecursively(
41659 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41660 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41661 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41662 SDLoc(Op), Subtarget);
41663}
41664
41665/// Get the PSHUF-style mask from PSHUF node.
41666///
41667 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41668/// PSHUF-style masks that can be reused with such instructions.
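/// For example, a PSHUFHW only permutes words 4-7, so its v8i16 mask
/// <0,1,2,3,4,5,7,6> is returned as the rebased v4 mask <0,1,3,2>.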
41669static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41670 MVT VT = N.getSimpleValueType();
41671 SmallVector<int, 4> Mask;
41672 SmallVector<SDValue, 2> Ops;
41673 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41674 (void)HaveMask;
41675 assert(HaveMask);
41676
41677 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41678 // matter. Check that the upper masks are repeats and remove them.
41679 if (VT.getSizeInBits() > 128) {
41680 int LaneElts = 128 / VT.getScalarSizeInBits();
41681#ifndef NDEBUG
41682 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41683 for (int j = 0; j < LaneElts; ++j)
41684 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41685 "Mask doesn't repeat in high 128-bit lanes!");
41686#endif
41687 Mask.resize(LaneElts);
41688 }
41689
41690 switch (N.getOpcode()) {
41691 case X86ISD::PSHUFD:
41692 return Mask;
41693 case X86ISD::PSHUFLW:
41694 Mask.resize(4);
41695 return Mask;
41696 case X86ISD::PSHUFHW:
41697 Mask.erase(Mask.begin(), Mask.begin() + 4);
41698 for (int &M : Mask)
41699 M -= 4;
41700 return Mask;
41701 default:
41702 llvm_unreachable("No valid shuffle instruction found!");
41703 }
41704}
41705
41706/// Get the expanded blend mask from a BLENDI node.
41707/// For v16i16 nodes, this will splat the repeated i8 mask.
41708static APInt getBLENDIBlendMask(SDValue V) {
41709 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41710 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41711 APInt Mask = V.getConstantOperandAPInt(2);
41712 if (Mask.getBitWidth() > NumElts)
41713 Mask = Mask.trunc(NumElts);
41714 if (NumElts == 16) {
41715 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41716 Mask = APInt::getSplat(16, Mask);
41717 }
41718 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41719 return Mask;
41720}
41721
41722/// Search for a combinable shuffle across a chain ending in pshufd.
41723///
41724/// We walk up the chain and look for a combinable shuffle, skipping over
41725/// shuffles that we could hoist this shuffle's transformation past without
41726/// altering anything.
41727static SDValue combineRedundantDWordShuffle(SDValue N,
41728 MutableArrayRef<int> Mask,
41729 const SDLoc &DL,
41730 SelectionDAG &DAG) {
41731 assert(N.getOpcode() == X86ISD::PSHUFD &&
41732 "Called with something other than an x86 128-bit half shuffle!");
41733
41734 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41735 // of the shuffles in the chain so that we can form a fresh chain to replace
41736 // this one.
41737 SmallVector<SDValue, 8> Chain;
41738 SDValue V = N.getOperand(0);
41739 for (; V.hasOneUse(); V = V.getOperand(0)) {
41740 switch (V.getOpcode()) {
41741 default:
41742 return SDValue(); // Nothing combined!
41743
41744 case ISD::BITCAST:
41745 // Skip bitcasts as we always know the type for the target specific
41746 // instructions.
41747 continue;
41748
41749 case X86ISD::PSHUFD:
41750 // Found another dword shuffle.
41751 break;
41752
41753 case X86ISD::PSHUFLW:
41754 // Check that the low words (being shuffled) are the identity in the
41755 // dword shuffle, and the high words are self-contained.
41756 if (Mask[0] != 0 || Mask[1] != 1 ||
41757 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41758 return SDValue();
41759
41760 Chain.push_back(V);
41761 continue;
41762
41763 case X86ISD::PSHUFHW:
41764 // Check that the high words (being shuffled) are the identity in the
41765 // dword shuffle, and the low words are self-contained.
41766 if (Mask[2] != 2 || Mask[3] != 3 ||
41767 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41768 return SDValue();
41769
41770 Chain.push_back(V);
41771 continue;
41772
41773 case X86ISD::UNPCKL:
41774 case X86ISD::UNPCKH:
41775 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41776 // shuffle into a preceding word shuffle.
41777 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41778 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41779 return SDValue();
41780
41781 // Search for a half-shuffle which we can combine with.
41782 unsigned CombineOp =
41783 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41784 if (V.getOperand(0) != V.getOperand(1) ||
41785 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41786 return SDValue();
41787 Chain.push_back(V);
41788 V = V.getOperand(0);
41789 do {
41790 switch (V.getOpcode()) {
41791 default:
41792 return SDValue(); // Nothing to combine.
41793
41794 case X86ISD::PSHUFLW:
41795 case X86ISD::PSHUFHW:
41796 if (V.getOpcode() == CombineOp)
41797 break;
41798
41799 Chain.push_back(V);
41800
41801 [[fallthrough]];
41802 case ISD::BITCAST:
41803 V = V.getOperand(0);
41804 continue;
41805 }
41806 break;
41807 } while (V.hasOneUse());
41808 break;
41809 }
41810 // Break out of the loop if we break out of the switch.
41811 break;
41812 }
41813
41814 if (!V.hasOneUse())
41815 // We fell out of the loop without finding a viable combining instruction.
41816 return SDValue();
41817
41818 // Merge this node's mask and our incoming mask.
41819 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41820 for (int &M : Mask)
41821 M = VMask[M];
41822 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41823 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41824
41825 // Rebuild the chain around this new shuffle.
41826 while (!Chain.empty()) {
41827 SDValue W = Chain.pop_back_val();
41828
41829 if (V.getValueType() != W.getOperand(0).getValueType())
41830 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41831
41832 switch (W.getOpcode()) {
41833 default:
41834 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41835
41836 case X86ISD::UNPCKL:
41837 case X86ISD::UNPCKH:
41838 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41839 break;
41840
41841 case X86ISD::PSHUFD:
41842 case X86ISD::PSHUFLW:
41843 case X86ISD::PSHUFHW:
41844 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41845 break;
41846 }
41847 }
41848 if (V.getValueType() != N.getValueType())
41849 V = DAG.getBitcast(N.getValueType(), V);
41850
41851 // Return the new chain to replace N.
41852 return V;
41853}
41854
41855// Attempt to commute shufps LHS loads:
41856// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41857 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41858 SelectionDAG &DAG) {
41859 // TODO: Add vXf64 support.
41860 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41861 return SDValue();
41862
41863 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41864 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41865 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41866 return SDValue();
41867 SDValue N0 = V.getOperand(0);
41868 SDValue N1 = V.getOperand(1);
41869 unsigned Imm = V.getConstantOperandVal(2);
41870 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41871 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41872 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41873 return SDValue();
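// Commuting the SHUFP operands swaps which source feeds the low and high
// halves of the result, so the two nibbles of the immediate are swapped here;
// the callers below compensate by toggling bit 1 of each affected index (the
// 0xAA/0x0A/0xA0 XORs).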
41874 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41875 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41876 DAG.getTargetConstant(Imm, DL, MVT::i8));
41877 };
41878
41879 switch (N.getOpcode()) {
41880 case X86ISD::VPERMILPI:
41881 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41882 unsigned Imm = N.getConstantOperandVal(1);
41883 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41884 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41885 }
41886 break;
41887 case X86ISD::SHUFP: {
41888 SDValue N0 = N.getOperand(0);
41889 SDValue N1 = N.getOperand(1);
41890 unsigned Imm = N.getConstantOperandVal(2);
41891 if (N0 == N1) {
41892 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41893 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41894 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41895 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41896 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41897 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41898 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41899 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41900 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41901 }
41902 break;
41903 }
41904 }
41905
41906 return SDValue();
41907}
41908
41909// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41910// iff we don't demand the same element index for both X and Y.
41911static SDValue
41912 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41913 const APInt &DemandedElts, SelectionDAG &DAG,
41914 const X86Subtarget &Subtarget, const SDLoc &DL) {
41915 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41916 if (!N0.hasOneUse() || !N1.hasOneUse())
41917 return SDValue();
41918
41919 unsigned NumElts = VT.getVectorNumElements();
41920 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41921 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41922
41923 // See if both operands are shuffles, and that we can scale the shuffle masks
41924 // to the same width as the blend mask.
41925 // TODO: Support SM_SentinelZero?
41926 SmallVector<SDValue, 2> Ops0, Ops1;
41927 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41928 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41929 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41930 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41931 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41932 return SDValue();
41933
41934 // Determine the demanded elts from both permutes.
41935 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41936 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41937 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41938 Demanded1,
41939 /*AllowUndefElts=*/true) ||
41940 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41941 DemandedRHS0, /*AllowUndefElts=*/true) ||
41942 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41943 DemandedRHS1, /*AllowUndefElts=*/true))
41944 return SDValue();
41945
41946 // Confirm that we only use a single operand from both permutes and that we
41947 // don't demand the same index from both.
41948 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41949 DemandedLHS0.intersects(DemandedLHS1))
41950 return SDValue();
41951
41952 // Use the permute demanded elts masks as the new blend mask.
41953 // Create the new permute mask as a blend of the 2 original permute masks.
41954 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41955 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41956 for (unsigned I = 0; I != NumElts; ++I) {
41957 if (Demanded0[I]) {
41958 int M = ScaledMask0[I];
41959 if (0 <= M) {
41960 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41961 "BlendMask demands LHS AND RHS");
41962 NewBlendMask[M] = M;
41963 NewPermuteMask[I] = M;
41964 }
41965 } else if (Demanded1[I]) {
41966 int M = ScaledMask1[I];
41967 if (0 <= M) {
41968 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41969 "BlendMask demands LHS AND RHS");
41970 NewBlendMask[M] = M + NumElts;
41971 NewPermuteMask[I] = M;
41972 }
41973 }
41974 }
41975 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41976 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41977
41978 // v16i16 shuffles can explode in complexity very easily, only accept them if
41979 // the blend mask is the same in the 128-bit subvectors (or can widen to
41980 // v8i32) and the permute can be widened as well.
41981 if (VT == MVT::v16i16) {
41982 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41983 !canWidenShuffleElements(NewBlendMask))
41984 return SDValue();
41985 if (!canWidenShuffleElements(NewPermuteMask))
41986 return SDValue();
41987 }
41988
41989 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41990 // widened to a lane permute (vperm2f128).
41991 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41992 is128BitLaneCrossingShuffleMask(VT,
41993 NewPermuteMask) &&
41994 !canScaleShuffleElements(NewPermuteMask, 2))
41995 return SDValue();
41996
41997 SDValue NewBlend =
41998 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41999 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42000 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42001 NewPermuteMask);
42002}
42003
42004// TODO - move this to TLI like isBinOp?
42005static bool isUnaryOp(unsigned Opcode) {
42006 switch (Opcode) {
42007 case ISD::CTLZ:
42008 case ISD::CTTZ:
42009 case ISD::CTPOP:
42010 return true;
42011 }
42012 return false;
42013}
42014
42015// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42016// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42017 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42018 const SDLoc &DL) {
42019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42020 EVT ShuffleVT = N.getValueType();
42021 unsigned Opc = N.getOpcode();
42022
42023 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42024 // AllZeros/AllOnes constants are freely shuffled and will peek through
42025 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42026 // merge with target shuffles if it has one use so shuffle combining is
42027 // likely to kick in. Shuffles of splats are expected to be removed.
42028 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42029 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42033 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42034 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42035 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42036 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42037 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42038 };
42039 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42040 // Ensure we only shuffle whole vector src elements, unless it's a logical
42041 // binop where we can more aggressively move shuffles from dst to src.
42042 return isLogicOp(BinOp) ||
42043 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42044 };
42045
42046 switch (Opc) {
42047 // Unary and Unary+Permute Shuffles.
42048 case X86ISD::PSHUFB: {
42049 // Don't merge PSHUFB if it contains zero'd elements.
42050 SmallVector<int> Mask;
42051 SmallVector<SDValue> Ops;
42052 if (!getTargetShuffleMask(N, false, Ops, Mask))
42053 break;
42054 [[fallthrough]];
42055 }
42056 case X86ISD::VBROADCAST:
42057 case X86ISD::MOVDDUP:
42058 case X86ISD::PSHUFD:
42059 case X86ISD::PSHUFHW:
42060 case X86ISD::PSHUFLW:
42061 case X86ISD::VPERMV:
42062 case X86ISD::VPERMI:
42063 case X86ISD::VPERMILPI: {
42064 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42065 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42066 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42067 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42068 unsigned SrcOpcode = N0.getOpcode();
42069 EVT OpVT = N0.getValueType();
42070 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42071 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42072 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42073 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42074 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42075 IsMergeableWithShuffle(Op01, FoldShuf)) {
42076 SDValue LHS, RHS;
42077 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42078 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42079 if (Opc == X86ISD::VPERMV) {
42080 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42081 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42082 } else if (N.getNumOperands() == 2) {
42083 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42084 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42085 } else {
42086 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42087 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42088 }
42089 return DAG.getBitcast(ShuffleVT,
42090 DAG.getNode(SrcOpcode, DL, OpVT,
42091 DAG.getBitcast(OpVT, LHS),
42092 DAG.getBitcast(OpVT, RHS)));
42093 }
42094 }
42095 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42096 OpVT.getScalarSizeInBits() ==
42097 N0.getOperand(0).getScalarValueSizeInBits()) {
42098 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42099 if (Opc == X86ISD::VPERMV)
42100 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42101 else if (N.getNumOperands() == 2)
42102 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42103 else
42104 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42105 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42106 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42107 }
42108 }
42109 break;
42110 }
42111 // Binary and Binary+Permute Shuffles.
42112 case X86ISD::INSERTPS: {
42113 // Don't merge INSERTPS if it contains zero'd elements.
42114 unsigned InsertPSMask = N.getConstantOperandVal(2);
42115 unsigned ZeroMask = InsertPSMask & 0xF;
42116 if (ZeroMask != 0)
42117 break;
42118 [[fallthrough]];
42119 }
42120 case X86ISD::MOVSD:
42121 case X86ISD::MOVSS:
42122 case X86ISD::BLENDI:
42123 case X86ISD::SHUFP:
42124 case X86ISD::UNPCKH:
42125 case X86ISD::UNPCKL: {
42126 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42127 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42128 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42129 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42130 unsigned SrcOpcode = N0.getOpcode();
42131 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42132 N0.getValueType() == N1.getValueType() &&
42133 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42134 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42135 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42136 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42137 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42138 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42139 // Ensure the total number of shuffles doesn't increase by folding this
42140 // shuffle through to the source ops.
42141 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42142 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42143 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42144 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42145 SDValue LHS, RHS;
42146 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42147 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42148 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42149 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42150 if (N.getNumOperands() == 3) {
42151 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42152 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42153 } else {
42154 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42155 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42156 }
42157 EVT OpVT = N0.getValueType();
42158 return DAG.getBitcast(ShuffleVT,
42159 DAG.getNode(SrcOpcode, DL, OpVT,
42160 DAG.getBitcast(OpVT, LHS),
42161 DAG.getBitcast(OpVT, RHS)));
42162 }
42163 }
42164 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42165 N0.getValueType() == N1.getValueType() &&
42166 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42167 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42168 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42169 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42170 SDValue Res;
42171 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42172 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42173 if (N.getNumOperands() == 3) {
42174 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42175 } else {
42176 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42177 }
42178 EVT OpVT = N0.getValueType();
42179 return DAG.getBitcast(
42180 ShuffleVT,
42181 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42182 }
42183 // TODO: We can generalize this for other shuffles/conversions.
42184 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42185 N1.getOpcode() == SrcOpcode &&
42186 N0.getValueType() == N1.getValueType() &&
42187 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42188 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42189 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42190 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42191 EVT OpSrcVT = N0.getOperand(0).getValueType();
42192 EVT OpDstVT = N0.getValueType();
42193 SDValue Res =
42194 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42195 return DAG.getBitcast(ShuffleVT,
42196 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42197 }
42198 }
42199 break;
42200 }
42201 }
42202 return SDValue();
42203}
42204
42205/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42206 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42207 SelectionDAG &DAG,
42208 const SDLoc &DL) {
42209 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42210
42211 MVT VT = V.getSimpleValueType();
42212 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42213 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42214 unsigned SrcOpc0 = Src0.getOpcode();
42215 unsigned SrcOpc1 = Src1.getOpcode();
42216 EVT SrcVT0 = Src0.getValueType();
42217 EVT SrcVT1 = Src1.getValueType();
42218
42219 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42220 return SDValue();
42221
42222 switch (SrcOpc0) {
42223 case X86ISD::MOVDDUP: {
42224 SDValue LHS = Src0.getOperand(0);
42225 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42226 SDValue Res =
42227 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42228 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42229 return DAG.getBitcast(VT, Res);
42230 }
42231 case X86ISD::VPERMILPI:
42232 // TODO: Handle v4f64 permutes with different low/high lane masks.
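// For v4f64 the immediate encodes an independent 2-bit mask per 128-bit lane;
// the fold below is only safe when both lanes use the same mask, since the
// lane shuffle reorders whole 128-bit lanes.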
42233 if (SrcVT0 == MVT::v4f64) {
42234 uint64_t Mask = Src0.getConstantOperandVal(1);
42235 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42236 break;
42237 }
42238 [[fallthrough]];
42239 case X86ISD::VSHLI:
42240 case X86ISD::VSRLI:
42241 case X86ISD::VSRAI:
42242 case X86ISD::PSHUFD:
42243 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42244 SDValue LHS = Src0.getOperand(0);
42245 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42246 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42247 V.getOperand(2));
42248 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42249 return DAG.getBitcast(VT, Res);
42250 }
42251 break;
42252 }
42253
42254 return SDValue();
42255}
42256
42257/// Try to combine x86 target specific shuffles.
42258 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42259 SelectionDAG &DAG,
42260 TargetLowering::DAGCombinerInfo &DCI,
42261 const X86Subtarget &Subtarget) {
42262 using namespace SDPatternMatch;
42263
42264 MVT VT = N.getSimpleValueType();
42265 unsigned NumElts = VT.getVectorNumElements();
42266 SmallVector<int, 4> Mask;
42267 unsigned Opcode = N.getOpcode();
42268 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42269
42270 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42271 return R;
42272
42273 // Handle specific target shuffles.
42274 switch (Opcode) {
42275 case X86ISD::MOVDDUP: {
42276 SDValue Src = N.getOperand(0);
42277 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
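// MOVDDUP only reads the low 64 bits, so a vzload of the f64 element is enough
// and lets the wide load be dropped if it has no other users.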
42278 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42279 ISD::isNormalLoad(Src.getNode())) {
42280 LoadSDNode *LN = cast<LoadSDNode>(Src);
42281 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42282 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42283 DCI.CombineTo(N.getNode(), Movddup);
42284 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42285 DCI.recursivelyDeleteUnusedNodes(LN);
42286 return N; // Return N so it doesn't get rechecked!
42287 }
42288 }
42289
42290 return SDValue();
42291 }
42292 case X86ISD::VBROADCAST: {
42293 SDValue Src = N.getOperand(0);
42294 SDValue BC = peekThroughBitcasts(Src);
42295 EVT SrcVT = Src.getValueType();
42296 EVT BCVT = BC.getValueType();
42297
42298 // If broadcasting from another shuffle, attempt to simplify it.
42299 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42300 if (isTargetShuffle(BC.getOpcode()) &&
42301 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42302 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42303 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42304 SM_SentinelUndef);
42305 for (unsigned i = 0; i != Scale; ++i)
42306 DemandedMask[i] = i;
42307 if (SDValue Res = combineX86ShufflesRecursively(
42308 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42309 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42310 /*AllowVariableCrossLaneMask=*/true,
42311 /*AllowVariablePerLaneMask=*/true,
42312 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42313 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42314 DAG.getBitcast(SrcVT, Res));
42315 }
42316
42317 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42318 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42319 if (Src.getOpcode() == ISD::BITCAST &&
42320 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42321 TLI.isTypeLegal(BCVT) &&
42322 FixedVectorType::isValidElementType(
42323 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42324 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42325 VT.getVectorNumElements());
42326 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42327 }
42328
42329 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42330 // If we're re-broadcasting a smaller type then broadcast with that type and
42331 // bitcast.
42332 // TODO: Do this for any splat?
42333 if (Src.getOpcode() == ISD::BITCAST &&
42334 (BC.getOpcode() == X86ISD::VBROADCAST ||
42335 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42336 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42337 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42338 MVT NewVT =
42339 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42340 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42341 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42342 }
42343
42344 // Reduce broadcast source vector to lowest 128-bits.
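// VBROADCAST only reads element 0, which always lives in the low 128 bits, so
// the rest of a wider source vector is never needed.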
42345 if (SrcVT.getSizeInBits() > 128)
42346 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42347 extract128BitVector(Src, 0, DAG, DL));
42348
42349 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42350 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42351 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42352 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42353
42354 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42355 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42356 isNullConstant(Src.getOperand(1)) &&
42357 Src.getValueType() ==
42358 Src.getOperand(0).getValueType().getScalarType() &&
42359 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42360 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42361
42362 // Share broadcast with the longest vector and extract low subvector (free).
42363 // Ensure the same SDValue from the SDNode use is being used.
42364 for (SDNode *User : Src->users())
42365 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42366 Src == User->getOperand(0) &&
42367 User->getValueSizeInBits(0).getFixedValue() >
42368 VT.getFixedSizeInBits()) {
42369 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42370 VT.getSizeInBits());
42371 }
42372
42373 // vbroadcast(scalarload X) -> vbroadcast_load X
42374 // For float loads, extract other uses of the scalar from the broadcast.
42375 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42376 ISD::isNormalLoad(Src.getNode())) {
42377 LoadSDNode *LN = cast<LoadSDNode>(Src);
42378 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42379 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42380 SDValue BcastLd =
42381 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42382 LN->getMemoryVT(), LN->getMemOperand());
42383 // If the load value is used only by N, replace it via CombineTo N.
42384 bool NoReplaceExtract = Src.hasOneUse();
42385 DCI.CombineTo(N.getNode(), BcastLd);
42386 if (NoReplaceExtract) {
42387 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42388 DCI.recursivelyDeleteUnusedNodes(LN);
42389 } else {
42390 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42391 DAG.getVectorIdxConstant(0, DL));
42392 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42393 }
42394 return N; // Return N so it doesn't get rechecked!
42395 }
42396
42397 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42398 // i16. So shrink it ourselves if we can make a broadcast_load.
42399 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42400 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42401 assert(Subtarget.hasAVX2() && "Expected AVX2");
42402 SDValue TruncIn = Src.getOperand(0);
42403
42404 // If this is a truncate of a non-extending load, we can just narrow it to
42405 // use a broadcast_load.
42406 if (ISD::isNormalLoad(TruncIn.getNode())) {
42407 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42408 // Unless it's volatile or atomic.
42409 if (LN->isSimple()) {
42410 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42411 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42412 SDValue BcastLd = DAG.getMemIntrinsicNode(
42413 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42414 LN->getPointerInfo(), LN->getBaseAlign(),
42415 LN->getMemOperand()->getFlags());
42416 DCI.CombineTo(N.getNode(), BcastLd);
42417 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42418 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42419 return N; // Return N so it doesn't get rechecked!
42420 }
42421 }
42422
42423 // If this is a truncate of an i16 extload, we can directly replace it.
42424 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42425 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42426 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42427 if (LN->getMemoryVT().getSizeInBits() == 16) {
42428 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42429 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42430 SDValue BcastLd =
42431 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42432 LN->getMemoryVT(), LN->getMemOperand());
42433 DCI.CombineTo(N.getNode(), BcastLd);
42434 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42435 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42436 return N; // Return N so it doesn't get rechecked!
42437 }
42438 }
42439
42440 // If this is a truncate of a load that has been shifted right, we can
42441 // offset the pointer and use a narrower load.
42442 if (TruncIn.getOpcode() == ISD::SRL &&
42443 TruncIn.getOperand(0).hasOneUse() &&
42444 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42445 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42446 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42447 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42448 // Make sure the shift amount and the load size are divisible by 16.
42449 // Don't do this if the load is volatile or atomic.
42450 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42451 LN->isSimple()) {
42452 unsigned Offset = ShiftAmt / 8;
42453 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42454 SDValue Ptr = DAG.getMemBasePlusOffset(
42455 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
42456 SDValue Ops[] = { LN->getChain(), Ptr };
42457 SDValue BcastLd = DAG.getMemIntrinsicNode(
42458 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42459 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42460 LN->getMemOperand()->getFlags());
42461 DCI.CombineTo(N.getNode(), BcastLd);
42462 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42463 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42464 return N; // Return N so it doesn't get rechecked!
42465 }
42466 }
42467 }
42468
42469 // vbroadcast(vzload X) -> vbroadcast_load X
42470 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42471 auto *LN = cast<MemSDNode>(Src);
42472 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42473 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42474 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42475 SDValue BcastLd =
42476 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42477 LN->getMemoryVT(), LN->getMemOperand());
42478 DCI.CombineTo(N.getNode(), BcastLd);
42479 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42480 DCI.recursivelyDeleteUnusedNodes(LN);
42481 return N; // Return N so it doesn't get rechecked!
42482 }
42483 }
42484
42485 // vbroadcast(vector load X) -> vbroadcast_load
42486 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42487 LoadSDNode *LN = cast<LoadSDNode>(Src);
42488 // Unless the load is volatile or atomic.
42489 if (LN->isSimple()) {
42490 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42491 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42492 SDValue BcastLd = DAG.getMemIntrinsicNode(
42493 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(),
42494 LN->getPointerInfo(), LN->getBaseAlign(),
42495 LN->getMemOperand()->getFlags());
42496 DCI.CombineTo(N.getNode(), BcastLd);
42497 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42498 DCI.recursivelyDeleteUnusedNodes(LN);
42499 return N; // Return N so it doesn't get rechecked!
42500 }
42501 }
42502
42503 return SDValue();
42504 }
42505 case X86ISD::VZEXT_MOVL: {
42506 SDValue N0 = N.getOperand(0);
42507
42508 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42509 // Zeroing out the upper elements means we're just shifting a zero value.
42510 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42511 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42512 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42513 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42514 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42515 if (N0.hasOneUse())
42516 return DAG.getNode(
42517 N0.getOpcode(), DL, VT,
42518 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42519 N0.getOperand(1));
42520 }
42521
42522 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42523 // the load is volatile.
42524 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42525 auto *LN = cast<LoadSDNode>(N0);
42526 if (SDValue VZLoad =
42527 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42528 DCI.CombineTo(N.getNode(), VZLoad);
42529 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42530 DCI.recursivelyDeleteUnusedNodes(LN);
42531 return N;
42532 }
42533 }
42534
42535 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42536 // and can just use a VZEXT_LOAD.
42537 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42538 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42539 auto *LN = cast<MemSDNode>(N0);
42540 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42541 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42542 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42543 SDValue VZLoad =
42544 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42545 LN->getMemoryVT(), LN->getMemOperand());
42546 DCI.CombineTo(N.getNode(), VZLoad);
42547 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42548 DCI.recursivelyDeleteUnusedNodes(LN);
42549 return N;
42550 }
42551 }
42552
42553 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42554 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42555 // if the upper bits of the i64 are zero.
42556 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42557 N0.getOperand(0).hasOneUse() &&
42558 N0.getOperand(0).getValueType() == MVT::i64) {
42559 SDValue In = N0.getOperand(0);
42560 APInt Mask = APInt::getHighBitsSet(64, 32);
42561 if (DAG.MaskedValueIsZero(In, Mask)) {
42562 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42563 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42564 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42565 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42566 return DAG.getBitcast(VT, Movl);
42567 }
42568 }
42569
42570 // Load a scalar integer constant directly to XMM instead of transferring an
42571 // immediate value from GPR.
42572 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42573 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42574 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42575 // Create a vector constant - scalar constant followed by zeros.
42576 EVT ScalarVT = N0.getOperand(0).getValueType();
42577 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42578 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42579 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42580 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42581
42582 // Load the vector constant from constant pool.
42583 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42584 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42585 MachinePointerInfo MPI =
42586 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42587 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42588 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42589 MachineMemOperand::MOLoad);
42590 }
42591 }
42592
42593 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42594 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42595 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42596 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42597 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42598 SDValue V = peekThroughOneUseBitcasts(N0);
42599
42600 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42601 isNullConstant(V.getOperand(2))) {
42602 SDValue In = V.getOperand(1);
42603 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42604 In.getValueSizeInBits() /
42605 VT.getScalarSizeInBits());
42606 In = DAG.getBitcast(SubVT, In);
42607 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42608 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42609 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42610 V.getOperand(2));
42611 }
42612 }
42613
42614 return SDValue();
42615 }
42616 case X86ISD::BLENDI: {
42617 SDValue N0 = N.getOperand(0);
42618 SDValue N1 = N.getOperand(1);
42619 unsigned EltBits = VT.getScalarSizeInBits();
42620
42621 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42622 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42623 // TODO: Handle MVT::v16i16 repeated blend mask.
42624 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42625 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42626 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42627 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42628 unsigned NewSize = SrcVT.getVectorNumElements();
42629 APInt BlendMask = getBLENDIBlendMask(N);
42630 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42631 return DAG.getBitcast(
42632 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42633 N1.getOperand(0),
42634 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42635 DL, MVT::i8)));
42636 }
42637 }
42638 // Share PSHUFB masks:
42639 // blend(pshufb(x,m1),pshufb(y,m2))
42640 // --> m3 = blend(m1,m2)
42641 // blend(pshufb(x,m3),pshufb(y,m3))
42642 if (N0.hasOneUse() && N1.hasOneUse()) {
42643 SmallVector<int> Mask, ByteMask;
42644 SmallVector<SDValue> Ops;
42645 SDValue LHS = peekThroughOneUseBitcasts(N0);
42646 SDValue RHS = peekThroughOneUseBitcasts(N1);
42647 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42648 RHS.getOpcode() == X86ISD::PSHUFB &&
42649 LHS.getOperand(1) != RHS.getOperand(1) &&
42650 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42651 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42652 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42653 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42654 "BLENDI decode mismatch");
42655 MVT ShufVT = LHS.getSimpleValueType();
42656 SDValue MaskLHS = LHS.getOperand(1);
42657 SDValue MaskRHS = RHS.getOperand(1);
42658 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42659 if (SDValue NewMask = combineX86ShufflesConstants(
42660 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42661 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42662 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42663 LHS.getOperand(0), NewMask);
42664 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42665 RHS.getOperand(0), NewMask);
42666 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42667 DAG.getBitcast(VT, NewLHS),
42668 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42669 }
42670 }
42671 }
42672 }
42673 return SDValue();
42674 }
42675 case X86ISD::SHUFP: {
42676 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42677 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42678 // TODO: Support types other than v4f32.
42679 if (VT == MVT::v4f32) {
42680 bool Updated = false;
42681 SmallVector<int> Mask;
42682 SmallVector<SDValue> Ops;
42683 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42684 for (int i = 0; i != 2; ++i) {
42685 SmallVector<SDValue> SubOps;
42686 SmallVector<int> SubMask, SubScaledMask;
42687 SDValue Sub = peekThroughBitcasts(Ops[i]);
42688 // TODO: Scaling might be easier if we specify the demanded elts.
42689 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42690 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42691 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42692 int Ofs = i * 2;
42693 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42694 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42695 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42696 Updated = true;
42697 }
42698 }
42699 }
42700 if (Updated) {
42701 for (int &M : Mask)
42702 M %= 4;
42703 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42704 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42705 }
42706 }
42707 return SDValue();
42708 }
42709 case X86ISD::VPERMI: {
42710 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42711 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42712 SDValue N0 = N.getOperand(0);
42713 SDValue N1 = N.getOperand(1);
42714 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42715 if (N0.getOpcode() == ISD::BITCAST &&
42716 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42717 SDValue Src = N0.getOperand(0);
42718 EVT SrcVT = Src.getValueType();
42719 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42720 return DAG.getBitcast(VT, Res);
42721 }
42722 return SDValue();
42723 }
42724 case X86ISD::SHUF128: {
42725 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42726 // see if we can peek through and access the subvector directly.
42727 if (VT.is512BitVector()) {
42728 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42729 // the upper subvector is used.
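// Bits 1 and 3 of the immediate are the msbs of the two selectors that read
// from the LHS (bits 5 and 7 for the RHS), so Mask & 0x0A / Mask & 0xA0 test
// whether only the upper 256 bits of that source are referenced.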
42730 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42731 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42732 uint64_t Mask = N->getConstantOperandVal(2);
42733 SmallVector<SDValue> LHSOps, RHSOps;
42734 SDValue NewLHS, NewRHS;
42735 if ((Mask & 0x0A) == 0x0A &&
42736 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42737 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42738 Mask &= ~0x0A;
42739 }
42740 if ((Mask & 0xA0) == 0xA0 &&
42741 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42742 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42743 Mask &= ~0xA0;
42744 }
42745 if (NewLHS || NewRHS)
42746 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42747 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42748 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42749 DAG.getTargetConstant(Mask, DL, MVT::i8));
42750 }
42751 return SDValue();
42752 }
42753 case X86ISD::VPERM2X128: {
42754 SDValue LHS = N->getOperand(0);
42755 SDValue RHS = N->getOperand(1);
42756 unsigned Imm = N.getConstantOperandVal(2) & 255;
42757
42758 // Canonicalize unary/repeated operands to LHS.
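// Each nibble of the immediate selects a 128-bit half: values 0-1 pick from
// the first source and 2-3 from the second. Swapping the sources toggles bit 1
// of each selector (^0x22); with identical sources, clearing bit 1 (& ~0x22)
// keeps every selector pointing at the first operand.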
42759 if (LHS.isUndef() && !RHS.isUndef())
42760 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42761 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42762 if (LHS == RHS)
42763 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42764 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42765
42766 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42767 if (LHS.getOpcode() == ISD::BITCAST &&
42768 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42769 EVT SrcVT = LHS.getOperand(0).getValueType();
42770 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42771 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42772 DAG.getBitcast(SrcVT, LHS),
42773 DAG.getBitcast(SrcVT, RHS),
42774 N->getOperand(2)));
42775 }
42776 }
42777
42778 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42779 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42780 return Res;
42781
42782 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42783 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42784 auto FindSubVector128 = [&](unsigned Idx) {
42785 if (Idx > 3)
42786 return SDValue();
42787 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42788 SmallVector<SDValue> SubOps;
42789 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42790 return SubOps[Idx & 1];
42791 unsigned NumElts = Src.getValueType().getVectorNumElements();
42792 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42793 Src.getOperand(1).getValueSizeInBits() == 128 &&
42794 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42795 return Src.getOperand(1);
42796 }
42797 return SDValue();
42798 };
42799 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42800 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42801 MVT SubVT = VT.getHalfNumVectorElementsVT();
42802 SubLo = DAG.getBitcast(SubVT, SubLo);
42803 SubHi = DAG.getBitcast(SubVT, SubHi);
42804 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42805 }
42806 }
42807
42808 // Attempt to match VBROADCAST*128 subvector broadcast load.
42809 if (RHS.isUndef()) {
42810 SmallVector<int, 4> Mask;
42811 DecodeVPERM2X128Mask(4, Imm, Mask);
42812 if (isUndefOrInRange(Mask, 0, 4)) {
42813 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42814 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42815 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42816 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42817 MVT MemVT = VT.getHalfNumVectorElementsVT();
42818 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42819 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42820 cast<LoadSDNode>(LHS), Ofs, DAG);
42821 }
42822 }
42823 }
42824
42825 return SDValue();
42826 }
42827 case X86ISD::PSHUFD:
42828 case X86ISD::PSHUFLW:
42829 case X86ISD::PSHUFHW: {
42830 SDValue N0 = N.getOperand(0);
42831 SDValue N1 = N.getOperand(1);
42832 if (N0->hasOneUse()) {
42833 SDValue V = peekThroughOneUseBitcasts(N0);
42834 switch (V.getOpcode()) {
42835 case X86ISD::VSHL:
42836 case X86ISD::VSRL:
42837 case X86ISD::VSRA:
42838 case X86ISD::VSHLI:
42839 case X86ISD::VSRLI:
42840 case X86ISD::VSRAI:
42841 case X86ISD::VROTLI:
42842 case X86ISD::VROTRI: {
42843 MVT InnerVT = V.getSimpleValueType();
42844 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42845 SDValue Res = DAG.getNode(Opcode, DL, VT,
42846 DAG.getBitcast(VT, V.getOperand(0)), N1);
42847 Res = DAG.getBitcast(InnerVT, Res);
42848 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42849 return DAG.getBitcast(VT, Res);
42850 }
42851 break;
42852 }
42853 }
42854 }
42855
42856 Mask = getPSHUFShuffleMask(N);
42857 assert(Mask.size() == 4);
42858 break;
42859 }
42860 case X86ISD::MOVSD:
42861 case X86ISD::MOVSH:
42862 case X86ISD::MOVSS: {
42863 SDValue N0 = N.getOperand(0);
42864 SDValue N1 = N.getOperand(1);
42865
42866 // Canonicalize scalar FPOps:
42867 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42868 // If commutable, allow OP(N1[0], N0[0]).
42869 unsigned Opcode1 = N1.getOpcode();
42870 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42871 Opcode1 == ISD::FDIV) {
42872 SDValue N10 = N1.getOperand(0);
42873 SDValue N11 = N1.getOperand(1);
42874 if (N10 == N0 ||
42875 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42876 if (N10 != N0)
42877 std::swap(N10, N11);
42878 MVT SVT = VT.getVectorElementType();
42879 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42880 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42881 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42882 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42883 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42884 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42885 }
42886 }
42887
42888 return SDValue();
42889 }
42890 case X86ISD::INSERTPS: {
42891 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42892 SDValue Op0 = N.getOperand(0);
42893 SDValue Op1 = N.getOperand(1);
42894 unsigned InsertPSMask = N.getConstantOperandVal(2);
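// The INSERTPS immediate encodes: bits [7:6] = source element, bits [5:4] =
// destination element, bits [3:0] = zero mask.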
42895 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42896 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42897 unsigned ZeroMask = InsertPSMask & 0xF;
42898
42899 // If we zero out all elements from Op0 then we don't need to reference it.
42900 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42901 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42902 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42903
42904 // If we zero out the element from Op1 then we don't need to reference it.
42905 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42906 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42907 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42908
42909 // Attempt to merge insertps Op1 with an inner target shuffle node.
42910 SmallVector<int, 8> TargetMask1;
42911 SmallVector<SDValue, 2> Ops1;
42912 APInt KnownUndef1, KnownZero1;
42913 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42914 KnownZero1)) {
42915 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42916 // Zero/UNDEF insertion - zero out element and remove dependency.
42917 InsertPSMask |= (1u << DstIdx);
42918 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42919 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42920 }
42921 // Update insertps mask srcidx and reference the source input directly.
42922 int M = TargetMask1[SrcIdx];
42923 assert(0 <= M && M < 8 && "Shuffle index out of range");
42924 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42925 Op1 = Ops1[M < 4 ? 0 : 1];
42926 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42927 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42928 }
42929
42930 // Attempt to merge insertps Op0 with an inner target shuffle node.
42931 SmallVector<int, 8> TargetMask0;
42932 SmallVector<SDValue, 2> Ops0;
42933 APInt KnownUndef0, KnownZero0;
42934 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42935 KnownZero0)) {
42936 bool Updated = false;
42937 bool UseInput00 = false;
42938 bool UseInput01 = false;
42939 for (int i = 0; i != 4; ++i) {
42940 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42941 // No change if element is already zero or the inserted element.
42942 continue;
42943 }
42944
42945 if (KnownUndef0[i] || KnownZero0[i]) {
42946 // If the target mask is undef/zero then we must zero the element.
42947 InsertPSMask |= (1u << i);
42948 Updated = true;
42949 continue;
42950 }
42951
42952 // The input vector element must be inline.
42953 int M = TargetMask0[i];
42954 if (M != i && M != (i + 4))
42955 return SDValue();
42956
42957 // Determine which inputs of the target shuffle we're using.
42958 UseInput00 |= (0 <= M && M < 4);
42959 UseInput01 |= (4 <= M);
42960 }
42961
42962 // If we're not using both inputs of the target shuffle then use the
42963 // referenced input directly.
42964 if (UseInput00 && !UseInput01) {
42965 Updated = true;
42966 Op0 = Ops0[0];
42967 } else if (!UseInput00 && UseInput01) {
42968 Updated = true;
42969 Op0 = Ops0[1];
42970 }
42971
42972 if (Updated)
42973 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42974 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42975 }
42976
42977 // If we're inserting an element from a vbroadcast load, fold the
42978 // load into the X86insertps instruction. We need to convert the scalar
42979 // load to a vector and clear the source lane of the INSERTPS control.
42980 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42981 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42982 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42983 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42984 MemIntr->getBasePtr(),
42985 MemIntr->getMemOperand());
42986 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42987 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32,
42988 Load),
42989 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42990 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42991 return Insert;
42992 }
42993 }
42994
42995 return SDValue();
42996 }
42997 case X86ISD::VPERMV: {
42998 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
42999 SmallVector<int, 32> Mask;
43000 SmallVector<SDValue, 2> SrcOps, SubOps;
43001 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43002 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43003 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43004 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43005 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43006 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43007 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43008 "Unexpected split ops");
43009 // Bail if we were permuting a widened vector.
43010 if (SubOps[1].isUndef() &&
43011 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43012 return SDValue();
43013 // Bail if any subops would have folded into the concat.
43014 if (any_of(SubOps, isShuffleFoldableLoad))
43015 return SDValue();
43016 // Concat 4x128 back to 2x256.
43017 if (SubOps.size() == 4) {
43018 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43019 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43020 }
43021 // Convert mask to 2 operand shuffle.
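// Indices that referred to the upper half of the original source now live in
// the low half of Hi, which the two-operand shuffle addresses starting at
// NumElts, so remap them by adding HalfElts.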
43022 int HalfElts = NumElts / 2;
43023 for (int &M : Mask)
43024 M += M >= HalfElts ? HalfElts : 0;
43025 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43026 VT.getSizeInBits());
43027 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43028 VT.getSizeInBits());
43029 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43030 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43031 }
43032 return SDValue();
43033 }
43034 case X86ISD::VPERMV3: {
43035 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43036 bool CanConcat = VT.is128BitVector() ||
43037 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43038 SmallVector<SDValue, 2> SrcOps;
43039 SmallVector<int, 32> Mask;
43040 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43041 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43042 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43043 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43044 // Canonicalize to VPERMV if both sources are the same.
43045 if (V1 == V2) {
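// With identical sources, an index into the second operand (>= NumElts) refers
// to the same element in the first, so drop the operand-select bit by masking
// with NumElts - 1.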
43046 for (int &M : Mask)
43047 M = (M < 0 ? M : (M & (NumElts - 1)));
43048 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43049 DAG.getUNDEF(VT), Subtarget, DAG);
43050 }
43051 // If sources are half width, then concat and use VPERMV with adjusted
43052 // mask.
43053 SDValue Ops[2];
43054 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43055 if (sd_match(V1,
43057 sd_match(V2,
43059 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43060 if (SDValue ConcatSrc =
43061 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43062 for (int &M : Mask)
43063 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43064 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43065 DAG.getUNDEF(VT), Subtarget, DAG);
43066 }
43067 }
43068 // Commute foldable source to the RHS.
43069 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43070 !isShuffleFoldableLoad(N.getOperand(2))) {
43071 ShuffleVectorSDNode::commuteMask(Mask);
43072 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43073 N.getOperand(0), Subtarget, DAG);
43074 }
43075 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43076 // freely concatenated, with a commuted shuffle mask.
43077 if (CanConcat) {
43078 if (SDValue ConcatSrc = combineConcatVectorOps(
43079 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43080 Subtarget)) {
43081 ShuffleVectorSDNode::commuteMask(Mask);
43082 Mask.append(NumElts, SM_SentinelUndef);
43083 SDValue Perm =
43084 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43085 DAG.getUNDEF(WideVT), Subtarget, DAG);
43086 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43087 DAG.getVectorIdxConstant(0, DL));
43088 }
43089 }
43090 }
43091 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43092 // freely concatenated.
43093 if (CanConcat) {
43094 if (SDValue ConcatSrc = combineConcatVectorOps(
43095 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43096 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43097 DL, WideVT.getSizeInBits());
43098 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43099 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43100 DAG.getVectorIdxConstant(0, DL));
43101 }
43102 }
43103 return SDValue();
43104 }
43105 default:
43106 return SDValue();
43107 }
43108
43109 // Nuke no-op shuffles that show up after combining.
43110 if (isNoopShuffleMask(Mask))
43111 return N.getOperand(0);
43112
43113 // Look for simplifications involving one or two shuffle instructions.
43114 SDValue V = N.getOperand(0);
43115 switch (N.getOpcode()) {
43116 default:
43117 break;
43118 case X86ISD::PSHUFLW:
43119 case X86ISD::PSHUFHW:
43120 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43121
43122 // See if this reduces to a PSHUFD which is no more expensive and can
43123 // combine with more operations. Note that it has to at least flip the
43124 // dwords as otherwise it would have been removed as a no-op.
43125 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43126 int DMask[] = {0, 1, 2, 3};
43127 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43128 DMask[DOffset + 0] = DOffset + 1;
43129 DMask[DOffset + 1] = DOffset + 0;
43130 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43131 V = DAG.getBitcast(DVT, V);
43132 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43133 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43134 return DAG.getBitcast(VT, V);
43135 }
43136
43137 // Look for shuffle patterns which can be implemented as a single unpack.
43138 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43139 // only works when we have a PSHUFD followed by two half-shuffles.
43140 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43141 (V.getOpcode() == X86ISD::PSHUFLW ||
43142 V.getOpcode() == X86ISD::PSHUFHW) &&
43143 V.getOpcode() != N.getOpcode() &&
43144 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43145 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43146 if (D.getOpcode() == X86ISD::PSHUFD) {
43147 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43148 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43149 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43150 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43151 int WordMask[8];
43152 for (int i = 0; i < 4; ++i) {
43153 WordMask[i + NOffset] = Mask[i] + NOffset;
43154 WordMask[i + VOffset] = VMask[i] + VOffset;
43155 }
43156 // Map the word mask through the DWord mask.
43157 int MappedMask[8];
43158 for (int i = 0; i < 8; ++i)
43159 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43160 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43161 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43162 // We can replace all three shuffles with an unpack.
43163 V = DAG.getBitcast(VT, D.getOperand(0));
43164 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43165 : X86ISD::UNPCKH,
43166 DL, VT, V, V);
43167 }
43168 }
43169 }
43170
43171 break;
43172
43173 case X86ISD::PSHUFD:
43174 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43175 return NewN;
43176
43177 break;
43178 }
43179
43180 return SDValue();
43181}
43182
43183/// Checks if the shuffle mask takes subsequent elements
43184/// alternately from two vectors.
43185/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43186static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43187
43188 int ParitySrc[2] = {-1, -1};
43189 unsigned Size = Mask.size();
43190 for (unsigned i = 0; i != Size; ++i) {
43191 int M = Mask[i];
43192 if (M < 0)
43193 continue;
43194
43195 // Make sure we are using the matching element from the input.
43196 if ((M % Size) != i)
43197 return false;
43198
43199 // Make sure we use the same input for all elements of the same parity.
43200 int Src = M / Size;
43201 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43202 return false;
43203 ParitySrc[i % 2] = Src;
43204 }
43205
43206 // Make sure each input is used.
43207 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43208 return false;
43209
43210 Op0Even = ParitySrc[0] == 0;
43211 return true;
43212}
43213
43214 /// Returns true iff the shuffle node \p N can be replaced with an
43215 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
43216 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
43217 ///
43218 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
43219 /// shuffle nodes so they are easier to match generically. We also insert dummy
43220 /// vector shuffle nodes for the operands which explicitly discard the lanes
43221 /// which are unused by this operation, to try to propagate the fact that they
43222 /// are unused through the rest of the combiner.
43223static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43224 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43225 bool &IsSubAdd, bool &HasAllowContract) {
43226
43227 EVT VT = N->getValueType(0);
43228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43229 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43230 !VT.getSimpleVT().isFloatingPoint())
43231 return false;
43232
43233 // We only handle target-independent shuffles.
43234 // FIXME: It would be easy and harmless to use the target shuffle mask
43235 // extraction tool to support more.
43236 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43237 return false;
43238
43239 SDValue V1 = N->getOperand(0);
43240 SDValue V2 = N->getOperand(1);
43241
43242 // Make sure we have an FADD and an FSUB.
43243 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43244 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43245 V1.getOpcode() == V2.getOpcode())
43246 return false;
43247
43248 // If there are other uses of these operations we can't fold them.
43249 if (!V1->hasOneUse() || !V2->hasOneUse())
43250 return false;
43251
43252 // Ensure that both operations have the same operands. Note that we can
43253 // commute the FADD operands.
43254 SDValue LHS, RHS;
43255 if (V1.getOpcode() == ISD::FSUB) {
43256 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43257 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43258 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43259 return false;
43260 } else {
43261 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43262 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43263 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43264 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43265 return false;
43266 }
43267
43268 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43269 bool Op0Even;
43270 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43271 return false;
43272
43273 // It's a subadd if the vector in the even parity is an FADD.
43274 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43275 : V2->getOpcode() == ISD::FADD;
43276 HasAllowContract =
43277 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43278
43279 Opnd0 = LHS;
43280 Opnd1 = RHS;
43281 return true;
43282}
43283
43284/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43285 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43286 const X86Subtarget &Subtarget,
43287 SelectionDAG &DAG) {
43288 // We only handle target-independent shuffles.
43289 // FIXME: It would be easy and harmless to use the target shuffle mask
43290 // extraction tool to support more.
43291 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43292 return SDValue();
43293
43294 MVT VT = N->getSimpleValueType(0);
43295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43296 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43297 return SDValue();
43298
43299 // We're trying to match (shuffle fma(a, b, c), X86ISD::FMSUB(a, b, c)).
43300 SDValue Op0 = N->getOperand(0);
43301 SDValue Op1 = N->getOperand(1);
43302 SDValue FMAdd = Op0, FMSub = Op1;
43303 if (FMSub.getOpcode() != X86ISD::FMSUB)
43304 std::swap(FMAdd, FMSub);
43305
43306 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43307 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43308 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43309 FMAdd.getOperand(2) != FMSub.getOperand(2))
43310 return SDValue();
43311
43312 // Check for correct shuffle mask.
43313 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43314 bool Op0Even;
43315 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43316 return SDValue();
43317
43318 // FMAddSub takes zeroth operand from FMSub node.
43319 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43320 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43321 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43322 FMAdd.getOperand(2));
43323}
43324
43325/// Try to combine a shuffle into a target-specific add-sub or
43326/// mul-add-sub node.
43327 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43328 const X86Subtarget &Subtarget,
43329 SelectionDAG &DAG) {
43330 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43331 return V;
43332
43333 SDValue Opnd0, Opnd1;
43334 bool IsSubAdd;
43335 bool HasAllowContract;
43336 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43337 HasAllowContract))
43338 return SDValue();
43339
43340 MVT VT = N->getSimpleValueType(0);
43341
43342 // Try to generate X86ISD::FMADDSUB node here.
43343 SDValue Opnd2;
43344 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43345 HasAllowContract)) {
43346 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43347 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43348 }
43349
43350 if (IsSubAdd)
43351 return SDValue();
43352
43353 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43354 // the ADDSUB idiom has been successfully recognized. There are no known
43355 // X86 targets with 512-bit ADDSUB instructions!
43356 if (VT.is512BitVector())
43357 return SDValue();
43358
43359 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43360 // the ADDSUB idiom has been successfully recognized. There are no known
43361 // X86 targets with FP16 ADDSUB instructions!
43362 if (VT.getVectorElementType() == MVT::f16)
43363 return SDValue();
43364
43365 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43366}
43367
43368/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43369/// low half of each source vector and does not set any high half elements in
43370/// the destination vector, narrow the shuffle to half its original size.
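/// For example, a v8f32 shuffle whose mask only references elements 0-3 of each
/// source and leaves the upper half of the result undef can instead be done as
/// a v4f32 shuffle of the low xmm halves followed by a free subvector insert.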
43371 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43372 EVT VT = Shuf->getValueType(0);
43373 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43374 return SDValue();
43375 if (!VT.is256BitVector() && !VT.is512BitVector())
43376 return SDValue();
43377
43378 // See if we can ignore all of the high elements of the shuffle.
43379 ArrayRef<int> Mask = Shuf->getMask();
43380 if (!isUndefUpperHalf(Mask))
43381 return SDValue();
43382
43383 // Check if the shuffle mask accesses only the low half of each input vector
43384 // (half-index output is 0 or 2).
43385 int HalfIdx1, HalfIdx2;
43386 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43387 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43388 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43389 return SDValue();
43390
43391 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43392 // The trick is knowing that all of the insert/extract are actually free
43393 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43394 // of narrow inputs into a narrow output, and that is always cheaper than
43395 // the wide shuffle that we started with.
43396 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43397 Shuf->getOperand(1), HalfMask, HalfIdx1,
43398 HalfIdx2, false, DAG, /*UseConcat*/ true);
43399}
43400
43401 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43402 TargetLowering::DAGCombinerInfo &DCI,
43403 const X86Subtarget &Subtarget) {
43404 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43405 if (SDValue V = narrowShuffle(Shuf, DAG))
43406 return V;
43407
43408 // If we have legalized the vector types, look for blends of FADD and FSUB
43409 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43410 SDLoc dl(N);
43411 EVT VT = N->getValueType(0);
43412 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43413 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43414 if (SDValue AddSub =
43415 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43416 return AddSub;
43417
43418 // Attempt to combine into a vector load/broadcast.
43419 if (SDValue LD = combineToConsecutiveLoads(
43420 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43421 return LD;
43422
43423 if (isTargetShuffle(N->getOpcode())) {
43424 SDValue Op(N, 0);
43425 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43426 return Shuffle;
43427
43428 // Try recursively combining arbitrary sequences of x86 shuffle
43429 // instructions into higher-order shuffles. We do this after combining
43430 // specific PSHUF instruction sequences into their minimal form so that we
43431 // can evaluate how many specialized shuffle instructions are involved in
43432 // a particular chain.
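// For example, a PSHUFLW feeding a PSHUFD can often be folded into a single
// PSHUFB (or an even simpler shuffle) once the whole chain is visible.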
43433 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43434 return Res;
43435
43436 // Simplify source operands based on shuffle mask.
43437 // TODO - merge this into combineX86ShufflesRecursively.
43438 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43439 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43440 return SDValue(N, 0);
43441
43442 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43443 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43444 // Perform this after other shuffle combines to allow inner shuffles to be
43445 // combined away first.
43446 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43447 return BinOp;
43448 }
43449
43450 return SDValue();
43451}
43452
43453// Simplify variable target shuffle masks based on the demanded elements.
43454// TODO: Handle DemandedBits in mask indices as well?
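// For example, if only the low half of a PSHUFB result is demanded and its mask
// is a constant pool load, the undemanded mask elements can be rewritten to
// undef, which may let the constant (or the load feeding it) be simplified.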
43455 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43456 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43457 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43458 // If we're demanding all elements don't bother trying to simplify the mask.
43459 unsigned NumElts = DemandedElts.getBitWidth();
43460 if (DemandedElts.isAllOnes())
43461 return false;
43462
43463 SDValue Mask = Op.getOperand(MaskIndex);
43464 if (!Mask.hasOneUse())
43465 return false;
43466
43467 // Attempt to generically simplify the variable shuffle mask.
43468 APInt MaskUndef, MaskZero;
43469 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43470 Depth + 1))
43471 return true;
43472
43473 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43474 // TODO: Support other types from getTargetShuffleMaskIndices?
43475 SDValue BC = peekThroughOneUseBitcasts(Mask);
43476 EVT BCVT = BC.getValueType();
43477 auto *Load = dyn_cast<LoadSDNode>(BC);
43478 if (!Load || !Load->getBasePtr().hasOneUse())
43479 return false;
43480
43481 const Constant *C = getTargetConstantFromNode(Load);
43482 if (!C)
43483 return false;
43484
43485 Type *CTy = C->getType();
43486 if (!CTy->isVectorTy() ||
43487 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43488 return false;
43489
43490 // Handle scaling for i64 elements on 32-bit targets.
43491 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43492 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43493 return false;
43494 unsigned Scale = NumCstElts / NumElts;
43495
43496 // Simplify mask if we have an undemanded element that is not undef.
43497 bool Simplified = false;
43498 SmallVector<Constant *, 32> ConstVecOps;
43499 for (unsigned i = 0; i != NumCstElts; ++i) {
43500 Constant *Elt = C->getAggregateElement(i);
43501 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43502 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43503 Simplified = true;
43504 continue;
43505 }
43506 ConstVecOps.push_back(Elt);
43507 }
43508 if (!Simplified)
43509 return false;
43510
43511 // Generate new constant pool entry + legalize immediately for the load.
43512 SDLoc DL(Op);
43513 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43514 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43515 SDValue NewMask = TLO.DAG.getLoad(
43516 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43517 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43518 Load->getAlign());
43519 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43520}
43521
43522 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43523 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43524 TargetLoweringOpt &TLO, unsigned Depth) const {
43525 int NumElts = DemandedElts.getBitWidth();
43526 unsigned Opc = Op.getOpcode();
43527 EVT VT = Op.getValueType();
43528
43529 // Handle special case opcodes.
43530 switch (Opc) {
43531 case X86ISD::PMULDQ:
43532 case X86ISD::PMULUDQ: {
43533 APInt LHSUndef, LHSZero;
43534 APInt RHSUndef, RHSZero;
43535 SDValue LHS = Op.getOperand(0);
43536 SDValue RHS = Op.getOperand(1);
43537 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43538 Depth + 1))
43539 return true;
43540 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43541 Depth + 1))
43542 return true;
43543 // Multiply by zero.
43544 KnownZero = LHSZero | RHSZero;
43545 break;
43546 }
43547 case X86ISD::VPMADDUBSW:
43548 case X86ISD::VPMADDWD: {
43549 APInt LHSUndef, LHSZero;
43550 APInt RHSUndef, RHSZero;
43551 SDValue LHS = Op.getOperand(0);
43552 SDValue RHS = Op.getOperand(1);
43553 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43554
43555 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43556 Depth + 1))
43557 return true;
43558 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43559 Depth + 1))
43560 return true;
43561
43562 // TODO: Multiply by zero.
43563
43564 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43565 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43566 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43567 Depth + 1))
43568 return true;
43569 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43570 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43571 Depth + 1))
43572 return true;
43573 break;
43574 }
43575 case X86ISD::PSADBW: {
43576 SDValue LHS = Op.getOperand(0);
43577 SDValue RHS = Op.getOperand(1);
43578 assert(VT.getScalarType() == MVT::i64 &&
43579 LHS.getValueType() == RHS.getValueType() &&
43580 LHS.getValueType().getScalarType() == MVT::i8 &&
43581 "Unexpected PSADBW types");
43582
43583 // Aggressively peek through ops to get at the demanded elts.
43584 if (!DemandedElts.isAllOnes()) {
43585 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43586 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43587 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43588 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43589 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43590 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43591 if (NewLHS || NewRHS) {
43592 NewLHS = NewLHS ? NewLHS : LHS;
43593 NewRHS = NewRHS ? NewRHS : RHS;
43594 return TLO.CombineTo(
43595 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43596 }
43597 }
43598 break;
43599 }
43600 case X86ISD::VSHL:
43601 case X86ISD::VSRL:
43602 case X86ISD::VSRA: {
43603 // We only need the bottom 64-bits of the (128-bit) shift amount.
43604 SDValue Amt = Op.getOperand(1);
43605 MVT AmtVT = Amt.getSimpleValueType();
43606 assert(AmtVT.is128BitVector() && "Unexpected value type");
43607
43608 // If we reuse the shift amount just for sse shift amounts then we know that
43609 // only the bottom 64-bits are ever used.
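// For example, PSLLQ/PSRLQ/PSRAD-style shifts by an xmm amount only read bits
// 63:0 of that operand, so its upper elements can safely be treated as undef.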
43610 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43611 unsigned UseOpc = Use->getOpcode();
43612 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43613 UseOpc == X86ISD::VSRA) &&
43614 Use->getOperand(0) != Amt;
43615 });
43616
43617 APInt AmtUndef, AmtZero;
43618 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43619 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43620 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43621 Depth + 1, AssumeSingleUse))
43622 return true;
43623 [[fallthrough]];
43624 }
43625 case X86ISD::VSHLI:
43626 case X86ISD::VSRLI:
43627 case X86ISD::VSRAI: {
43628 SDValue Src = Op.getOperand(0);
43629 APInt SrcUndef;
43630 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43631 Depth + 1))
43632 return true;
43633
43634 // Fold shift(0,x) -> 0
43635 if (DemandedElts.isSubsetOf(KnownZero))
43636 return TLO.CombineTo(
43637 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43638
43639 // Aggressively peek through ops to get at the demanded elts.
43640 if (!DemandedElts.isAllOnes())
43641 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43642 Src, DemandedElts, TLO.DAG, Depth + 1))
43643 return TLO.CombineTo(
43644 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43645 break;
43646 }
43647 case X86ISD::VPSHA:
43648 case X86ISD::VPSHL:
43649 case X86ISD::VSHLV:
43650 case X86ISD::VSRLV:
43651 case X86ISD::VSRAV: {
43652 APInt LHSUndef, LHSZero;
43653 APInt RHSUndef, RHSZero;
43654 SDValue LHS = Op.getOperand(0);
43655 SDValue RHS = Op.getOperand(1);
43656 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43657 Depth + 1))
43658 return true;
43659
43660 // Fold shift(0,x) -> 0
43661 if (DemandedElts.isSubsetOf(LHSZero))
43662 return TLO.CombineTo(
43663 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43664
43665 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43666 Depth + 1))
43667 return true;
43668
43669 KnownZero = LHSZero;
43670 break;
43671 }
43672 case X86ISD::CMPM:
43673 case X86ISD::CMPP: {
43674 // Scalarize packed fp comparison if we only require element 0.
43675 if (DemandedElts == 1) {
43676 SDLoc dl(Op);
43677 MVT VT = Op.getSimpleValueType();
43678 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43679 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43680 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43681 SDValue CC = Op.getOperand(2);
43682 if (Opc == X86ISD::CMPM) {
43683 SDValue Cmp =
43684 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43685 return TLO.CombineTo(
43686 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43687 }
43688 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43689 return TLO.CombineTo(Op,
43690 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43691 }
43692 break;
43693 }
43694 case X86ISD::PCMPEQ:
43695 case X86ISD::PCMPGT: {
43696 APInt LHSUndef, LHSZero;
43697 APInt RHSUndef, RHSZero;
43698 SDValue LHS = Op.getOperand(0);
43699 SDValue RHS = Op.getOperand(1);
43700 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43701 Depth + 1))
43702 return true;
43703 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43704 Depth + 1))
43705 return true;
43706 break;
43707 }
43708 case X86ISD::KSHIFTL: {
43709 SDValue Src = Op.getOperand(0);
43710 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43711 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43712 unsigned ShiftAmt = Amt->getZExtValue();
43713
43714 if (ShiftAmt == 0)
43715 return TLO.CombineTo(Op, Src);
43716
43717 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43718 // single shift. We can do this if the bottom bits (which are shifted
43719 // out) are never demanded.
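// e.g. kshiftl(kshiftr(X, 2), 3) --> kshiftl(X, 1) when the low 3 result
// bits are not demanded.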
43720 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43721 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43722 unsigned C1 = Src.getConstantOperandVal(1);
43723 unsigned NewOpc = X86ISD::KSHIFTL;
43724 int Diff = ShiftAmt - C1;
43725 if (Diff < 0) {
43726 Diff = -Diff;
43727 NewOpc = X86ISD::KSHIFTR;
43728 }
43729
43730 SDLoc dl(Op);
43731 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43732 return TLO.CombineTo(
43733 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43734 }
43735 }
43736
43737 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43738 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43739 Depth + 1))
43740 return true;
43741
43742 KnownUndef <<= ShiftAmt;
43743 KnownZero <<= ShiftAmt;
43744 KnownZero.setLowBits(ShiftAmt);
43745 break;
43746 }
43747 case X86ISD::KSHIFTR: {
43748 SDValue Src = Op.getOperand(0);
43749 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43750 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43751 unsigned ShiftAmt = Amt->getZExtValue();
43752
43753 if (ShiftAmt == 0)
43754 return TLO.CombineTo(Op, Src);
43755
43756 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43757 // single shift. We can do this if the top bits (which are shifted
43758 // out) are never demanded.
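// e.g. kshiftr(kshiftl(X, 3), 2) --> kshiftl(X, 1) when the top 2 result
// bits are not demanded.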
43759 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43760 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43761 unsigned C1 = Src.getConstantOperandVal(1);
43762 unsigned NewOpc = X86ISD::KSHIFTR;
43763 int Diff = ShiftAmt - C1;
43764 if (Diff < 0) {
43765 Diff = -Diff;
43766 NewOpc = X86ISD::KSHIFTL;
43767 }
43768
43769 SDLoc dl(Op);
43770 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43771 return TLO.CombineTo(
43772 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43773 }
43774 }
43775
43776 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43777 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43778 Depth + 1))
43779 return true;
43780
43781 KnownUndef.lshrInPlace(ShiftAmt);
43782 KnownZero.lshrInPlace(ShiftAmt);
43783 KnownZero.setHighBits(ShiftAmt);
43784 break;
43785 }
43786 case X86ISD::ANDNP: {
43787 // ANDNP = (~LHS & RHS);
43788 SDValue LHS = Op.getOperand(0);
43789 SDValue RHS = Op.getOperand(1);
43790
43791 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43792 APInt UndefElts;
43793 SmallVector<APInt> EltBits;
43794 int NumElts = VT.getVectorNumElements();
43795 int EltSizeInBits = VT.getScalarSizeInBits();
43796 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43797 APInt OpElts = DemandedElts;
43798 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43799 EltBits)) {
43800 OpBits.clearAllBits();
43801 OpElts.clearAllBits();
43802 for (int I = 0; I != NumElts; ++I) {
43803 if (!DemandedElts[I])
43804 continue;
43805 if (UndefElts[I]) {
43806 // We can't assume an undef src element gives an undef dst - the
43807 // other src might be zero.
43808 OpBits.setAllBits();
43809 OpElts.setBit(I);
43810 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43811 (!Invert && !EltBits[I].isZero())) {
43812 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43813 OpElts.setBit(I);
43814 }
43815 }
43816 }
43817 return std::make_pair(OpBits, OpElts);
43818 };
43819 APInt BitsLHS, EltsLHS;
43820 APInt BitsRHS, EltsRHS;
43821 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43822 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43823
43824 APInt LHSUndef, LHSZero;
43825 APInt RHSUndef, RHSZero;
43826 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43827 Depth + 1))
43828 return true;
43829 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43830 Depth + 1))
43831 return true;
43832
43833 if (!DemandedElts.isAllOnes()) {
43834 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43835 TLO.DAG, Depth + 1);
43836 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43837 TLO.DAG, Depth + 1);
43838 if (NewLHS || NewRHS) {
43839 NewLHS = NewLHS ? NewLHS : LHS;
43840 NewRHS = NewRHS ? NewRHS : RHS;
43841 return TLO.CombineTo(
43842 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43843 }
43844 }
43845 break;
43846 }
43847 case X86ISD::CVTSI2P:
43848 case X86ISD::CVTUI2P:
43849 case X86ISD::CVTPH2PS:
43850 case X86ISD::CVTPS2PH: {
43851 SDValue Src = Op.getOperand(0);
43852 EVT SrcVT = Src.getValueType();
43853 APInt SrcUndef, SrcZero;
43854 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43855 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43856 Depth + 1))
43857 return true;
43858 break;
43859 }
43860 case X86ISD::PACKSS:
43861 case X86ISD::PACKUS: {
43862 SDValue N0 = Op.getOperand(0);
43863 SDValue N1 = Op.getOperand(1);
43864
43865 APInt DemandedLHS, DemandedRHS;
43866 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43867
43868 APInt LHSUndef, LHSZero;
43869 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43870 Depth + 1))
43871 return true;
43872 APInt RHSUndef, RHSZero;
43873 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43874 Depth + 1))
43875 return true;
43876
43877 // TODO - pass on known zero/undef.
43878
43879 // Aggressively peek through ops to get at the demanded elts.
43880 // TODO - we should do this for all target/faux shuffles ops.
43881 if (!DemandedElts.isAllOnes()) {
43882 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43883 TLO.DAG, Depth + 1);
43884 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43885 TLO.DAG, Depth + 1);
43886 if (NewN0 || NewN1) {
43887 NewN0 = NewN0 ? NewN0 : N0;
43888 NewN1 = NewN1 ? NewN1 : N1;
43889 return TLO.CombineTo(Op,
43890 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43891 }
43892 }
43893 break;
43894 }
43895 case X86ISD::HADD:
43896 case X86ISD::HSUB:
43897 case X86ISD::FHADD:
43898 case X86ISD::FHSUB: {
43899 SDValue N0 = Op.getOperand(0);
43900 SDValue N1 = Op.getOperand(1);
43901
43902 APInt DemandedLHS, DemandedRHS;
43903 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43904
43905 APInt LHSUndef, LHSZero;
43906 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43907 Depth + 1))
43908 return true;
43909 APInt RHSUndef, RHSZero;
43910 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43911 Depth + 1))
43912 return true;
43913
43914 // TODO - pass on known zero/undef.
43915
43916 // Aggressively peek through ops to get at the demanded elts.
43917 // TODO: Handle repeated operands.
43918 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43919 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43920 TLO.DAG, Depth + 1);
43921 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43922 TLO.DAG, Depth + 1);
43923 if (NewN0 || NewN1) {
43924 NewN0 = NewN0 ? NewN0 : N0;
43925 NewN1 = NewN1 ? NewN1 : N1;
43926 return TLO.CombineTo(Op,
43927 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43928 }
43929 }
43930 break;
43931 }
43932 case X86ISD::VTRUNC:
43933 case X86ISD::VTRUNCS:
43934 case X86ISD::VTRUNCUS: {
43935 SDValue Src = Op.getOperand(0);
43936 MVT SrcVT = Src.getSimpleValueType();
43937 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43938 APInt SrcUndef, SrcZero;
43939 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43940 Depth + 1))
43941 return true;
43942 KnownZero = SrcZero.zextOrTrunc(NumElts);
43943 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43944 break;
43945 }
43946 case X86ISD::BLENDI: {
43947 SmallVector<int, 16> BlendMask;
43948 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43949 if (SDValue R = combineBlendOfPermutes(
43950 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43951 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43952 return TLO.CombineTo(Op, R);
43953 break;
43954 }
43955 case X86ISD::BLENDV: {
43956 APInt SelUndef, SelZero;
43957 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43958 SelZero, TLO, Depth + 1))
43959 return true;
43960
43961 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43962 APInt LHSUndef, LHSZero;
43963 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43964 LHSZero, TLO, Depth + 1))
43965 return true;
43966
43967 APInt RHSUndef, RHSZero;
43968 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43969 RHSZero, TLO, Depth + 1))
43970 return true;
43971
43972 KnownZero = LHSZero & RHSZero;
43973 KnownUndef = LHSUndef & RHSUndef;
43974 break;
43975 }
43976 case X86ISD::VZEXT_MOVL: {
43977 // If upper demanded elements are already zero then we have nothing to do.
43978 SDValue Src = Op.getOperand(0);
43979 APInt DemandedUpperElts = DemandedElts;
43980 DemandedUpperElts.clearLowBits(1);
43981 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43982 return TLO.CombineTo(Op, Src);
43983 break;
43984 }
43985 case X86ISD::VZEXT_LOAD: {
43986 // If upper demanded elements are not demanded then simplify to a
43987 // scalar_to_vector(load()).
43988 MVT SVT = VT.getSimpleVT().getVectorElementType();
43989 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43990 SDLoc DL(Op);
43991 auto *Mem = cast<MemSDNode>(Op);
43992 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43993 Mem->getMemOperand());
43994 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43995 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43996 }
43997 break;
43998 }
43999 case X86ISD::VBROADCAST: {
44000 SDValue Src = Op.getOperand(0);
44001 MVT SrcVT = Src.getSimpleValueType();
44002 // Don't bother broadcasting if we just need the 0'th element.
44003 if (DemandedElts == 1) {
44004 if (!SrcVT.isVector())
44005 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44006 else if (Src.getValueType() != VT)
44007 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44008 SDLoc(Op));
44009 return TLO.CombineTo(Op, Src);
44010 }
44011 if (!SrcVT.isVector())
44012 break;
44013 APInt SrcUndef, SrcZero;
44014 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44015 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44016 Depth + 1))
44017 return true;
44018 // Aggressively peek through src to get at the demanded elt.
44019 // TODO - we should do this for all target/faux shuffles ops.
44020 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
44021 Src, SrcElts, TLO.DAG, Depth + 1))
44022 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44023 break;
44024 }
44025 case X86ISD::VPERMV:
44026 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44027 Depth))
44028 return true;
44029 break;
44030 case X86ISD::PSHUFB:
44031 case X86ISD::VPERMV3:
44032 case X86ISD::VPERMILPV:
44033 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44034 Depth))
44035 return true;
44036 break;
44037 case X86ISD::VPPERM:
44038 case X86ISD::VPERMIL2:
44039 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44040 Depth))
44041 return true;
44042 break;
44043 }
44044
44045 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44046 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44047 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44048 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44049 DemandedElts.lshr(NumElts / 2) == 0) {
44050 unsigned SizeInBits = VT.getSizeInBits();
44051 unsigned ExtSizeInBits = SizeInBits / 2;
44052
44053 // See if 512-bit ops only use the bottom 128-bits.
44054 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44055 ExtSizeInBits = SizeInBits / 4;
44056
44057 switch (Opc) {
44058 // Scalar broadcast.
44059 case X86ISD::VBROADCAST: {
44060 SDLoc DL(Op);
44061 SDValue Src = Op.getOperand(0);
44062 if (Src.getValueSizeInBits() > ExtSizeInBits)
44063 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44064 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44065 ExtSizeInBits / VT.getScalarSizeInBits());
44066 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44067 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44068 TLO.DAG, DL, ExtSizeInBits));
44069 }
44070 case X86ISD::VBROADCAST_LOAD: {
44071 SDLoc DL(Op);
44072 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44073 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44074 ExtSizeInBits / VT.getScalarSizeInBits());
44075 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44076 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44077 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44078 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44079 MemIntr->getMemOperand());
44080 TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
44081 Bcst.getValue(1));
44082 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44083 TLO.DAG, DL, ExtSizeInBits));
44084 }
44085 // Subvector broadcast.
44086 case X86ISD::SUBV_BROADCAST_LOAD: {
44087 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44088 EVT MemVT = MemIntr->getMemoryVT();
44089 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44090 SDLoc DL(Op);
44091 SDValue Ld =
44092 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44093 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44094 TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
44095 Ld.getValue(1));
44096 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44097 TLO.DAG, DL, ExtSizeInBits));
44098 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44099 SDLoc DL(Op);
44100 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44101 ExtSizeInBits / VT.getScalarSizeInBits());
44102 if (SDValue BcstLd =
44103 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44104 return TLO.CombineTo(Op,
44105 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44106 TLO.DAG, DL, ExtSizeInBits));
44107 }
44108 break;
44109 }
44110 // Byte shifts by immediate.
44111 case X86ISD::VSHLDQ:
44112 case X86ISD::VSRLDQ:
44113 // Shift by uniform.
44114 case X86ISD::VSHL:
44115 case X86ISD::VSRL:
44116 case X86ISD::VSRA:
44117 // Shift by immediate.
44118 case X86ISD::VSHLI:
44119 case X86ISD::VSRLI:
44120 case X86ISD::VSRAI: {
44121 SDLoc DL(Op);
44122 SDValue Ext0 =
44123 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44124 SDValue ExtOp =
44125 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44126 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44127 SDValue Insert =
44128 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44129 return TLO.CombineTo(Op, Insert);
44130 }
44131 case X86ISD::VPERMI: {
44132 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44133 // TODO: This should be done in shuffle combining.
44134 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44135 SmallVector<int, 8> Mask;
44136 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44137 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44138 SDLoc DL(Op);
44139 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44140 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44141 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44142 return TLO.CombineTo(Op, Insert);
44143 }
44144 }
44145 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44146 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44147 SDLoc DL(Op);
44148 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44149 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44150 Op.getOperand(1));
44151 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44152 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44153 return TLO.CombineTo(Op, Insert);
44154 }
44155 break;
44156 }
44157 case X86ISD::VPERMV: {
44158 SmallVector<int, 16> Mask;
44159 SmallVector<SDValue, 2> Ops;
44160 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44161 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44162 VT == MVT::v16f32) &&
44163 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44164 // For lane-crossing shuffles, only split in half in case we're still
44165 // referencing higher elements.
44166 unsigned HalfElts = NumElts / 2;
44167 unsigned HalfSize = SizeInBits / 2;
44168 Mask.resize(HalfElts);
44169 if (all_of(Mask,
44170 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44171 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44172 SDLoc DL(Op);
44173 SDValue Ext;
44174 SDValue M =
44175 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44176 SDValue V =
44177 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44178 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44179 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44180 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44181 else {
44182 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44183 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44184 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44185 TLO.DAG.getBitcast(ShufVT, V), M);
44186 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44187 }
44188 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44189 Subtarget, TLO.DAG, DL, SizeInBits);
44190 return TLO.CombineTo(Op, Insert);
44191 }
44192 }
44193 break;
44194 }
44195 case X86ISD::VPERMV3: {
44196 SmallVector<int, 16> Mask;
44197 SmallVector<SDValue, 2> Ops;
44198 if (Subtarget.hasVLX() &&
44199 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44200 // For lane-crossing shuffles, only split in half in case we're still
44201 // referencing higher elements.
44202 unsigned HalfElts = NumElts / 2;
44203 unsigned HalfSize = SizeInBits / 2;
44204 Mask.resize(HalfElts);
44205 if (all_of(Mask, [&](int M) {
44206 return isUndefOrInRange(M, 0, HalfElts) ||
44207 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44208 })) {
44209 // Adjust mask elements for 2nd operand to point to half width.
44210 for (int &M : Mask)
44211 M = (M < NumElts) ? M : (M - HalfElts);
44212 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44213 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44214 SDLoc DL(Op);
44215 SDValue Ext = TLO.DAG.getNode(
44216 Opc, DL, HalfVT,
44217 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44218 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44219 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44220 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44221 Subtarget, TLO.DAG, DL, SizeInBits);
44222 return TLO.CombineTo(Op, Insert);
44223 }
44224 }
44225 break;
44226 }
44227 case X86ISD::VPERM2X128: {
44228 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
44229 SDLoc DL(Op);
44230 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44231 if (LoMask & 0x8)
44232 return TLO.CombineTo(
44233 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44234 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44235 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44236 SDValue ExtOp =
44237 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44238 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44239 SDValue Insert =
44240 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44241 return TLO.CombineTo(Op, Insert);
44242 }
44243 // Conversions.
44244 // TODO: Add more CVT opcodes when we have test coverage.
44245 case X86ISD::CVTTP2UI: {
44246 if (!Subtarget.hasVLX())
44247 break;
44248 [[fallthrough]];
44249 }
44250 case X86ISD::CVTTP2SI: {
44251 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44252 !Subtarget.hasVLX())
44253 break;
44254 [[fallthrough]];
44255 }
44256 case X86ISD::CVTPH2PS: {
44257 SDLoc DL(Op);
44258 unsigned Scale = SizeInBits / ExtSizeInBits;
44259 SDValue SrcOp = Op.getOperand(0);
44260 MVT SrcVT = SrcOp.getSimpleValueType();
44261 unsigned SrcExtSize =
44262 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44263 MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
44264 ExtSizeInBits / VT.getScalarSizeInBits());
44265 SDValue ExtOp = TLO.DAG.getNode(
44266 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44267 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44268 SDValue Insert =
44269 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44270 return TLO.CombineTo(Op, Insert);
44271 }
44272 // Zero upper elements.
44273 case X86ISD::VZEXT_MOVL:
44274 // Variable blend.
44275 case X86ISD::BLENDV:
44276 // Target unary shuffles:
44277 case X86ISD::MOVDDUP:
44278 // Target unary shuffles by immediate:
44279 case X86ISD::PSHUFD:
44280 case X86ISD::PSHUFLW:
44281 case X86ISD::PSHUFHW:
44282 case X86ISD::VPERMILPI:
44283 // (Non-Lane Crossing) Target Shuffles.
44284 case X86ISD::VPERMILPV:
44285 case X86ISD::VPERMIL2:
44286 case X86ISD::PSHUFB:
44287 case X86ISD::UNPCKL:
44288 case X86ISD::UNPCKH:
44289 case X86ISD::BLENDI:
44290 // Integer ops.
44291 case X86ISD::PACKSS:
44292 case X86ISD::PACKUS:
44293 case X86ISD::PCMPEQ:
44294 case X86ISD::PCMPGT:
44295 case X86ISD::PMULUDQ:
44296 case X86ISD::PMULDQ:
44297 case X86ISD::VSHLV:
44298 case X86ISD::VSRLV:
44299 case X86ISD::VSRAV:
44300 // Float ops.
44301 case X86ISD::FMAX:
44302 case X86ISD::FMIN:
44303 case X86ISD::FMAXC:
44304 case X86ISD::FMINC:
44305 case X86ISD::FRSQRT:
44306 case X86ISD::FRCP:
44307 // Horizontal Ops.
44308 case X86ISD::HADD:
44309 case X86ISD::HSUB:
44310 case X86ISD::FHADD:
44311 case X86ISD::FHSUB: {
44312 SDLoc DL(Op);
44313 SmallVector<SDValue, 4> Ops;
44314 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44315 SDValue SrcOp = Op.getOperand(i);
44316 EVT SrcVT = SrcOp.getValueType();
44317 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44318 "Unsupported vector size");
44319 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44320 ExtSizeInBits)
44321 : SrcOp);
44322 }
44323 MVT ExtVT = VT.getSimpleVT();
44324 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44325 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44326 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44327 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44328 SDValue Insert =
44329 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44330 return TLO.CombineTo(Op, Insert);
44331 }
44332 }
44333 }
44334
44335 // For splats, unless we *only* demand the 0'th element,
44336 // stop attempts at simplification here; we aren't going to improve things, and
44337 // this is better than any potential shuffle.
44338 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44339 return false;
44340
44341 // Get target/faux shuffle mask.
44342 APInt OpUndef, OpZero;
44343 SmallVector<int, 64> OpMask;
44344 SmallVector<SDValue, 2> OpInputs;
44345 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44346 OpZero, TLO.DAG, Depth, false))
44347 return false;
44348
44349 // Shuffle inputs must be the same size as the result.
44350 if (OpMask.size() != (unsigned)NumElts ||
44351 llvm::any_of(OpInputs, [VT](SDValue V) {
44352 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44353 !V.getValueType().isVector();
44354 }))
44355 return false;
44356
44357 KnownZero = OpZero;
44358 KnownUndef = OpUndef;
44359
44360 // Check if shuffle mask can be simplified to undef/zero/identity.
44361 int NumSrcs = OpInputs.size();
44362 for (int i = 0; i != NumElts; ++i)
44363 if (!DemandedElts[i])
44364 OpMask[i] = SM_SentinelUndef;
44365
44366 if (isUndefInRange(OpMask, 0, NumElts)) {
44367 KnownUndef.setAllBits();
44368 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44369 }
44370 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44371 KnownZero.setAllBits();
44372 return TLO.CombineTo(
44373 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44374 }
44375 for (int Src = 0; Src != NumSrcs; ++Src)
44376 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44377 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44378
44379 // Attempt to simplify inputs.
44380 for (int Src = 0; Src != NumSrcs; ++Src) {
44381 // TODO: Support inputs of different types.
44382 if (OpInputs[Src].getValueType() != VT)
44383 continue;
44384
44385 int Lo = Src * NumElts;
44386 APInt SrcElts = APInt::getZero(NumElts);
44387 for (int i = 0; i != NumElts; ++i)
44388 if (DemandedElts[i]) {
44389 int M = OpMask[i] - Lo;
44390 if (0 <= M && M < NumElts)
44391 SrcElts.setBit(M);
44392 }
44393
44394 // TODO - Propagate input undef/zero elts.
44395 APInt SrcUndef, SrcZero;
44396 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44397 TLO, Depth + 1))
44398 return true;
44399 }
44400
44401 // If we don't demand all elements, then attempt to combine to a simpler
44402 // shuffle.
44403 // We need to convert the depth to something combineX86ShufflesRecursively
44404 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44405 // to match. This prevents combineX86ShuffleChain from returning a
44406 // combined shuffle that's the same as the original root, causing an
44407 // infinite loop.
44408 if (!DemandedElts.isAllOnes()) {
44409 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44410
44411 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44412 for (int i = 0; i != NumElts; ++i)
44413 if (DemandedElts[i])
44414 DemandedMask[i] = i;
44415
44416 SDValue NewShuffle = combineX86ShufflesRecursively(
44417 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44418 X86::MaxShuffleCombineDepth - Depth,
44419 /*AllowVariableCrossLaneMask=*/true,
44420 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44421 TLO.DAG, SDLoc(Op), Subtarget);
44422 if (NewShuffle)
44423 return TLO.CombineTo(Op, NewShuffle);
44424 }
44425
44426 return false;
44427}
44428
44430 SDValue Op, const APInt &OriginalDemandedBits,
44431 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44432 unsigned Depth) const {
44433 EVT VT = Op.getValueType();
44434 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44435 unsigned Opc = Op.getOpcode();
44436 switch(Opc) {
44437 case X86ISD::VTRUNC: {
44438 KnownBits KnownOp;
44439 SDValue Src = Op.getOperand(0);
44440 MVT SrcVT = Src.getSimpleValueType();
44441
44442 // Simplify the input, using demanded bit information.
44443 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44444 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44445 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44446 return true;
44447 break;
44448 }
44449 case X86ISD::PMULDQ:
44450 case X86ISD::PMULUDQ: {
44451 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
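// For example, only the low 32 bits of each i64 element of the operands affect
// the product, so (pmuludq X, splat(1)) reduces to a zero-extend-in-register of
// X, as done below.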
44452 KnownBits KnownLHS, KnownRHS;
44453 SDValue LHS = Op.getOperand(0);
44454 SDValue RHS = Op.getOperand(1);
44455
44456 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44457 // FIXME: Can we bound this better?
44458 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44459 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44460 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44461
44462 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44463 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44464 DemandedMaskLHS = DemandedMask;
44465 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44466 DemandedMaskRHS = DemandedMask;
44467
44468 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44469 KnownLHS, TLO, Depth + 1))
44470 return true;
44471 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44472 KnownRHS, TLO, Depth + 1))
44473 return true;
44474
44475 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44476 KnownRHS = KnownRHS.trunc(32);
44477 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44478 KnownRHS.getConstant().isOne()) {
44479 SDLoc DL(Op);
44480 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44481 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44482 }
44483
44484 // Aggressively peek through ops to get at the demanded low bits.
44485 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44486 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44487 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44488 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44489 if (DemandedLHS || DemandedRHS) {
44490 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44491 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44492 return TLO.CombineTo(
44493 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44494 }
44495 break;
44496 }
44497 case X86ISD::ANDNP: {
44498 KnownBits Known2;
44499 SDValue Op0 = Op.getOperand(0);
44500 SDValue Op1 = Op.getOperand(1);
44501
44502 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44503 Known, TLO, Depth + 1))
44504 return true;
44505
44506 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44507 OriginalDemandedElts, Known2, TLO, Depth + 1))
44508 return true;
44509
44510 // If the RHS is a constant, see if we can simplify it.
44511 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44512 OriginalDemandedElts, TLO))
44513 return true;
44514
44515 // ANDNP = (~Op0 & Op1);
44516 Known.One &= Known2.Zero;
44517 Known.Zero |= Known2.One;
44518 break;
44519 }
44520 case X86ISD::VSHLI: {
44521 SDValue Op0 = Op.getOperand(0);
44522 SDValue Op1 = Op.getOperand(1);
44523
44524 unsigned ShAmt = Op1->getAsZExtVal();
44525 if (ShAmt >= BitWidth)
44526 break;
44527
44528 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44529
44530 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44531 // single shift. We can do this if the bottom bits (which are shifted
44532 // out) are never demanded.
44533 if (Op0.getOpcode() == X86ISD::VSRLI &&
44534 OriginalDemandedBits.countr_zero() >= ShAmt) {
44535 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44536 if (Shift2Amt < BitWidth) {
44537 int Diff = ShAmt - Shift2Amt;
44538 if (Diff == 0)
44539 return TLO.CombineTo(Op, Op0.getOperand(0));
44540
44541 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44542 SDValue NewShift = TLO.DAG.getNode(
44543 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44544 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44545 return TLO.CombineTo(Op, NewShift);
44546 }
44547 }
44548
44549 // If we are only demanding sign bits then we can use the shift source directly.
44550 unsigned NumSignBits =
44551 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44552 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44553 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44554 return TLO.CombineTo(Op, Op0);
44555
44556 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44557 TLO, Depth + 1))
44558 return true;
44559
44560 Known <<= ShAmt;
44561
44562 // Low bits known zero.
44563 Known.Zero.setLowBits(ShAmt);
44564
44565 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44566 // Attempt to avoid multi-use ops if we don't need anything from them.
44567 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44568 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44569 SDValue NewOp =
44570 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44571 return TLO.CombineTo(Op, NewOp);
44572 }
44573 }
44574 return false;
44575 }
44576 case X86ISD::VSRLI: {
44577 SDValue Op0 = Op.getOperand(0);
44578 SDValue Op1 = Op.getOperand(1);
44579
44580 unsigned ShAmt = Op1->getAsZExtVal();
44581 if (ShAmt >= BitWidth)
44582 break;
44583
44584 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44585
44586 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44587 TLO, Depth + 1))
44588 return true;
44589
44590 Known >>= ShAmt;
44591
44592 // High bits known zero.
44593 Known.Zero.setHighBits(ShAmt);
44594
44595 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44596 // Attempt to avoid multi-use ops if we don't need anything from them.
44597 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44598 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44599 SDValue NewOp =
44600 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44601 return TLO.CombineTo(Op, NewOp);
44602 }
44603 }
44604 return false;
44605 }
44606 case X86ISD::VSRAI: {
44607 SDValue Op0 = Op.getOperand(0);
44608 SDValue Op1 = Op.getOperand(1);
44609
44610 unsigned ShAmt = Op1->getAsZExtVal();
44611 if (ShAmt >= BitWidth)
44612 break;
44613
44614 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44615
44616 // If we only want bits that already match the signbit then we don't need
44617 // to shift.
44618 unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44619 if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
44620 NumHiDemandedBits)
44621 return TLO.CombineTo(Op, Op0);
44622
44623 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44624 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44625 SDValue Op00 = Op0.getOperand(0);
44626 unsigned NumSignBits =
44627 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44628 if (ShAmt < NumSignBits)
44629 return TLO.CombineTo(Op, Op00);
44630 }
44631
44632 // If any of the demanded bits are produced by the sign extension, we also
44633 // demand the input sign bit.
44634 if (OriginalDemandedBits.countl_zero() < ShAmt)
44635 DemandedMask.setSignBit();
44636
44637 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44638 TLO, Depth + 1))
44639 return true;
44640
44641 Known >>= ShAmt;
44642
44643 // If the input sign bit is known to be zero, or if none of the top bits
44644 // are demanded, turn this into an unsigned shift right.
44645 if (Known.Zero[BitWidth - ShAmt - 1] ||
44646 OriginalDemandedBits.countl_zero() >= ShAmt)
44647 return TLO.CombineTo(
44648 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44649
44650 // High bits are known one.
44651 if (Known.One[BitWidth - ShAmt - 1])
44652 Known.One.setHighBits(ShAmt);
44653
44654 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44655 // Attempt to avoid multi-use ops if we don't need anything from them.
44656 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44657 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44658 SDValue NewOp =
44659 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44660 return TLO.CombineTo(Op, NewOp);
44661 }
44662 }
44663 return false;
44664 }
44665 case X86ISD::BLENDI: {
44666 SDValue LHS = Op.getOperand(0);
44667 SDValue RHS = Op.getOperand(1);
44668 APInt Mask = getBLENDIBlendMask(Op);
44669
44670 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44671 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44672 TLO, Depth + 1))
44673 return true;
44674
44675 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44676 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44677 TLO, Depth + 1))
44678 return true;
44679
44680 // Attempt to avoid multi-use ops if we don't need anything from them.
44681 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44682 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44683 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44684 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44685 if (NewLHS || NewRHS) {
44686 NewLHS = NewLHS ? NewLHS : LHS;
44687 NewRHS = NewRHS ? NewRHS : RHS;
44688 return TLO.CombineTo(Op,
44689 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44690 NewLHS, NewRHS, Op.getOperand(2)));
44691 }
44692 break;
44693 }
44694 case X86ISD::BLENDV: {
44695 SDValue Sel = Op.getOperand(0);
44696 SDValue LHS = Op.getOperand(1);
44697 SDValue RHS = Op.getOperand(2);
44698
44699 APInt SignMask = APInt::getSignMask(BitWidth);
44700 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44701 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44702 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44703 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44704 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44705 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44706
44707 if (NewSel || NewLHS || NewRHS) {
44708 NewSel = NewSel ? NewSel : Sel;
44709 NewLHS = NewLHS ? NewLHS : LHS;
44710 NewRHS = NewRHS ? NewRHS : RHS;
44711 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44712 NewSel, NewLHS, NewRHS));
44713 }
44714 break;
44715 }
44716 case X86ISD::PEXTRB:
44717 case X86ISD::PEXTRW: {
44718 SDValue Vec = Op.getOperand(0);
44719 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44720 MVT VecVT = Vec.getSimpleValueType();
44721 unsigned NumVecElts = VecVT.getVectorNumElements();
44722
44723 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44724 unsigned Idx = CIdx->getZExtValue();
44725 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44726
44727 // If we demand no bits from the vector then we must have demanded
44728 // bits from the implicit zext - simplify to zero.
44729 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44730 if (DemandedVecBits == 0)
44731 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44732
44733 APInt KnownUndef, KnownZero;
44734 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44735 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44736 KnownZero, TLO, Depth + 1))
44737 return true;
44738
44739 KnownBits KnownVec;
44740 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44741 KnownVec, TLO, Depth + 1))
44742 return true;
44743
44744 if (SDValue V = SimplifyMultipleUseDemandedBits(
44745 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44746 return TLO.CombineTo(
44747 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44748
44749 Known = KnownVec.zext(BitWidth);
44750 return false;
44751 }
44752 break;
44753 }
44754 case X86ISD::PINSRB:
44755 case X86ISD::PINSRW: {
44756 SDValue Vec = Op.getOperand(0);
44757 SDValue Scl = Op.getOperand(1);
44758 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44759 MVT VecVT = Vec.getSimpleValueType();
44760
44761 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44762 unsigned Idx = CIdx->getZExtValue();
44763 if (!OriginalDemandedElts[Idx])
44764 return TLO.CombineTo(Op, Vec);
44765
44766 KnownBits KnownVec;
44767 APInt DemandedVecElts(OriginalDemandedElts);
44768 DemandedVecElts.clearBit(Idx);
44769 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44770 KnownVec, TLO, Depth + 1))
44771 return true;
44772
44773 KnownBits KnownScl;
44774 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44775 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44776 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44777 return true;
44778
44779 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44780 Known = KnownVec.intersectWith(KnownScl);
44781 return false;
44782 }
44783 break;
44784 }
44785 case X86ISD::PACKSS:
44786 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44787 // sign bit then we can just ask for the source operands' sign bits.
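// For example, movmsk(packsswb(X, Y)) only needs each result lane's sign bit,
// and signed saturation preserves the sign, so only the sign bits of X and Y
// are needed.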
44788 // TODO - add known bits handling.
44789 if (OriginalDemandedBits.isSignMask()) {
44790 APInt DemandedLHS, DemandedRHS;
44791 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44792
44793 KnownBits KnownLHS, KnownRHS;
44794 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44795 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44796 KnownLHS, TLO, Depth + 1))
44797 return true;
44798 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44799 KnownRHS, TLO, Depth + 1))
44800 return true;
44801
44802 // Attempt to avoid multi-use ops if we don't need anything from them.
44803 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44804 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44805 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44806 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44807 if (DemandedOp0 || DemandedOp1) {
44808 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44809 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44810 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44811 }
44812 }
44813 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44814 break;
44815 case X86ISD::VBROADCAST: {
44816 SDValue Src = Op.getOperand(0);
44817 MVT SrcVT = Src.getSimpleValueType();
44818 APInt DemandedElts = APInt::getOneBitSet(
44819 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44820 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44821 TLO, Depth + 1))
44822 return true;
44823 // If we don't need the upper bits, attempt to narrow the broadcast source.
44824 // Don't attempt this on AVX512 as it might affect broadcast folding.
44825 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44826 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44827 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44828 Src->hasOneUse()) {
44829 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44830 SDValue NewSrc =
44831 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44832 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44833 SDValue NewBcst =
44834 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44835 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44836 }
44837 break;
44838 }
44839 case X86ISD::PCMPGT:
44840 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44841 // iff we only need the sign bit then we can use R directly.
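  // Added note (illustrative, not in the original source): lane i of
  // pcmpgt(0, R) is all-ones exactly when R[i] is negative, so its sign bit
  // already matches R[i]'s sign bit and R can be used unchanged.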
44842 if (OriginalDemandedBits.isSignMask() &&
44843 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44844 return TLO.CombineTo(Op, Op.getOperand(1));
44845 break;
44846 case X86ISD::MOVMSK: {
44847 SDValue Src = Op.getOperand(0);
44848 MVT SrcVT = Src.getSimpleValueType();
44849 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44850 unsigned NumElts = SrcVT.getVectorNumElements();
44851
44852 // If we don't need the sign bits at all just return zero.
44853 if (OriginalDemandedBits.countr_zero() >= NumElts)
44854 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44855
44856 // See if we only demand bits from the lower 128-bit vector.
44857 if (SrcVT.is256BitVector() &&
44858 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44859 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44860 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44861 }
44862
44863 // Only demand the vector elements of the sign bits we need.
44864 APInt KnownUndef, KnownZero;
44865 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44866 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44867 TLO, Depth + 1))
44868 return true;
44869
44870 Known.Zero = KnownZero.zext(BitWidth);
44871 Known.Zero.setHighBits(BitWidth - NumElts);
44872
44873 // MOVMSK only uses the MSB from each vector element.
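    // Added note (illustrative, not in the original source): for a v4i32/v4f32
    // source, MOVMSK packs the four sign bits into result bits [3:0]; all
    // higher result bits are always zero.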
44874 KnownBits KnownSrc;
44875 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44876 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44877 Depth + 1))
44878 return true;
44879
44880 if (KnownSrc.One[SrcBits - 1])
44881 Known.One.setLowBits(NumElts);
44882 else if (KnownSrc.Zero[SrcBits - 1])
44883 Known.Zero.setLowBits(NumElts);
44884
44885     // Attempt to avoid multi-use ops if we don't need anything from it.
44886     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44887             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44888 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44889 return false;
44890 }
44891 case X86ISD::TESTP: {
44892 SDValue Op0 = Op.getOperand(0);
44893 SDValue Op1 = Op.getOperand(1);
44894 MVT OpVT = Op0.getSimpleValueType();
44895 assert((OpVT.getVectorElementType() == MVT::f32 ||
44896 OpVT.getVectorElementType() == MVT::f64) &&
44897 "Illegal vector type for X86ISD::TESTP");
44898
44899 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
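    // Added note (illustrative, not in the original source): TESTPS/TESTPD set
    // ZF/CF from the AND / ANDN of just the packed sign bits of the two
    // operands, so no other bits of either operand can affect the result.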
44900 KnownBits KnownSrc;
44901 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44902 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44903 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44904 AssumeSingleUse) ||
44905 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44906 AssumeSingleUse);
44907 }
44908 case X86ISD::CMOV: {
44909 KnownBits Known2;
44910 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44911 OriginalDemandedElts, Known2, TLO, Depth + 1))
44912 return true;
44913 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44914 OriginalDemandedElts, Known, TLO, Depth + 1))
44915 return true;
44916
44917 // Only known if known in both the LHS and RHS.
44918 Known = Known.intersectWith(Known2);
44919 return false;
44920 }
44921 case X86ISD::BEXTR:
44922 case X86ISD::BEXTRI: {
44923 SDValue Op0 = Op.getOperand(0);
44924 SDValue Op1 = Op.getOperand(1);
44925
44926 // Only bottom 16-bits of the control bits are required.
44927 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44928 // NOTE: SimplifyDemandedBits won't do this for constants.
44929 uint64_t Val1 = Cst1->getZExtValue();
44930 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44931 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44932 SDLoc DL(Op);
44933 return TLO.CombineTo(
44934 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44935 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44936 }
44937
44938 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44939 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
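      // Added note (illustrative, not in the original source): e.g. a control
      // value of 0x0404 encodes Shift = 4 (bits[7:0]) and Length = 4
      // (bits[15:8]), so BEXTR extracts bits [7:4] of Op0 into the low bits of
      // the result.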
44940
44941 // If the length is 0, the result is 0.
44942 if (Length == 0) {
44943 Known.setAllZero();
44944 return false;
44945 }
44946
44947 if ((Shift + Length) <= BitWidth) {
44948 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44949 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44950 return true;
44951
44952 Known = Known.extractBits(Length, Shift);
44953 Known = Known.zextOrTrunc(BitWidth);
44954 return false;
44955 }
44956 } else {
44957 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44958 KnownBits Known1;
44959 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44960 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44961 return true;
44962
44963 // If the length is 0, replace with 0.
44964 KnownBits LengthBits = Known1.extractBits(8, 8);
44965 if (LengthBits.isZero())
44966 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44967 }
44968
44969 break;
44970 }
44971 case X86ISD::PDEP: {
44972 SDValue Op0 = Op.getOperand(0);
44973 SDValue Op1 = Op.getOperand(1);
44974
44975 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44976 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44977
44978     // If the demanded bits have leading zeroes, we don't demand those from the
44979     // mask.
44980 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44981 return true;
44982
44983 // The number of possible 1s in the mask determines the number of LSBs of
44984 // operand 0 used. Undemanded bits from the mask don't matter so filter
44985 // them before counting.
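    // Added note (illustrative, not in the original source): PDEP deposits the
    // low bits of operand 0 into the set-bit positions of the mask, e.g.
    // PDEP(0b0011, 0b1010) == 0b1010, so at most popcount(mask) low bits of
    // operand 0 are ever read.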
44986 KnownBits Known2;
44987 uint64_t Count = (~Known.Zero & LoMask).popcount();
44988 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44989 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44990 return true;
44991
44992 // Zeroes are retained from the mask, but not ones.
44993 Known.One.clearAllBits();
44994 // The result will have at least as many trailing zeros as the non-mask
44995 // operand since bits can only map to the same or higher bit position.
44996 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44997 return false;
44998 }
44999 case X86ISD::VPMADD52L:
45000 case X86ISD::VPMADD52H: {
45001 KnownBits KnownOp0, KnownOp1, KnownOp2;
45002 SDValue Op0 = Op.getOperand(0);
45003 SDValue Op1 = Op.getOperand(1);
45004 SDValue Op2 = Op.getOperand(2);
45005 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45006 // operand 2).
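    // Added note (illustrative, not in the original source): VPMADD52L forms
    // the 104-bit product of the low 52 bits of each element of operands 0 and
    // 1 and adds the low 52 bits of that product to operand 2; VPMADD52H adds
    // bits [103:52] instead. Either way only bits [51:0] of operands 0/1
    // contribute.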
45007 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45008 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45009 TLO, Depth + 1))
45010 return true;
45011
45012 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45013 TLO, Depth + 1))
45014 return true;
45015
45016 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45017 KnownOp2, TLO, Depth + 1))
45018 return true;
45019
45020 KnownBits KnownMul;
45021 KnownOp0 = KnownOp0.trunc(52);
45022 KnownOp1 = KnownOp1.trunc(52);
45023 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45024 : KnownBits::mulhu(KnownOp0, KnownOp1);
45025 KnownMul = KnownMul.zext(64);
45026
45027 // lo/hi(X * Y) + Z --> C + Z
45028 if (KnownMul.isConstant()) {
45029 SDLoc DL(Op);
45030 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45031 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45032 }
45033
45034 Known = KnownBits::add(KnownMul, KnownOp2);
45035 return false;
45036 }
45037 }
45038
45039   return TargetLowering::SimplifyDemandedBitsForTargetNode(
45040       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45041}
45042
45043 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45044     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45045 SelectionDAG &DAG, unsigned Depth) const {
45046 int NumElts = DemandedElts.getBitWidth();
45047 unsigned Opc = Op.getOpcode();
45048 EVT VT = Op.getValueType();
45049
45050 switch (Opc) {
45051 case X86ISD::PINSRB:
45052 case X86ISD::PINSRW: {
45053 // If we don't demand the inserted element, return the base vector.
45054 SDValue Vec = Op.getOperand(0);
45055 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45056 MVT VecVT = Vec.getSimpleValueType();
45057 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45058 !DemandedElts[CIdx->getZExtValue()])
45059 return Vec;
45060 break;
45061 }
45062 case X86ISD::VSHLI: {
45063 // If we are only demanding sign bits then we can use the shift source
45064 // directly.
45065 SDValue Op0 = Op.getOperand(0);
45066 unsigned ShAmt = Op.getConstantOperandVal(1);
45067 unsigned BitWidth = DemandedBits.getBitWidth();
45068 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45069 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45070 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45071 return Op0;
45072 break;
45073 }
45074 case X86ISD::VSRAI:
45075 // iff we only need the sign bit then we can use the source directly.
45076 // TODO: generalize where we only demand extended signbits.
45077 if (DemandedBits.isSignMask())
45078 return Op.getOperand(0);
45079 break;
45080 case X86ISD::PCMPGT:
45081 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45082 // iff we only need the sign bit then we can use R directly.
45083 if (DemandedBits.isSignMask() &&
45084 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45085 return Op.getOperand(1);
45086 break;
45087 case X86ISD::BLENDV: {
45088 // BLENDV: Cond (MSB) ? LHS : RHS
45089 SDValue Cond = Op.getOperand(0);
45090 SDValue LHS = Op.getOperand(1);
45091 SDValue RHS = Op.getOperand(2);
45092
45093 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45094 if (CondKnown.isNegative())
45095 return LHS;
45096 if (CondKnown.isNonNegative())
45097 return RHS;
45098 break;
45099 }
45100 case X86ISD::ANDNP: {
45101 // ANDNP = (~LHS & RHS);
45102 SDValue LHS = Op.getOperand(0);
45103 SDValue RHS = Op.getOperand(1);
45104
45105 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45106 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45107
45108 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45109 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45110 // this context, so return RHS.
45111 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45112 return RHS;
45113 break;
45114 }
45115 }
45116
45117 APInt ShuffleUndef, ShuffleZero;
45118 SmallVector<int, 16> ShuffleMask;
45119   SmallVector<SDValue, 16> ShuffleOps;
45120   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45121 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45122 // If all the demanded elts are from one operand and are inline,
45123 // then we can use the operand directly.
45124 int NumOps = ShuffleOps.size();
45125 if (ShuffleMask.size() == (unsigned)NumElts &&
45126         llvm::all_of(ShuffleOps, [VT](SDValue V) {
45127           return VT.getSizeInBits() == V.getValueSizeInBits();
45128 })) {
45129
45130 if (DemandedElts.isSubsetOf(ShuffleUndef))
45131 return DAG.getUNDEF(VT);
45132 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45133 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45134
45135 // Bitmask that indicates which ops have only been accessed 'inline'.
45136 APInt IdentityOp = APInt::getAllOnes(NumOps);
45137 for (int i = 0; i != NumElts; ++i) {
45138 int M = ShuffleMask[i];
45139 if (!DemandedElts[i] || ShuffleUndef[i])
45140 continue;
45141 int OpIdx = M / NumElts;
45142 int EltIdx = M % NumElts;
45143 if (M < 0 || EltIdx != i) {
45144 IdentityOp.clearAllBits();
45145 break;
45146 }
45147 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45148 if (IdentityOp == 0)
45149 break;
45150 }
45151 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45152 "Multiple identity shuffles detected");
45153
45154 if (IdentityOp != 0)
45155 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45156 }
45157 }
45158
45159   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45160       Op, DemandedBits, DemandedElts, DAG, Depth);
45161}
45162
45163 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45164     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45165 bool PoisonOnly, unsigned Depth) const {
45166 unsigned NumElts = DemandedElts.getBitWidth();
45167
45168 switch (Op.getOpcode()) {
45170 case X86ISD::Wrapper:
45171 case X86ISD::WrapperRIP:
45172 return true;
45173 case X86ISD::PACKSS:
45174 case X86ISD::PACKUS: {
45175 APInt DemandedLHS, DemandedRHS;
45176 getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
45177 DemandedRHS);
45178 return (!DemandedLHS ||
45179 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
45180 PoisonOnly, Depth + 1)) &&
45181 (!DemandedRHS ||
45182 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
45183 PoisonOnly, Depth + 1));
45184 }
45185 case X86ISD::INSERTPS:
45186 case X86ISD::BLENDI:
45187 case X86ISD::PSHUFB:
45188 case X86ISD::PSHUFD:
45189 case X86ISD::UNPCKL:
45190 case X86ISD::UNPCKH:
45191 case X86ISD::VPERMILPV:
45192 case X86ISD::VPERMILPI:
45193 case X86ISD::VPERMV:
45194 case X86ISD::VPERMV3: {
45195     SmallVector<SDValue, 2> Ops;
45196     SmallVector<int, 8> Mask;
45197     if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45198 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45199 APInt::getZero(NumElts));
45200 for (auto M : enumerate(Mask)) {
45201 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45202 continue;
45203 if (M.value() == SM_SentinelUndef)
45204 return false;
45205 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45206 "Shuffle mask index out of range");
45207 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45208 }
45209 for (auto Op : enumerate(Ops))
45210 if (!DemandedSrcElts[Op.index()].isZero() &&
45211             !DAG.isGuaranteedNotToBeUndefOrPoison(
45212                 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45213 return false;
45214 return true;
45215 }
45216 break;
45217 }
45218 }
45219   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45220       Op, DemandedElts, DAG, PoisonOnly, Depth);
45221}
45222
45223 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45224     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45225 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45226
45227 switch (Op.getOpcode()) {
45228 // SSE bit logic.
45229 case X86ISD::FAND:
45230 case X86ISD::FOR:
45231 case X86ISD::FXOR:
45232 case X86ISD::FANDN:
45233 case X86ISD::ANDNP:
45234 case X86ISD::VPTERNLOG:
45235 return false;
45236 // SSE vector insert/extracts use modulo indices.
45237 case X86ISD::PINSRB:
45238 case X86ISD::PINSRW:
45239 case X86ISD::PEXTRB:
45240 case X86ISD::PEXTRW:
45241 return false;
45242 // SSE vector multiplies are either inbounds or saturate.
45243 case X86ISD::VPMADDUBSW:
45244 case X86ISD::VPMADDWD:
45245 return false;
45246 // SSE vector shifts handle out of bounds shift amounts.
45247 case X86ISD::VSHLI:
45248 case X86ISD::VSRLI:
45249 case X86ISD::VSRAI:
45250 return false;
45251 // SSE blends.
45252 case X86ISD::BLENDI:
45253 case X86ISD::BLENDV:
45254 return false;
45255 // SSE packs.
45256 case X86ISD::PACKSS:
45257 case X86ISD::PACKUS:
45258 return false;
45259 // SSE target shuffles.
45260 case X86ISD::INSERTPS:
45261 case X86ISD::PSHUFB:
45262 case X86ISD::PSHUFD:
45263 case X86ISD::UNPCKL:
45264 case X86ISD::UNPCKH:
45265 case X86ISD::VPERMILPV:
45266 case X86ISD::VPERMILPI:
45267 case X86ISD::VPERMV:
45268 case X86ISD::VPERMV3:
45269 return false;
45270 // SSE comparisons handle all icmp/fcmp cases.
45271 // TODO: Add CMPM/MM with test coverage.
45272 case X86ISD::CMPP:
45273 case X86ISD::PCMPEQ:
45274 case X86ISD::PCMPGT:
45275 return false;
45276 // SSE signbit extraction.
45277 case X86ISD::MOVMSK:
45278 return false;
45279 // GFNI instructions.
45280   case X86ISD::GF2P8AFFINEINVQB:
45281   case X86ISD::GF2P8AFFINEQB:
45282   case X86ISD::GF2P8MULB:
45283 return false;
45284   case ISD::INTRINSIC_WO_CHAIN:
45285     switch (Op->getConstantOperandVal(0)) {
45286 case Intrinsic::x86_sse2_pmadd_wd:
45287 case Intrinsic::x86_avx2_pmadd_wd:
45288 case Intrinsic::x86_avx512_pmaddw_d_512:
45289 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45290 case Intrinsic::x86_avx2_pmadd_ub_sw:
45291 case Intrinsic::x86_avx512_pmaddubs_w_512:
45292 return false;
45293 case Intrinsic::x86_avx512_vpermi2var_d_128:
45294 case Intrinsic::x86_avx512_vpermi2var_d_256:
45295 case Intrinsic::x86_avx512_vpermi2var_d_512:
45296 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45297 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45298 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45299 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45300 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45301 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45302 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45303 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45304 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45305 case Intrinsic::x86_avx512_vpermi2var_q_128:
45306 case Intrinsic::x86_avx512_vpermi2var_q_256:
45307 case Intrinsic::x86_avx512_vpermi2var_q_512:
45308 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45309 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45310 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45311 return false;
45312 }
45313 }
45314   return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45315       Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45316}
45317
45318 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45319                                                   const APInt &DemandedElts,
45320 APInt &UndefElts,
45321 const SelectionDAG &DAG,
45322 unsigned Depth) const {
45323 unsigned NumElts = DemandedElts.getBitWidth();
45324 unsigned Opc = Op.getOpcode();
45325
45326 switch (Opc) {
45327 case X86ISD::VBROADCAST:
45328   case X86ISD::VBROADCAST_LOAD:
45329     UndefElts = APInt::getZero(NumElts);
45330 return true;
45331 }
45332
45333 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45334 DAG, Depth);
45335}
45336
45337// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45338// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
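// Added note (illustrative, not in the original source): e.g. a v8i1 produced
// by (setcc v8i32, v8i32) reports a 256-bit source vector here.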
45339static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45340 bool AllowTruncate, unsigned Depth) {
45341 // Limit recursion.
45342   if (Depth >= SelectionDAG::MaxRecursionDepth)
45343     return false;
45344 switch (Src.getOpcode()) {
45345 case ISD::TRUNCATE:
45346 if (!AllowTruncate)
45347 return false;
45348 [[fallthrough]];
45349 case ISD::SETCC:
45350 return Src.getOperand(0).getValueSizeInBits() == Size;
45351 case ISD::FREEZE:
45352 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45353 Depth + 1);
45354 case ISD::AND:
45355 case ISD::XOR:
45356 case ISD::OR:
45357 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45358 Depth + 1) &&
45359 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45360 Depth + 1);
45361 case ISD::SELECT:
45362 case ISD::VSELECT:
45363 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45364 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45365 Depth + 1) &&
45366 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45367 Depth + 1);
45368 case ISD::BUILD_VECTOR:
45369 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45370 ISD::isBuildVectorAllOnes(Src.getNode());
45371 }
45372 return false;
45373}
45374
45375// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45376static unsigned getAltBitOpcode(unsigned Opcode) {
45377 switch(Opcode) {
45378 // clang-format off
45379 case ISD::AND: return X86ISD::FAND;
45380 case ISD::OR: return X86ISD::FOR;
45381 case ISD::XOR: return X86ISD::FXOR;
45382 case X86ISD::ANDNP: return X86ISD::FANDN;
45383 // clang-format on
45384 }
45385 llvm_unreachable("Unknown bitwise opcode");
45386}
45387
45388// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45389 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45390                                           const SDLoc &DL) {
45391 EVT SrcVT = Src.getValueType();
45392 if (SrcVT != MVT::v4i1)
45393 return SDValue();
45394
45395 switch (Src.getOpcode()) {
45396 case ISD::SETCC:
45397 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45398 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45399 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45400 SDValue Op0 = Src.getOperand(0);
45401 if (ISD::isNormalLoad(Op0.getNode()))
45402 return DAG.getBitcast(MVT::v4f32, Op0);
45403 if (Op0.getOpcode() == ISD::BITCAST &&
45404 Op0.getOperand(0).getValueType() == MVT::v4f32)
45405 return Op0.getOperand(0);
45406 }
45407 break;
45408 case ISD::AND:
45409 case ISD::XOR:
45410 case ISD::OR: {
45411 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45412 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45413 if (Op0 && Op1)
45414 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45415 Op1);
45416 break;
45417 }
45418 }
45419 return SDValue();
45420}
45421
45422// Helper to push sign extension of vXi1 SETCC result through bitops.
45423 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45424                                           SDValue Src, const SDLoc &DL) {
45425 switch (Src.getOpcode()) {
45426 case ISD::SETCC:
45427 case ISD::FREEZE:
45428 case ISD::TRUNCATE:
45429 case ISD::BUILD_VECTOR:
45430 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45431 case ISD::AND:
45432 case ISD::XOR:
45433 case ISD::OR:
45434 return DAG.getNode(
45435 Src.getOpcode(), DL, SExtVT,
45436 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45437 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45438 case ISD::SELECT:
45439 case ISD::VSELECT:
45440 return DAG.getSelect(
45441 DL, SExtVT, Src.getOperand(0),
45442 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45443 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45444 }
45445 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45446}
45447
45448// Try to match patterns such as
45449// (i16 bitcast (v16i1 x))
45450// ->
45451// (i16 movmsk (16i8 sext (v16i1 x)))
45452// before the illegal vector is scalarized on subtargets that don't have legal
45453// vxi1 types.
45454 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45455                                   const SDLoc &DL,
45456 const X86Subtarget &Subtarget) {
45457 EVT SrcVT = Src.getValueType();
45458 if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
45459 SrcVT.getScalarType() != MVT::i1)
45460 return SDValue();
45461
45462 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45463 // legalization destroys the v4i32 type.
45464 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45465 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45466 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45467 DAG.getBitcast(MVT::v4f32, V));
45468 return DAG.getZExtOrTrunc(V, DL, VT);
45469 }
45470 }
45471
45472 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45473 // movmskb even with avx512. This will be better than truncating to vXi1 and
45474 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45475 // vpcmpeqb/vpcmpgtb.
45476 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45477 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45478 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45479 Src.getOperand(0).getValueType() == MVT::v64i8);
45480
45481 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45482 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45483 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45484 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45485 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45486 EVT CmpVT = Src.getOperand(0).getValueType();
45487 EVT EltVT = CmpVT.getVectorElementType();
45488 if (CmpVT.getSizeInBits() <= 256 &&
45489 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45490 PreferMovMsk = true;
45491 }
45492
45493 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45494 // MOVMSK is supported in SSE2 or later.
45495 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45496 return SDValue();
45497
45498 // If the upper ops of a concatenation are undef, then try to bitcast the
45499 // lower op and extend.
45500 SmallVector<SDValue, 4> SubSrcOps;
45501 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45502 SubSrcOps.size() >= 2) {
45503 SDValue LowerOp = SubSrcOps[0];
45504 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45505 if (LowerOp.getOpcode() == ISD::SETCC &&
45506 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45507 EVT SubVT = VT.getIntegerVT(
45508 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45509 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45510 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45511 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45512 }
45513 }
45514 }
45515
45516 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45517 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45518 // v8i16 and v16i16.
45519 // For these two cases, we can shuffle the upper element bytes to a
45520 // consecutive sequence at the start of the vector and treat the results as
45521 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45522 // for v16i16 this is not the case, because the shuffle is expensive, so we
45523 // avoid sign-extending to this type entirely.
45524 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45525 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45526 MVT SExtVT;
45527 bool PropagateSExt = false;
45528 switch (SrcVT.getSimpleVT().SimpleTy) {
45529 default:
45530 return SDValue();
45531 case MVT::v2i1:
45532 SExtVT = MVT::v2i64;
45533 break;
45534 case MVT::v4i1:
45535 SExtVT = MVT::v4i32;
45536 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45537 // sign-extend to a 256-bit operation to avoid truncation.
45538 if (Subtarget.hasAVX() &&
45539 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45540 SExtVT = MVT::v4i64;
45541 PropagateSExt = true;
45542 }
45543 break;
45544 case MVT::v8i1:
45545 SExtVT = MVT::v8i16;
45546 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45547 // sign-extend to a 256-bit operation to match the compare.
45548 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45549 // 256-bit because the shuffle is cheaper than sign extending the result of
45550 // the compare.
45551 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45552 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45553 SExtVT = MVT::v8i32;
45554 PropagateSExt = true;
45555 }
45556 break;
45557 case MVT::v16i1:
45558 SExtVT = MVT::v16i8;
45559 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45560 // it is not profitable to sign-extend to 256-bit because this will
45561 // require an extra cross-lane shuffle which is more expensive than
45562 // truncating the result of the compare to 128-bits.
45563 break;
45564 case MVT::v32i1:
45565 SExtVT = MVT::v32i8;
45566 break;
45567 case MVT::v64i1:
45568 // If we have AVX512F, but not AVX512BW and the input is truncated from
45569 // v64i8 checked earlier. Then split the input and make two pmovmskbs.
45570 if (Subtarget.hasAVX512()) {
45571 if (Subtarget.hasBWI())
45572 return SDValue();
45573 SExtVT = MVT::v64i8;
45574 break;
45575 }
45576 // Split if this is a <64 x i8> comparison result.
45577 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45578 SExtVT = MVT::v64i8;
45579 break;
45580 }
45581 return SDValue();
45582 };
45583
45584 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45585 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45586
45587 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45588 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45589 } else {
45590 if (SExtVT == MVT::v8i16) {
45591 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45592 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45593 }
45594 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45595 }
45596
45597 EVT IntVT =
45598       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45599   V = DAG.getZExtOrTrunc(V, DL, IntVT);
45600 return DAG.getBitcast(VT, V);
45601}
45602
45603// Convert a vXi1 constant build vector to the same width scalar integer.
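// Added note (illustrative, not in the original source): e.g. a constant
// v4i1 <1,0,1,1> becomes the i4 value 0b1101 (element 0 maps to bit 0).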
45604 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45605   EVT SrcVT = Op.getValueType();
45606 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45607 "Expected a vXi1 vector");
45608   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
45609          "Expected a constant build vector");
45610
45611 APInt Imm(SrcVT.getVectorNumElements(), 0);
45612 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45613 SDValue In = Op.getOperand(Idx);
45614 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45615 Imm.setBit(Idx);
45616 }
45617 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45618 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45619}
45620
45621 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45622                                            TargetLowering::DAGCombinerInfo &DCI,
45623                                            const X86Subtarget &Subtarget) {
45624 using namespace SDPatternMatch;
45625 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45626
45627 if (!DCI.isBeforeLegalizeOps())
45628 return SDValue();
45629
45630 // Only do this if we have k-registers.
45631 if (!Subtarget.hasAVX512())
45632 return SDValue();
45633
45634 EVT DstVT = N->getValueType(0);
45635 SDValue Op = N->getOperand(0);
45636 EVT SrcVT = Op.getValueType();
45637
45638 // Make sure we have a bitcast between mask registers and a scalar type.
45639 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45640 DstVT.isScalarInteger()) &&
45641 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45642 SrcVT.isScalarInteger()))
45643 return SDValue();
45644
45645 SDValue LHS, RHS;
45646
45647 // Look for logic ops.
45648   if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS)))))
45649     return SDValue();
45650
45651 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45652 // least one of the getBitcast() will fold away).
45653 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45654       sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45655     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45656 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45657
45658 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45659 // Most of these have to move a constant from the scalar domain anyway.
45660   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45661     RHS = combinevXi1ConstantToInteger(RHS, DAG);
45662     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45663 DAG.getBitcast(DstVT, LHS), RHS);
45664 }
45665
45666 return SDValue();
45667}
45668
45669 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45670                                     const X86Subtarget &Subtarget) {
45671 SDLoc DL(BV);
45672 unsigned NumElts = BV->getNumOperands();
45673 SDValue Splat = BV->getSplatValue();
45674
45675 // Build MMX element from integer GPR or SSE float values.
45676 auto CreateMMXElement = [&](SDValue V) {
45677 if (V.isUndef())
45678 return DAG.getUNDEF(MVT::x86mmx);
45679 if (V.getValueType().isFloatingPoint()) {
45680 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45681 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45682 V = DAG.getBitcast(MVT::v2i64, V);
45683 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45684 }
45685 V = DAG.getBitcast(MVT::i32, V);
45686 } else {
45687 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45688 }
45689 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45690 };
45691
45692 // Convert build vector ops to MMX data in the bottom elements.
45693   SmallVector<SDValue, 8> Ops;
45694
45695 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45696
45697 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45698 if (Splat) {
45699 if (Splat.isUndef())
45700 return DAG.getUNDEF(MVT::x86mmx);
45701
45702 Splat = CreateMMXElement(Splat);
45703
45704 if (Subtarget.hasSSE1()) {
45705 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45706 if (NumElts == 8)
45707 Splat = DAG.getNode(
45708 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45709 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45710 TLI.getPointerTy(DAG.getDataLayout())),
45711 Splat, Splat);
45712
45713 // Use PSHUFW to repeat 16-bit elements.
45714 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45715 return DAG.getNode(
45716 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45717 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45718 TLI.getPointerTy(DAG.getDataLayout())),
45719 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45720 }
45721 Ops.append(NumElts, Splat);
45722 } else {
45723 for (unsigned i = 0; i != NumElts; ++i)
45724 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45725 }
45726
45727 // Use tree of PUNPCKLs to build up general MMX vector.
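  // Added note (illustrative, not in the original source): e.g. 8 byte
  // elements are merged pairwise with punpcklbw (8 -> 4 values), then
  // punpcklwd (4 -> 2), then punpckldq (2 -> 1).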
45728 while (Ops.size() > 1) {
45729 unsigned NumOps = Ops.size();
45730 unsigned IntrinOp =
45731 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45732 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45733 : Intrinsic::x86_mmx_punpcklbw));
45734 SDValue Intrin = DAG.getTargetConstant(
45735 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45736 for (unsigned i = 0; i != NumOps; i += 2)
45737 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45738 Ops[i], Ops[i + 1]);
45739 Ops.resize(NumOps / 2);
45740 }
45741
45742 return Ops[0];
45743}
45744
45745// Recursive function that attempts to find if a bool vector node was originally
45746// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45747// integer. If so, replace the scalar ops with bool vector equivalents back down
45748// the chain.
45749 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45750                                           SelectionDAG &DAG,
45751 const X86Subtarget &Subtarget,
45752 unsigned Depth = 0) {
45753   if (Depth >= SelectionDAG::MaxRecursionDepth)
45754     return SDValue(); // Limit search depth.
45755
45756 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45757 unsigned Opc = V.getOpcode();
45758 switch (Opc) {
45759 case ISD::BITCAST: {
45760 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45761 SDValue Src = V.getOperand(0);
45762 EVT SrcVT = Src.getValueType();
45763 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45764 return DAG.getBitcast(VT, Src);
45765 break;
45766 }
45767 case ISD::Constant: {
45768 auto *C = cast<ConstantSDNode>(V);
45769 if (C->isZero())
45770 return DAG.getConstant(0, DL, VT);
45771 if (C->isAllOnes())
45772 return DAG.getAllOnesConstant(DL, VT);
45773 break;
45774 }
45775 case ISD::TRUNCATE: {
45776 // If we find a suitable source, a truncated scalar becomes a subvector.
45777 SDValue Src = V.getOperand(0);
45778 EVT NewSrcVT =
45779 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45780 if (TLI.isTypeLegal(NewSrcVT))
45781 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45782 Subtarget, Depth + 1))
45783 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45784 DAG.getVectorIdxConstant(0, DL));
45785 break;
45786 }
45787 case ISD::ANY_EXTEND:
45788 case ISD::ZERO_EXTEND: {
45789 // If we find a suitable source, an extended scalar becomes a subvector.
45790 SDValue Src = V.getOperand(0);
45791 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45792 Src.getScalarValueSizeInBits());
45793 if (TLI.isTypeLegal(NewSrcVT))
45794 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45795 Subtarget, Depth + 1))
45796 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45797 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45798 : DAG.getConstant(0, DL, VT),
45799 N0, DAG.getVectorIdxConstant(0, DL));
45800 break;
45801 }
45802 case ISD::OR:
45803 case ISD::XOR: {
45804 // If we find suitable sources, we can just move the op to the vector
45805 // domain.
45806 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45807 Subtarget, Depth + 1))
45808 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45809 Subtarget, Depth + 1))
45810 return DAG.getNode(Opc, DL, VT, N0, N1);
45811 break;
45812 }
45813 case ISD::SHL: {
45814 // If we find a suitable source, a SHL becomes a KSHIFTL.
45815 SDValue Src0 = V.getOperand(0);
45816 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45817 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45818 break;
45819
45820 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45821 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45822 Depth + 1))
45823 return DAG.getNode(
45824 X86ISD::KSHIFTL, DL, VT, N0,
45825 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45826 break;
45827 }
45828 }
45829
45830 // Does the inner bitcast already exist?
45831 if (Depth > 0)
45832 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45833 return SDValue(Alt, 0);
45834
45835 return SDValue();
45836}
45837
45838 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45839                               TargetLowering::DAGCombinerInfo &DCI,
45840                               const X86Subtarget &Subtarget) {
45841 SDValue N0 = N->getOperand(0);
45842 EVT VT = N->getValueType(0);
45843 EVT SrcVT = N0.getValueType();
45844 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45845
45846 // Try to match patterns such as
45847 // (i16 bitcast (v16i1 x))
45848 // ->
45849 // (i16 movmsk (16i8 sext (v16i1 x)))
45850 // before the setcc result is scalarized on subtargets that don't have legal
45851 // vxi1 types.
45852 if (DCI.isBeforeLegalize()) {
45853 SDLoc dl(N);
45854 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45855 return V;
45856
45857 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45858 // type, widen both sides to avoid a trip through memory.
45859 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45860 Subtarget.hasAVX512()) {
45861 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45862 N0 = DAG.getBitcast(MVT::v8i1, N0);
45863 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45864 DAG.getVectorIdxConstant(0, dl));
45865 }
45866
45867 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45868 // type, widen both sides to avoid a trip through memory.
45869 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45870 Subtarget.hasAVX512()) {
45871 // Use zeros for the widening if we already have some zeroes. This can
45872 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45873 // stream of this.
45874 // FIXME: It might make sense to detect a concat_vectors with a mix of
45875 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45876 // a separate combine. What we can't do is canonicalize the operands of
45877 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45878 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45879 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45880 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45881 SrcVT = LastOp.getValueType();
45882 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45883           SmallVector<SDValue, 4> Ops(N0->ops());
45884           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45885 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45886 N0 = DAG.getBitcast(MVT::i8, N0);
45887 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45888 }
45889 }
45890
45891 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45892 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45893 Ops[0] = N0;
45894 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45895 N0 = DAG.getBitcast(MVT::i8, N0);
45896 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45897 }
45898 } else if (DCI.isAfterLegalizeDAG()) {
45899 // If we're bitcasting from iX to vXi1, see if the integer originally
45900 // began as a vXi1 and whether we can remove the bitcast entirely.
45901 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45902 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45903 if (SDValue V =
45904 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45905 return V;
45906 }
45907 }
45908
45909 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45910 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45911 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45912 // we can help with known bits propagation from the vXi1 domain to the
45913 // scalar domain.
45914 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45915 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45916 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45917       isNullConstant(N0.getOperand(1)))
45918     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45919 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45920
45921 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45922 // and the vbroadcast_load are both integer or both fp. In some cases this
45923 // will remove the bitcast entirely.
45924 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45925 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45926 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45927 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45928 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45929       // Don't swap i8/i16 since we don't have fp types of that size.
45930 if (MemSize >= 32) {
45931 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45932 : MVT::getIntegerVT(MemSize);
45933 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45934 : MVT::getIntegerVT(SrcVTSize);
45935 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45936
45937 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45938 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45939 SDValue ResNode =
45940           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45941                                   MemVT, BCast->getMemOperand());
45942 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45943 return DAG.getBitcast(VT, ResNode);
45944 }
45945 }
45946
45947 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45948 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45949 SDValue Src = peekThroughTruncates(N0);
45950 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45951 Src.getOperand(0).getValueSizeInBits() == 128 &&
45952 isNullConstant(Src.getOperand(1))) {
45953 SDLoc DL(N);
45954 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45955 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45956 DAG.getVectorIdxConstant(0, DL));
45957 }
45958 }
45959
45960 // Since MMX types are special and don't usually play with other vector types,
45961 // it's better to handle them early to be sure we emit efficient code by
45962 // avoiding store-load conversions.
45963 if (VT == MVT::x86mmx) {
45964 // Detect MMX constant vectors.
45965 APInt UndefElts;
45966 SmallVector<APInt, 1> EltBits;
45967 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45968 /*AllowWholeUndefs*/ true,
45969 /*AllowPartialUndefs*/ true)) {
45970 SDLoc DL(N0);
45971 // Handle zero-extension of i32 with MOVD.
45972 if (EltBits[0].countl_zero() >= 32)
45973 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45974 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45975 // Else, bitcast to a double.
45976 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45977 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45978 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45979 }
45980
45981 // Detect bitcasts to x86mmx low word.
45982 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45983 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45984 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45985 bool LowUndef = true, AllUndefOrZero = true;
45986 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45987 SDValue Op = N0.getOperand(i);
45988 LowUndef &= Op.isUndef() || (i >= e/2);
45989 AllUndefOrZero &= isNullConstantOrUndef(Op);
45990 }
45991 if (AllUndefOrZero) {
45992 SDValue N00 = N0.getOperand(0);
45993 SDLoc dl(N00);
45994 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45995 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45996 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45997 }
45998 }
45999
46000 // Detect bitcasts of 64-bit build vectors and convert to a
46001 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
46002 // lowest element.
46003 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
46004 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
46005 SrcVT == MVT::v8i8))
46006 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
46007
46008 // Detect bitcasts between element or subvector extraction to x86mmx.
46009 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
46010          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
46011         isNullConstant(N0.getOperand(1))) {
46012 SDValue N00 = N0.getOperand(0);
46013 if (N00.getValueType().is128BitVector())
46014 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
46015 DAG.getBitcast(MVT::v2i64, N00));
46016 }
46017
46018 // Detect bitcasts from FP_TO_SINT to x86mmx.
46019 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46020 SDLoc DL(N0);
46021 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46022 DAG.getUNDEF(MVT::v2i32));
46023 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46024 DAG.getBitcast(MVT::v2i64, Res));
46025 }
46026 }
46027
46028 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46029 // most of these to scalar anyway.
46030 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46031 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46032       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46033     return combinevXi1ConstantToInteger(N0, DAG);
46034 }
46035
46036 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46037 VT.getVectorElementType() == MVT::i1) {
46038 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46039 if (C->isAllOnes())
46040 return DAG.getConstant(1, SDLoc(N0), VT);
46041 if (C->isZero())
46042 return DAG.getConstant(0, SDLoc(N0), VT);
46043 }
46044 }
46045
46046 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46047 // Turn it into a sign bit compare that produces a k-register. This avoids
46048 // a trip through a GPR.
46049 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46050 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46051       isPowerOf2_32(VT.getVectorNumElements())) {
46052     unsigned NumElts = VT.getVectorNumElements();
46053 SDValue Src = N0;
46054
46055 // Peek through truncate.
46056 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46057 Src = N0.getOperand(0);
46058
46059 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46060 SDValue MovmskIn = Src.getOperand(0);
46061 MVT MovmskVT = MovmskIn.getSimpleValueType();
46062 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46063
46064 // We allow extra bits of the movmsk to be used since they are known zero.
46065 // We can't convert a VPMOVMSKB without avx512bw.
46066 if (MovMskElts <= NumElts &&
46067 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46068 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46069 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46070 SDLoc dl(N);
46071 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46072 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46073 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46074 if (EVT(CmpVT) == VT)
46075 return Cmp;
46076
46077 // Pad with zeroes up to original VT to replace the zeroes that were
46078 // being used from the MOVMSK.
46079 unsigned NumConcats = NumElts / MovMskElts;
46080 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46081 Ops[0] = Cmp;
46082 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46083 }
46084 }
46085 }
46086
46087 // Try to remove bitcasts from input and output of mask arithmetic to
46088 // remove GPR<->K-register crossings.
46089 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46090 return V;
46091
46092 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46093 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46094 SrcVT.getVectorNumElements() == 1)
46095 return N0.getOperand(1);
46096
46097 // Convert a bitcasted integer logic operation that has one bitcasted
46098 // floating-point operand into a floating-point logic operation. This may
46099 // create a load of a constant, but that is cheaper than materializing the
46100 // constant in an integer register and transferring it to an SSE register or
46101 // transferring the SSE operand to integer register and back.
46102 unsigned FPOpcode;
46103 switch (N0.getOpcode()) {
46104 // clang-format off
46105 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46106 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46107 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46108 default: return SDValue();
46109 // clang-format on
46110 }
46111
46112 // Check if we have a bitcast from another integer type as well.
46113 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46114 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46115 (Subtarget.hasFP16() && VT == MVT::f16) ||
46116 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46117 TLI.isTypeLegal(VT))))
46118 return SDValue();
46119
46120 SDValue LogicOp0 = N0.getOperand(0);
46121 SDValue LogicOp1 = N0.getOperand(1);
46122 SDLoc DL0(N0);
46123
46124 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46125 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46126 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46127 LogicOp0.getOperand(0).getValueType() == VT &&
46128 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46129 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46130 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46131 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46132 }
46133 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46134 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46135 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46136 LogicOp1.getOperand(0).getValueType() == VT &&
46137 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46138 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46139 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46140 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46141 }
46142
46143 return SDValue();
46144}
46145
46146 // (mul (zext a), (sext b))
46147static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46148 SDValue &Op1) {
46149 Op0 = Mul.getOperand(0);
46150 Op1 = Mul.getOperand(1);
46151
46152   // Operand 1 should be the sign-extended value.
46153 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46154 std::swap(Op0, Op1);
46155
46156 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46157 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46158 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46159 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46160 return true;
46161
46162 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46163 return (BV && BV->isConstant());
46164 };
46165
46166   // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46167   // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46168   // signed value, so we just check the sign bits.
46169 if ((IsFreeTruncation(Op0) &&
46170 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46171 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46172 return true;
46173
46174 return false;
46175}
46176
46177 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46178                               unsigned &LogBias, const SDLoc &DL,
46179 const X86Subtarget &Subtarget) {
46180 // Extend or truncate to MVT::i8 first.
46181 MVT Vi8VT =
46182 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46183 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46184 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46185
46186 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46187 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46188 // The src A, B element type is i8, but the dst C element type is i32.
46189 // When we calculate the reduce stage, we use src vector type vXi8 for it
46190 // so we need logbias 2 to avoid extra 2 stages.
46191 LogBias = 2;
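  // Added note (illustrative, not in the original source): each VPDPBUSD i32
  // result lane already sums four byte products, a 4:1 reduction, which is
  // where the log2(4) == 2 stage bias comes from.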
46192
46193 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46194 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46195 RegSize = std::max(512u, RegSize);
46196
46197 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46198 // fill in the missing vector elements with 0.
46199 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46200 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46201 Ops[0] = LHS;
46202 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46203 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46204 Ops[0] = RHS;
46205 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46206
46207 // Actually build the DotProduct, split as 256/512 bits for
46208 // AVXVNNI/AVX512VNNI.
46209 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46210                       ArrayRef<SDValue> Ops) {
46211     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46212 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46213 };
46214 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46215 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46216
46217 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46218 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46219}
46220
46221// Create a PSADBW given two sources representable as zexts of vXi8.
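// Added note (illustrative, not in the original source): PSADBW produces, per
// 64-bit lane, the sum of absolute differences of eight byte pairs, so with an
// all-zero second operand it acts as a horizontal add of eight zero-extended
// bytes.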
46222 static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46223                             const SDLoc &DL, const X86Subtarget &Subtarget) {
46224 // Find the appropriate width for the PSADBW.
46225 EVT DstVT = N0.getValueType();
46226 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46227 DstVT.getVectorElementCount());
46228 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46229
46230 // Widen the vXi8 vectors, padding with zero vector elements.
46231 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46232 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46233 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46234 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46235 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46236 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46237 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46238
46239 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46240 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46241                           ArrayRef<SDValue> Ops) {
46242     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46243 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46244 };
46245 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46246 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46247 PSADBWBuilder);
46248}
46249
46250 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46251 // PHMINPOSUW.
46252 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46253                                       const X86Subtarget &Subtarget) {
46254 // Bail without SSE41.
46255 if (!Subtarget.hasSSE41())
46256 return SDValue();
46257
46258 EVT ExtractVT = Extract->getValueType(0);
46259 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46260 return SDValue();
46261
46262 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46263 ISD::NodeType BinOp;
46264 SDValue Src = DAG.matchBinOpReduction(
46265 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46266 if (!Src)
46267 return SDValue();
46268
46269 EVT SrcVT = Src.getValueType();
46270 EVT SrcSVT = SrcVT.getScalarType();
46271 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46272 return SDValue();
46273
46274 SDLoc DL(Extract);
46275 SDValue MinPos = Src;
46276
46277 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46278 while (SrcVT.getSizeInBits() > 128) {
46279 SDValue Lo, Hi;
46280 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46281 SrcVT = Lo.getValueType();
46282 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46283 }
46284 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46285 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46286 "Unexpected value type");
46287
46288 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46289 // to flip the value accordingly.
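  // Added note (illustrative, not in the original source): e.g. for SMAX,
  // XOR-ing each i16 element with 0x7FFF reverses the ordering so that the
  // signed maximum becomes the unsigned minimum that PHMINPOSUW finds; the
  // same XOR afterwards recovers the original value.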
46290 SDValue Mask;
46291 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46292 if (BinOp == ISD::SMAX)
46293 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46294 else if (BinOp == ISD::SMIN)
46295 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46296 else if (BinOp == ISD::UMAX)
46297 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46298
46299 if (Mask)
46300 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46301
46302 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46303   // shuffling each upper element down and inserting zeros. This means that the
46304 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46305 // ready for the PHMINPOS.
46306 if (ExtractVT == MVT::i8) {
46307     SDValue Upper = DAG.getVectorShuffle(
46308         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46309 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46310 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46311 }
46312
46313   // Perform the PHMINPOS on a v8i16 vector.
46314 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46315 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46316 MinPos = DAG.getBitcast(SrcVT, MinPos);
46317
46318 if (Mask)
46319 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46320
46321 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46322 DAG.getVectorIdxConstant(0, DL));
46323}
46324
46325// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46326 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46327 const X86Subtarget &Subtarget) {
46328 // Bail without SSE2.
46329 if (!Subtarget.hasSSE2())
46330 return SDValue();
46331
46332 EVT ExtractVT = Extract->getValueType(0);
46333 unsigned BitWidth = ExtractVT.getSizeInBits();
46334 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46335 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46336 return SDValue();
46337
46338 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46339 ISD::NodeType BinOp;
46340 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46341 if (!Match && ExtractVT == MVT::i1)
46342 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46343 if (!Match)
46344 return SDValue();
46345
46346 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46347 // which we can't support here for now.
46348 if (Match.getScalarValueSizeInBits() != BitWidth)
46349 return SDValue();
46350
46351 SDValue Movmsk;
46352 SDLoc DL(Extract);
46353 EVT MatchVT = Match.getValueType();
46354 unsigned NumElts = MatchVT.getVectorNumElements();
46355 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46356 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46357 LLVMContext &Ctx = *DAG.getContext();
46358
46359 if (ExtractVT == MVT::i1) {
46360 // Special case for (pre-legalization) vXi1 reductions.
46361 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46362 return SDValue();
46363 if (Match.getOpcode() == ISD::SETCC) {
46364 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46365 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46366 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46367 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46368 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46369 X86::CondCode X86CC;
46370 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46371 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46372 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46373 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46374 DAG, X86CC))
46375 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46376 getSETCC(X86CC, V, DL, DAG));
46377 }
46378 }
46379 if (TLI.isTypeLegal(MatchVT)) {
46380 // If this is a legal AVX512 predicate type then we can just bitcast.
46381 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46382 Movmsk = DAG.getBitcast(MovmskVT, Match);
46383 } else {
46384 // Use combineBitcastvxi1 to create the MOVMSK.
46385 while (NumElts > MaxElts) {
46386 SDValue Lo, Hi;
46387 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46388 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46389 NumElts /= 2;
46390 }
46391 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46392 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46393 }
46394 if (!Movmsk)
46395 return SDValue();
46396 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46397 } else {
46398 // FIXME: Better handling of k-registers or 512-bit vectors?
46399 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46400 if (!(MatchSizeInBits == 128 ||
46401 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46402 return SDValue();
46403
46404 // Make sure this isn't a vector of 1 element. The perf win from using
46405 // MOVMSK diminishes with fewer elements in the reduction, but it is
46406 // generally better to get the comparison over to the GPRs as soon as
46407 // possible to reduce the number of vector ops.
46408 if (Match.getValueType().getVectorNumElements() < 2)
46409 return SDValue();
46410
46411 // Check that we are extracting a reduction of all sign bits.
46412 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46413 return SDValue();
46414
46415 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46416 SDValue Lo, Hi;
46417 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46418 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46419 MatchSizeInBits = Match.getValueSizeInBits();
46420 }
46421
46422 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46423 MVT MaskSrcVT;
46424 if (64 == BitWidth || 32 == BitWidth)
46425 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46426 MatchSizeInBits / BitWidth);
46427 else
46428 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46429
46430 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46431 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46432 NumElts = MaskSrcVT.getVectorNumElements();
46433 }
46434 assert((NumElts <= 32 || NumElts == 64) &&
46435 "Not expecting more than 64 elements");
46436
46437 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46438 if (BinOp == ISD::XOR) {
46439 // parity -> (PARITY(MOVMSK X))
46440 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46441 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46442 }
46443
46444 SDValue CmpC;
46445 ISD::CondCode CondCode;
46446 if (BinOp == ISD::OR) {
46447 // any_of -> MOVMSK != 0
46448 CmpC = DAG.getConstant(0, DL, CmpVT);
46449 CondCode = ISD::CondCode::SETNE;
46450 } else {
46451 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46452 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46453 DL, CmpVT);
46454 CondCode = ISD::CondCode::SETEQ;
46455 }
46456
46457 // The setcc produces an i8 of 0/1, so extend that to the result width and
46458 // negate to get the final 0/-1 mask value.
46459 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46460 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46461 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46462 return DAG.getNegative(Zext, DL, ExtractVT);
46463}
46464
46465 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46466 const X86Subtarget &Subtarget) {
46467 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46468 return SDValue();
46469
46470 EVT ExtractVT = Extract->getValueType(0);
46471 // Verify the type we're extracting is i32, as the output element type of
46472 // vpdpbusd is i32.
46473 if (ExtractVT != MVT::i32)
46474 return SDValue();
46475
46476 EVT VT = Extract->getOperand(0).getValueType();
46477 if (!isPowerOf2_32(VT.getVectorNumElements()))
46478 return SDValue();
46479
46480 // Match shuffle + add pyramid.
46481 ISD::NodeType BinOp;
46482 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46483
46484 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46485 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
46486 // before adding into the accumulator.
46487 // TODO:
46488 // We also need to verify that the multiply has at least 2x the number of bits
46489 // of the input. We shouldn't match
46490 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
46491 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46492 // Root = Root.getOperand(0);
46493
46494 // If there was a match, we want Root to be a mul.
46495 if (!Root || Root.getOpcode() != ISD::MUL)
46496 return SDValue();
46497
46498 // Check whether we have an extend and mul pattern
46499 SDValue LHS, RHS;
46500 if (!detectExtMul(DAG, Root, LHS, RHS))
46501 return SDValue();
46502
46503 // Create the dot product instruction.
46504 SDLoc DL(Extract);
46505 unsigned StageBias;
46506 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46507
46508 // If the original vector was wider than 4 elements, sum over the results
46509 // in the DP vector.
46510 unsigned Stages = Log2_32(VT.getVectorNumElements());
46511 EVT DpVT = DP.getValueType();
46512
46513 if (Stages > StageBias) {
46514 unsigned DpElems = DpVT.getVectorNumElements();
46515
46516 for (unsigned i = Stages - StageBias; i > 0; --i) {
46517 SmallVector<int, 16> Mask(DpElems, -1);
46518 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46519 Mask[j] = MaskEnd + j;
46520
46521 SDValue Shuffle =
46522 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46523 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46524 }
46525 }
46526
46527 // Return the lowest ExtractSizeInBits bits.
46528 EVT ResVT =
46529 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46530 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46531 DP = DAG.getBitcast(ResVT, DP);
46532 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46533 Extract->getOperand(1));
46534}
46535
46536 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46537 const X86Subtarget &Subtarget) {
46538 using namespace SDPatternMatch;
46539
46540 // PSADBW is only supported on SSE2 and up.
46541 if (!Subtarget.hasSSE2())
46542 return SDValue();
46543
46544 EVT ExtractVT = Extract->getValueType(0);
46545 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46546 ExtractVT != MVT::i64)
46547 return SDValue();
46548
46549 EVT VT = Extract->getOperand(0).getValueType();
46550 if (!isPowerOf2_32(VT.getVectorNumElements()))
46551 return SDValue();
46552
46553 // Match shuffle + add pyramid.
46554 ISD::NodeType BinOp;
46555 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46556 if (!Root)
46557 return SDValue();
46558
46559 // The operand is expected to be zero extended from i8.
46560 // In order to convert to i64 and above, an additional any/zero/sign
46561 // extend is expected.
46562 // The zero extend from 32 bits has no mathematical effect on the result.
46563 // Also, the sign extend is basically a zero extend
46564 // (it extends the sign bit, which is zero).
46565 // So it is correct to skip the sign/zero extend instruction.
46566 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46567 Root.getOpcode() == ISD::ZERO_EXTEND ||
46568 Root.getOpcode() == ISD::ANY_EXTEND)
46569 Root = Root.getOperand(0);
46570
46571 // Check whether we have a vXi8 abdu pattern.
46572 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46573 SDValue Src0, Src1;
46574 if (!sd_match(
46575 Root,
46576 m_AnyOf(
46577 m_SpecificVectorElementVT(
46578 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46579 m_SpecificVectorElementVT(
46580 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46581 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46582 m_Abs(
46583 m_Sub(m_AllOf(m_Value(Src0),
46584 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46585 m_AllOf(m_Value(Src1),
46586 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46587 return SDValue();
46588
46589 // Create the SAD instruction.
46590 SDLoc DL(Extract);
46591 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46592
46593 // If the original vector was wider than 8 elements, sum over the results
46594 // in the SAD vector.
46595 unsigned Stages = Log2_32(VT.getVectorNumElements());
46596 EVT SadVT = SAD.getValueType();
46597 if (Stages > 3) {
46598 unsigned SadElems = SadVT.getVectorNumElements();
46599
46600 for (unsigned i = Stages - 3; i > 0; --i) {
46601 SmallVector<int, 16> Mask(SadElems, -1);
46602 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46603 Mask[j] = MaskEnd + j;
46604
46605 SDValue Shuffle =
46606 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46607 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46608 }
46609 }
46610
46611 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46612 // Return the lowest ExtractSizeInBits bits.
46613 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46614 SadVT.getSizeInBits() / ExtractSizeInBits);
46615 SAD = DAG.getBitcast(ResVT, SAD);
46616 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46617 Extract->getOperand(1));
46618}
46619
46620// If this extract is from a loaded vector value and will be used as an
46621// integer, that requires a potentially expensive XMM -> GPR transfer.
46622// Additionally, if we can convert to a scalar integer load, that will likely
46623// be folded into a subsequent integer op.
46624// Note: SrcVec might not have a VecVT type, but it must be the same size.
46625// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46626// to a single-use of the loaded vector. For the reasons above, we
46627// expect this to be profitable even if it creates an extra load.
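// For example: (i32 (extract_elt (load <4 x i32> p), 2)) is rebuilt below as a
// scalar i32 load from p+8, reusing the vector load's chain, pointer info and
// alignment.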
46628static SDValue
46629 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46630 const SDLoc &dl, SelectionDAG &DAG,
46631 TargetLowering::DAGCombinerInfo &DCI) {
46632 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46633 "Only EXTRACT_VECTOR_ELT supported so far");
46634
46635 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46636 EVT VT = N->getValueType(0);
46637
46638 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46639 return Use->getOpcode() == ISD::STORE ||
46640 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46641 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46642 });
46643
46644 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46645 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46646 VecVT.getVectorElementType() == VT &&
46647 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46648 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46649 SDValue NewPtr = TLI.getVectorElementPointer(
46650 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46651 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46652 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46653 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46654 SDValue Load =
46655 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46656 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46657 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46658 return Load;
46659 }
46660
46661 return SDValue();
46662}
46663
46664// Attempt to peek through a target shuffle and extract the scalar from the
46665// source.
46666 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46667 TargetLowering::DAGCombinerInfo &DCI,
46668 const X86Subtarget &Subtarget) {
46669 if (DCI.isBeforeLegalizeOps())
46670 return SDValue();
46671
46672 SDLoc dl(N);
46673 SDValue Src = N->getOperand(0);
46674 SDValue Idx = N->getOperand(1);
46675
46676 EVT VT = N->getValueType(0);
46677 EVT SrcVT = Src.getValueType();
46678 EVT SrcSVT = SrcVT.getVectorElementType();
46679 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46680 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46681
46682 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46683 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46684 return SDValue();
46685
46686 const APInt &IdxC = N->getConstantOperandAPInt(1);
46687 if (IdxC.uge(NumSrcElts))
46688 return SDValue();
46689
46690 SDValue SrcBC = peekThroughBitcasts(Src);
46691
46692 // Handle extract(bitcast(broadcast(scalar_value))).
46693 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46694 SDValue SrcOp = SrcBC.getOperand(0);
46695 EVT SrcOpVT = SrcOp.getValueType();
46696 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46697 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46698 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46699 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46700 // TODO support non-zero offsets.
46701 if (Offset == 0) {
46702 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46703 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46704 return SrcOp;
46705 }
46706 }
46707 }
46708
46709 // If we're extracting a single element from a broadcast load and there are
46710 // no other users, just create a single load.
46711 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46712 SrcBC.hasOneUse()) {
46713 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46714 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46715 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46716 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46717 SDValue Load =
46718 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46719 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46720 MemIntr->getMemOperand()->getFlags());
46721 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46722 return Load;
46723 }
46724 }
46725
46726 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46727 // TODO: Move to DAGCombine?
46728 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46729 SrcBC.getValueType().isInteger() &&
46730 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46731 SrcBC.getScalarValueSizeInBits() ==
46732 SrcBC.getOperand(0).getValueSizeInBits()) {
46733 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46734 if (IdxC.ult(Scale)) {
46735 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46736 SDValue Scl = SrcBC.getOperand(0);
46737 EVT SclVT = Scl.getValueType();
46738 if (Offset) {
46739 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46740 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46741 }
46742 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46743 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46744 return Scl;
46745 }
46746 }
46747
46748 // Handle extract(truncate(x)) for 0'th index.
46749 // TODO: Treat this as a faux shuffle?
46750 // TODO: When can we use this for general indices?
46751 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46752 (SrcVT.getSizeInBits() % 128) == 0) {
46753 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46754 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46755 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46756 Idx);
46757 }
46758
46759 // We can only legally extract other elements from 128-bit vectors and in
46760 // certain circumstances, depending on SSE-level.
46761 // TODO: Investigate float/double extraction if it will be just stored.
46762 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46763 unsigned Idx) {
46764 EVT VecSVT = VecVT.getScalarType();
46765 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46766 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46767 VecSVT == MVT::i64)) {
46768 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46769 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46770 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46771 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46772 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46773 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46774 Idx &= (NumEltsPerLane - 1);
46775 }
46776 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46777 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46778 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46779 DAG.getBitcast(VecVT, Vec),
46780 DAG.getVectorIdxConstant(Idx, dl));
46781 }
46782 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46783 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46784 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46785 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46786 DAG.getTargetConstant(Idx, dl, MVT::i8));
46787 }
46788 return SDValue();
46789 };
46790
46791 // Resolve the target shuffle inputs and mask.
46792 SmallVector<SDValue, 2> Ops;
46793 SmallVector<int, 16> Mask;
46794 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46795 return SDValue();
46796
46797 // Shuffle inputs must be the same size as the result.
46798 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46799 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46800 }))
46801 return SDValue();
46802
46803 // Attempt to narrow/widen the shuffle mask to the correct size.
46804 if (Mask.size() != NumSrcElts) {
46805 if ((NumSrcElts % Mask.size()) == 0) {
46806 SmallVector<int, 16> ScaledMask;
46807 int Scale = NumSrcElts / Mask.size();
46808 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46809 Mask = std::move(ScaledMask);
46810 } else if ((Mask.size() % NumSrcElts) == 0) {
46811 // Simplify Mask based on demanded element.
46812 int ExtractIdx = (int)IdxC.getZExtValue();
46813 int Scale = Mask.size() / NumSrcElts;
46814 int Lo = Scale * ExtractIdx;
46815 int Hi = Scale * (ExtractIdx + 1);
46816 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46817 if (i < Lo || Hi <= i)
46818 Mask[i] = SM_SentinelUndef;
46819
46820 SmallVector<int, 16> WidenedMask;
46821 while (Mask.size() > NumSrcElts &&
46822 canWidenShuffleElements(Mask, WidenedMask))
46823 Mask = std::move(WidenedMask);
46824 }
46825 }
46826
46827 // If narrowing/widening failed, see if we can extract+zero-extend.
46828 int ExtractIdx;
46829 EVT ExtractVT;
46830 if (Mask.size() == NumSrcElts) {
46831 ExtractIdx = Mask[IdxC.getZExtValue()];
46832 ExtractVT = SrcVT;
46833 } else {
46834 unsigned Scale = Mask.size() / NumSrcElts;
46835 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46836 return SDValue();
46837 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46838 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46839 return SDValue();
46840 ExtractIdx = Mask[ScaledIdx];
46841 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46842 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46843 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46844 "Failed to widen vector type");
46845 }
46846
46847 // If the shuffle source element is undef/zero then we can just accept it.
46848 if (ExtractIdx == SM_SentinelUndef)
46849 return DAG.getUNDEF(VT);
46850
46851 if (ExtractIdx == SM_SentinelZero)
46852 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46853 : DAG.getConstant(0, dl, VT);
46854
46855 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46856 ExtractIdx = ExtractIdx % Mask.size();
46857 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46858 return DAG.getZExtOrTrunc(V, dl, VT);
46859
46860 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46861 if (SDValue V = combineExtractFromVectorLoad(
46862 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46863 return V;
46864
46865 return SDValue();
46866}
46867
46868/// Extracting a scalar FP value from vector element 0 is free, so extract each
46869/// operand first, then perform the math as a scalar op.
46870 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46871 const X86Subtarget &Subtarget,
46872 TargetLowering::DAGCombinerInfo &DCI) {
46873 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46874 SDValue Vec = ExtElt->getOperand(0);
46875 SDValue Index = ExtElt->getOperand(1);
46876 EVT VT = ExtElt->getValueType(0);
46877 EVT VecVT = Vec.getValueType();
46878
46879 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46880 // non-zero element because the shuffle+scalar op will be cheaper?
46881 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46882 return SDValue();
46883
46884 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46885 // extract, the condition code), so deal with those as a special-case.
46886 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46887 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46888 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46889 return SDValue();
46890
46891 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46892 SDLoc DL(ExtElt);
46893 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46894 Vec.getOperand(0), Index);
46895 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46896 Vec.getOperand(1), Index);
46897 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46898 }
46899
46900 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46901 VT != MVT::f64)
46902 return SDValue();
46903
46904 // Vector FP selects don't fit the pattern of FP math ops (because the
46905 // condition has a different type and we have to change the opcode), so deal
46906 // with those here.
46907 // FIXME: This is restricted to pre type legalization. If we loosen this we
46908 // need to convert vector bool to a scalar bool.
46909 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46910 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46911 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46912 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46913 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46914 SDLoc DL(ExtElt);
46915 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46916 Vec.getOperand(0).getValueType().getScalarType(),
46917 Vec.getOperand(0), Index);
46918 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46919 Vec.getOperand(1), Index);
46920 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46921 Vec.getOperand(2), Index);
46922 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46923 }
46924
46925 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46926 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46927 // missed load folding and fma+fneg combining.
46928 switch (Vec.getOpcode()) {
46929 case ISD::FMA: // Begin 3 operands
46930 case ISD::FMAD:
46931 case ISD::FADD: // Begin 2 operands
46932 case ISD::FSUB:
46933 case ISD::FMUL:
46934 case ISD::FDIV:
46935 case ISD::FREM:
46936 case ISD::FCOPYSIGN:
46937 case ISD::FMINNUM:
46938 case ISD::FMAXNUM:
46939 case ISD::FMINNUM_IEEE:
46940 case ISD::FMAXNUM_IEEE:
46941 case ISD::FMAXIMUM:
46942 case ISD::FMINIMUM:
46943 case ISD::FMAXIMUMNUM:
46944 case ISD::FMINIMUMNUM:
46945 case X86ISD::FMAX:
46946 case X86ISD::FMIN:
46947 case ISD::FABS: // Begin 1 operand
46948 case ISD::FSQRT:
46949 case ISD::FRINT:
46950 case ISD::FCEIL:
46951 case ISD::FTRUNC:
46952 case ISD::FNEARBYINT:
46953 case ISD::FROUNDEVEN:
46954 case ISD::FROUND:
46955 case ISD::FFLOOR:
46956 case X86ISD::FRCP:
46957 case X86ISD::FRSQRT: {
46958 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46959 SDLoc DL(ExtElt);
46960 SmallVector<SDValue, 4> ExtOps;
46961 for (SDValue Op : Vec->ops())
46962 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46963 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46964 }
46965 default:
46966 return SDValue();
46967 }
46968 llvm_unreachable("All opcodes should return within switch");
46969}
46970
46971/// Try to convert a vector reduction sequence composed of binops and shuffles
46972/// into horizontal ops.
46973 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46974 const X86Subtarget &Subtarget) {
46975 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46976
46977 // We need at least SSE2 to anything here.
46978 if (!Subtarget.hasSSE2())
46979 return SDValue();
46980
46981 ISD::NodeType Opc;
46982 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46983 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46984 if (!Rdx)
46985 return SDValue();
46986
46987 SDValue Index = ExtElt->getOperand(1);
46988 assert(isNullConstant(Index) &&
46989 "Reduction doesn't end in an extract from index 0");
46990
46991 EVT VT = ExtElt->getValueType(0);
46992 EVT VecVT = Rdx.getValueType();
46993 if (VecVT.getScalarType() != VT)
46994 return SDValue();
46995
46996 SDLoc DL(ExtElt);
46997 unsigned NumElts = VecVT.getVectorNumElements();
46998 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46999
47000 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
47001 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
47002 if (V.getValueType() == MVT::v4i8) {
47003 if (ZeroExtend && Subtarget.hasSSE41()) {
47004 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
47005 DAG.getConstant(0, DL, MVT::v4i32),
47006 DAG.getBitcast(MVT::i32, V),
47007 DAG.getVectorIdxConstant(0, DL));
47008 return DAG.getBitcast(MVT::v16i8, V);
47009 }
47010 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
47011 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
47012 : DAG.getUNDEF(MVT::v4i8));
47013 }
47014 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
47015 DAG.getUNDEF(MVT::v8i8));
47016 };
47017
47018 // vXi8 mul reduction - promote to vXi16 mul reduction.
47019 if (Opc == ISD::MUL) {
47020 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47021 return SDValue();
47022 if (VecVT.getSizeInBits() >= 128) {
47023 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47024 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47025 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47026 Lo = DAG.getBitcast(WideVT, Lo);
47027 Hi = DAG.getBitcast(WideVT, Hi);
47028 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47029 while (Rdx.getValueSizeInBits() > 128) {
47030 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47031 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47032 }
47033 } else {
47034 Rdx = WidenToV16I8(Rdx, false);
47035 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47036 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47037 }
47038 if (NumElts >= 8)
47039 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47040 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47041 {4, 5, 6, 7, -1, -1, -1, -1}));
47042 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47043 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47044 {2, 3, -1, -1, -1, -1, -1, -1}));
47045 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47046 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47047 {1, -1, -1, -1, -1, -1, -1, -1}));
47048 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47049 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47050 }
47051
47052 // vXi8 add reduction - sub-128-bit vector.
47053 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47054 Rdx = WidenToV16I8(Rdx, true);
47055 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47056 DAG.getConstant(0, DL, MVT::v16i8));
47057 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47058 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47059 }
47060
47061 // Must be a >=128-bit vector with pow2 elements.
47062 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47063 return SDValue();
47064
47065 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
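// (PSADBW against an all-zeroes vector sums each group of 8 bytes into the
// corresponding i64 lane, so after the lo/hi halves are added a single PSADBW
// plus taking the low byte finishes the reduction.)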
47066 if (VT == MVT::i8) {
47067 while (Rdx.getValueSizeInBits() > 128) {
47068 SDValue Lo, Hi;
47069 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47070 VecVT = Lo.getValueType();
47071 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47072 }
47073 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47074
47075 SDValue Hi = DAG.getVectorShuffle(
47076 MVT::v16i8, DL, Rdx, Rdx,
47077 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47078 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47079 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47080 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47081 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47082 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47083 }
47084
47085 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47086 // If the source vector values are 0-255, then we can use PSADBW to
47087 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47088 // TODO: See if it's worth avoiding vXi16/i32 truncations?
47089 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47090 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47091 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47092 Subtarget.hasAVX512())) {
47093 if (Rdx.getValueType() == MVT::v8i16) {
47094 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47095 DAG.getUNDEF(MVT::v8i16));
47096 } else {
47097 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47098 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47099 if (ByteVT.getSizeInBits() < 128)
47100 Rdx = WidenToV16I8(Rdx, true);
47101 }
47102
47103 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47104 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47105 ArrayRef<SDValue> Ops) {
47106 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47107 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47108 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47109 };
47110 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47111 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47112
47113 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47114 while (Rdx.getValueSizeInBits() > 128) {
47115 SDValue Lo, Hi;
47116 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47117 VecVT = Lo.getValueType();
47118 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47119 }
47120 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47121
47122 if (NumElts > 8) {
47123 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47124 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47125 }
47126
47127 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47128 Rdx = DAG.getBitcast(VecVT, Rdx);
47129 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47130 }
47131
47132 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
47133 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47134 return SDValue();
47135
47136 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47137
47138 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47139 // across the whole vector, so we need an extract + hop preliminary stage.
47140 // This is the only step where the operands of the hop are not the same value.
47141 // TODO: We could extend this to handle 512-bit or even longer vectors.
47142 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47143 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47144 unsigned NumElts = VecVT.getVectorNumElements();
47145 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47146 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47147 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47148 VecVT = Rdx.getValueType();
47149 }
47150 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47151 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47152 return SDValue();
47153
47154 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47155 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47156 for (unsigned i = 0; i != ReductionSteps; ++i)
47157 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47158
47159 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47160}
47161
47162/// Detect vector gather/scatter index generation and convert it from being a
47163/// bunch of shuffles and extracts into a somewhat faster sequence.
47164/// For i686, the best sequence is apparently storing the value and loading
47165/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47166 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47167 TargetLowering::DAGCombinerInfo &DCI,
47168 const X86Subtarget &Subtarget) {
47169 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47170 return NewOp;
47171
47172 SDValue InputVector = N->getOperand(0);
47173 SDValue EltIdx = N->getOperand(1);
47174 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47175
47176 EVT SrcVT = InputVector.getValueType();
47177 EVT VT = N->getValueType(0);
47178 SDLoc dl(InputVector);
47179 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47180 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47181 unsigned NumEltBits = VT.getScalarSizeInBits();
47182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47183
47184 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47185 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47186
47187 // Integer Constant Folding.
47188 if (CIdx && VT.isInteger()) {
47189 APInt UndefVecElts;
47190 SmallVector<APInt, 16> EltBits;
47191 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47192 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47193 EltBits, /*AllowWholeUndefs*/ true,
47194 /*AllowPartialUndefs*/ false)) {
47195 uint64_t Idx = CIdx->getZExtValue();
47196 if (UndefVecElts[Idx])
47197 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47198 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47199 }
47200
47201 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47202 // This improves lowering of bool masks on Rust, which splits them into a byte array.
47203 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47204 SDValue Src = peekThroughBitcasts(InputVector);
47205 if (Src.getValueType().getScalarType() == MVT::i1 &&
47206 TLI.isTypeLegal(Src.getValueType())) {
47207 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47208 SDValue Sub = DAG.getNode(
47209 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47210 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47211 return DAG.getBitcast(VT, Sub);
47212 }
47213 }
47214 }
47215
47216 if (IsPextr) {
47217 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47218 DCI))
47219 return SDValue(N, 0);
47220
47221 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47222 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47223 InputVector.getOpcode() == X86ISD::PINSRW) &&
47224 InputVector.getOperand(2) == EltIdx) {
47225 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47226 "Vector type mismatch");
47227 SDValue Scl = InputVector.getOperand(1);
47228 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47229 return DAG.getZExtOrTrunc(Scl, dl, VT);
47230 }
47231
47232 // TODO - Remove this once we can handle the implicit zero-extension of
47233 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47234 // combineBasicSADPattern.
47235 return SDValue();
47236 }
47237
47238 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
47239 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47240 InputVector.getOpcode() == ISD::BITCAST &&
47241 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47242 isNullConstant(EltIdx) && InputVector.hasOneUse())
47243 return DAG.getBitcast(VT, InputVector);
47244
47245 // Detect mmx to i32 conversion through a v2i32 elt extract.
47246 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47247 InputVector.getOpcode() == ISD::BITCAST &&
47248 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47249 isNullConstant(EltIdx) && InputVector.hasOneUse())
47250 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47251 InputVector.getOperand(0));
47252
47253 // Check whether this extract is the root of a sum of absolute differences
47254 // pattern. This has to be done here because we really want it to happen
47255 // pre-legalization.
47256 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47257 return SAD;
47258
47259 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47260 return VPDPBUSD;
47261
47262 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47263 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47264 return Cmp;
47265
47266 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47267 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47268 return MinMax;
47269
47270 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47271 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47272 return V;
47273
47274 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47275 return V;
47276
47277 if (CIdx)
47278 if (SDValue V = combineExtractFromVectorLoad(
47279 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47280 dl, DAG, DCI))
47281 return V;
47282
47283 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47284 // and then testing the relevant element.
47285 //
47286 // Note that we only combine extracts on the *same* result number, i.e.
47287 // t0 = merge_values a0, a1, a2, a3
47288 // i1 = extract_vector_elt t0, Constant:i64<2>
47289 // i1 = extract_vector_elt t0, Constant:i64<3>
47290 // but not
47291 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47292 // since the latter would need its own MOVMSK.
47293 if (SrcVT.getScalarType() == MVT::i1) {
47294 bool IsVar = !CIdx;
47295 SmallVector<SDNode *, 16> BoolExtracts;
47296 unsigned ResNo = InputVector.getResNo();
47297 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47298 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47299 Use->getOperand(0).getResNo() == ResNo &&
47300 Use->getValueType(0) == MVT::i1) {
47301 BoolExtracts.push_back(Use);
47302 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47303 return true;
47304 }
47305 return false;
47306 };
47307 // TODO: Can we drop the oneuse check for constant extracts?
47308 if (all_of(InputVector->users(), IsBoolExtract) &&
47309 (IsVar || BoolExtracts.size() > 1)) {
47310 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47311 if (SDValue BC =
47312 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47313 for (SDNode *Use : BoolExtracts) {
47314 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47315 // Mask = 1 << MaskIdx
47316 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47317 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47318 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47319 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47320 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47321 DCI.CombineTo(Use, Res);
47322 }
47323 return SDValue(N, 0);
47324 }
47325 }
47326 }
47327
47328 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47329 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47330 SDValue TruncSrc = InputVector.getOperand(0);
47331 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47332 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47333 SDValue NewExt =
47334 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47335 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47336 }
47337 }
47338
47339 return SDValue();
47340}
47341
47342// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47343// This is more or less the reverse of combineBitcastvxi1.
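// Example: (v8i16 (sext (v8i1 (bitcast (i8 X))))) becomes: splat X to all
// lanes, AND lane i with (1 << i), SETEQ against that same bit mask, then
// sign-extend the i1 results (a final SRL produces the zero-extended form).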
47344 static SDValue combineToExtendBoolVectorInReg(
47345 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47346 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47347 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47348 Opcode != ISD::ANY_EXTEND)
47349 return SDValue();
47350 if (!DCI.isBeforeLegalizeOps())
47351 return SDValue();
47352 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47353 return SDValue();
47354
47355 EVT SVT = VT.getScalarType();
47356 EVT InSVT = N0.getValueType().getScalarType();
47357 unsigned EltSizeInBits = SVT.getSizeInBits();
47358
47359 // Input type must be extending a bool vector (bit-casted from a scalar
47360 // integer) to legal integer types.
47361 if (!VT.isVector())
47362 return SDValue();
47363 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47364 return SDValue();
47365 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47366 return SDValue();
47367
47368 SDValue N00 = N0.getOperand(0);
47369 EVT SclVT = N00.getValueType();
47370 if (!SclVT.isScalarInteger())
47371 return SDValue();
47372
47373 SDValue Vec;
47374 SmallVector<int> ShuffleMask;
47375 unsigned NumElts = VT.getVectorNumElements();
47376 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47377
47378 // Broadcast the scalar integer to the vector elements.
47379 if (NumElts > EltSizeInBits) {
47380 // If the scalar integer is greater than the vector element size, then we
47381 // must split it down into sub-sections for broadcasting. For example:
47382 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47383 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47384 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47385 unsigned Scale = NumElts / EltSizeInBits;
47386 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47387 bool UseBroadcast = Subtarget.hasInt256() &&
47388 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47389 Vec = UseBroadcast
47390 ? DAG.getSplat(BroadcastVT, DL, N00)
47391 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47392 Vec = DAG.getBitcast(VT, Vec);
47393
47394 for (unsigned i = 0; i != Scale; ++i) {
47395 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47396 ShuffleMask.append(EltSizeInBits, i + Offset);
47397 }
47398 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47399 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47400 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47401 // If we have register broadcast instructions, use the scalar size as the
47402 // element type for the shuffle. Then cast to the wider element type. The
47403 // widened bits won't be used, and this might allow the use of a broadcast
47404 // load.
47405 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47406 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47407 (NumElts * EltSizeInBits) / NumElts);
47408 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47409 } else {
47410 // For smaller scalar integers, we can simply any-extend it to the vector
47411 // element size (we don't care about the upper bits) and broadcast it to all
47412 // elements.
47413 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47414 }
47415
47416 // Now, mask the relevant bit in each element.
47417 SmallVector<SDValue, 8> Bits;
47418 for (unsigned i = 0; i != NumElts; ++i) {
47419 int BitIdx = (i % EltSizeInBits);
47420 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47421 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47422 }
47423 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47424 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47425
47426 // Compare against the bitmask and extend the result.
47427 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47428 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47429 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47430
47431 // For SEXT, this is now done, otherwise shift the result down for
47432 // zero-extension.
47433 if (Opcode == ISD::SIGN_EXTEND)
47434 return Vec;
47435 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47436 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47437}
47438
47439/// If both arms of a vector select are concatenated vectors, split the select,
47440/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47441/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47442/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47443 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47444 const X86Subtarget &Subtarget) {
47445 unsigned Opcode = N->getOpcode();
47446 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47447 return SDValue();
47448
47449 // TODO: Split 512-bit vectors too?
47450 EVT VT = N->getValueType(0);
47451 if (!VT.is256BitVector())
47452 return SDValue();
47453
47454 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47455 SDValue Cond = N->getOperand(0);
47456 SDValue TVal = N->getOperand(1);
47457 SDValue FVal = N->getOperand(2);
47458 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47459 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47460 return SDValue();
47461
47462 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47463 ArrayRef<SDValue> Ops) {
47464 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47465 };
47466 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47467 /*CheckBWI*/ false);
47468}
47469
47470 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47471 const SDLoc &DL) {
47472 SDValue Cond = N->getOperand(0);
47473 SDValue LHS = N->getOperand(1);
47474 SDValue RHS = N->getOperand(2);
47475
47476 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47477 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47478 if (!TrueC || !FalseC)
47479 return SDValue();
47480
47481 // Don't do this for crazy integer types.
47482 EVT VT = N->getValueType(0);
47483 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47484 return SDValue();
47485
47486 // We're going to use the condition bit in math or logic ops. We could allow
47487 // this with a wider condition value (post-legalization it becomes an i8),
47488 // but if nothing is creating selects that late, it doesn't matter.
47489 if (Cond.getValueType() != MVT::i1)
47490 return SDValue();
47491
47492 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47493 // 3, 5, or 9 with i32/i64, so those get transformed too.
47494 // TODO: For constants that overflow or do not differ by power-of-2 or small
47495 // multiplier, convert to 'and' + 'add'.
47496 const APInt &TrueVal = TrueC->getAPIntValue();
47497 const APInt &FalseVal = FalseC->getAPIntValue();
47498
47499 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47500 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47501 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47502 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47503 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47504 return SDValue();
47505 }
47506
47507 bool OV;
47508 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47509 if (OV)
47510 return SDValue();
47511
47512 APInt AbsDiff = Diff.abs();
47513 if (AbsDiff.isPowerOf2() ||
47514 ((VT == MVT::i32 || VT == MVT::i64) &&
47515 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47516
47517 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47518 // of the condition can usually be folded into a compare predicate, but even
47519 // without that, the sequence should be cheaper than a CMOV alternative.
47520 if (TrueVal.slt(FalseVal)) {
47521 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47522 std::swap(TrueC, FalseC);
47523 }
47524
47525 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
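// e.g. select Cond, 42, 10 --> (zext(Cond) * 32) + 10, where the multiply by
// the power-of-2 difference becomes a shift (or an LEA for diffs of 3/5/9).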
47526 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47527
47528 // Multiply condition by the difference if non-one.
47529 if (!AbsDiff.isOne())
47530 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47531
47532 // Add the base if non-zero.
47533 if (!FalseC->isZero())
47534 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47535
47536 return R;
47537 }
47538
47539 return SDValue();
47540}
47541
47542/// If this is a *dynamic* select (non-constant condition) and we can match
47543/// this node with one of the variable blend instructions, restructure the
47544/// condition so that blends can use the high (sign) bit of each element.
47545/// This function will also call SimplifyDemandedBits on already created
47546/// BLENDV to perform additional simplifications.
47547 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47548 const SDLoc &DL,
47549 TargetLowering::DAGCombinerInfo &DCI,
47550 const X86Subtarget &Subtarget) {
47551 SDValue Cond = N->getOperand(0);
47552 if ((N->getOpcode() != ISD::VSELECT &&
47553 N->getOpcode() != X86ISD::BLENDV) ||
47554 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47555 return SDValue();
47556
47557 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47558 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47559 EVT VT = N->getValueType(0);
47560
47561 // We can only handle the cases where VSELECT is directly legal on the
47562 // subtarget. We custom lower VSELECT nodes with constant conditions and
47563 // this makes it hard to see whether a dynamic VSELECT will correctly
47564 // lower, so we both check the operation's status and explicitly handle the
47565 // cases where a *dynamic* blend will fail even though a constant-condition
47566 // blend could be custom lowered.
47567 // FIXME: We should find a better way to handle this class of problems.
47568 // Potentially, we should combine constant-condition vselect nodes
47569 // pre-legalization into shuffles and not mark as many types as custom
47570 // lowered.
47571 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47572 return SDValue();
47573 // FIXME: We don't support i16-element blends currently. We could and
47574 // should support them by making *all* the bits in the condition be set
47575 // rather than just the high bit and using an i8-element blend.
47576 if (VT.getVectorElementType() == MVT::i16)
47577 return SDValue();
47578 // Dynamic blending was only available from SSE4.1 onward.
47579 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47580 return SDValue();
47581 // Byte blends are only available in AVX2
47582 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47583 return SDValue();
47584 // There are no 512-bit blend instructions that use sign bits.
47585 if (VT.is512BitVector())
47586 return SDValue();
47587
47588 // Don't optimize before the condition has been transformed to a legal type
47589 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47590 if (BitWidth < 8 || BitWidth > 64)
47591 return SDValue();
47592
47593 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47594 for (SDUse &Use : Cond->uses())
47595 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47596 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47597 Use.getOperandNo() != 0)
47598 return false;
47599
47600 return true;
47601 };
47602
47603 APInt DemandedBits(APInt::getSignMask(BitWidth));
47604
47605 if (OnlyUsedAsSelectCond(Cond)) {
47606 KnownBits Known;
47607 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47608 !DCI.isBeforeLegalizeOps());
47609 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47610 return SDValue();
47611
47612 // If we changed the computation somewhere in the DAG, this change will
47613 // affect all users of Cond. Update all the nodes so that we do not use
47614 // the generic VSELECT anymore. Otherwise, we may perform wrong
47615 // optimizations as we messed with the actual expectation for the vector
47616 // boolean values.
47617 for (SDNode *U : Cond->users()) {
47618 if (U->getOpcode() == X86ISD::BLENDV)
47619 continue;
47620
47621 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47622 Cond, U->getOperand(1), U->getOperand(2));
47623 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47624 DCI.AddToWorklist(U);
47625 }
47626 DCI.CommitTargetLoweringOpt(TLO);
47627 return SDValue(N, 0);
47628 }
47629
47630 // Otherwise we can still at least try to simplify multiple use bits.
47631 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47632 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47633 N->getOperand(1), N->getOperand(2));
47634
47635 return SDValue();
47636}
47637
47638// Try to match:
47639// (or (and (M, (sub 0, X)), (pandn M, X)))
47640// which is a special case of:
47641// (select M, (sub 0, X), X)
47642// Per:
47643// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47644// We know that, if fNegate is 0 or 1:
47645// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47646//
47647// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47648// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47649// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47650// This lets us transform our vselect to:
47651// (add (xor X, M), (and M, 1))
47652// And further to:
47653// (sub (xor X, M), M)
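// Quick check: with M == all-ones, (X ^ M) - M == ~X - (-1) == ~X + 1 == -X;
// with M == 0 it is simply X - 0 == X.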
47654 static SDValue combineLogicBlendIntoConditionalNegate(
47655 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47656 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47657 using namespace SDPatternMatch;
47658 EVT MaskVT = Mask.getValueType();
47659 assert(MaskVT.isInteger() &&
47660 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47661 "Mask must be zero/all-bits");
47662
47663 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47664 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47665 return SDValue();
47666
47667 SDValue V;
47668 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47669 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47670 return SDValue();
47671
47672 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47673 SDValue SubOp2 = Mask;
47674
47675 // If the negate was on the false side of the select, then
47676 // the operands of the SUB need to be swapped. PR 27251.
47677 // This is because the pattern being matched above is
47678 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47679 // but if the pattern matched was
47680 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47681 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47682 // pattern also needs to be a negation of the replacement pattern above.
47683 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47684 // sub accomplishes the negation of the replacement pattern.
47685 if (V == Y)
47686 std::swap(SubOp1, SubOp2);
47687
47688 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47689 return DAG.getBitcast(VT, Res);
47690}
47691
47692 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47693 const X86Subtarget &Subtarget) {
47694 using namespace SDPatternMatch;
47695 if (!Subtarget.hasAVX512())
47696 return SDValue();
47697
47698 ISD::CondCode CC;
47699 SDValue Cond, X, Y, LHS, RHS;
47700 if (!sd_match(N, m_Select(m_OneUse(m_AllOf(m_Value(Cond),
47701 m_SetCC(m_Value(X), m_Value(Y),
47702 m_CondCode(CC)))),
47703 m_Value(LHS), m_Value(RHS))))
47704 return SDValue();
47705
47706 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47707 !canCombineAsMaskOperation(RHS, Subtarget))
47708 return SDValue();
47709
47710 // Commute LHS and RHS to create an opportunity to select a mask instruction.
47711 // (vselect M, L, R) -> (vselect ~M, R, L)
47712 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47713 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47714 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47715}
47716
47717/// Do target-specific dag combines on SELECT and VSELECT nodes.
47718 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47719 TargetLowering::DAGCombinerInfo &DCI,
47720 const X86Subtarget &Subtarget) {
47721 SDLoc DL(N);
47722 SDValue Cond = N->getOperand(0);
47723 SDValue LHS = N->getOperand(1);
47724 SDValue RHS = N->getOperand(2);
47725
47726 // Try simplification again because we use this function to optimize
47727 // BLENDV nodes that are not handled by the generic combiner.
47728 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47729 return V;
47730
47731 // When avx512 is available the lhs operand of select instruction can be
47732 // folded with mask instruction, while the rhs operand can't. Commute the
47733 // lhs and rhs of the select instruction to create the opportunity of
47734 // folding.
47735 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47736 return V;
47737
47738 EVT VT = LHS.getValueType();
47739 EVT CondVT = Cond.getValueType();
47740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47741 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47742
47743 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47744 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47745 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47746 if (CondVT.isVector() && CondVT.isInteger() &&
47747 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47748 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47749 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47750 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47751 DL, DAG, Subtarget))
47752 return V;
47753
47754 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47755 SmallVector<int, 64> CondMask;
47756 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47757 N->getOpcode() == X86ISD::BLENDV)) {
47758 // Convert vselects with constant condition into shuffles.
47759 if (DCI.isBeforeLegalizeOps())
47760 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47761
47762 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47763 // by forcing the unselected elements to zero.
47764 // TODO: Can we handle more shuffles with this?
47765 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47766 SmallVector<SDValue, 1> LHSOps, RHSOps;
47767 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47768 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47769 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47770 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47771 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47772 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47773 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47774 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47775 assert(ByteMask.size() == LHSMask.size() &&
47776 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47777 for (auto [I, M] : enumerate(ByteMask)) {
47778 // getConstVector sets negative shuffle mask values as undef, so
47779 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47780 if (M < (int)ByteMask.size()) {
47781 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47782 RHSMask[I] = 0x80;
47783 } else {
47784 LHSMask[I] = 0x80;
47785 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47786 }
47787 }
47788 MVT ByteVT = LHSShuf.getSimpleValueType();
47789 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47790 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47791 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47792 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47793 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47794 }
47795 }
47796
47797 // Attempt to combine as shuffle.
47798 SDValue Op(N, 0);
47799 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47800 return Res;
47801 }
47802 }
47803
47804 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47805 // instructions match the semantics of the common C idiom x<y?x:y but not
47806 // x<=y?x:y, because of how they handle negative zero (which can be
47807 // ignored in unsafe-math mode).
47808 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
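// For reference: MINSS/MINPS compute (a < b) ? a : b per element, returning the
// second operand whenever either input is NaN or both inputs are zero, which is
// why only the x<y?x:y form (after the operand swaps / CC adjustments below)
// maps onto them directly.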
47809 if ((Cond.getOpcode() == ISD::SETCC ||
47810 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47811 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47812 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47813 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47814 (Subtarget.hasSSE2() ||
47815 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47816 bool IsStrict = Cond->isStrictFPOpcode();
47817 ISD::CondCode CC =
47818 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47819 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47820 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47821
47822 unsigned Opcode = 0;
47823 // Check for x CC y ? x : y.
47824 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47825 switch (CC) {
47826 default: break;
47827 case ISD::SETULT:
47828 // Converting this to a min would handle NaNs incorrectly, and swapping
47829 // the operands would cause it to handle comparisons between positive
47830 // and negative zero incorrectly.
47831 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47832 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47833 !(DAG.isKnownNeverZeroFloat(LHS) ||
47834 DAG.isKnownNeverZeroFloat(RHS)))
47835 break;
47836 std::swap(LHS, RHS);
47837 }
47838 Opcode = X86ISD::FMIN;
47839 break;
47840 case ISD::SETOLE:
47841 // Converting this to a min would handle comparisons between positive
47842 // and negative zero incorrectly.
47843 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47844 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47845 break;
47846 Opcode = X86ISD::FMIN;
47847 break;
47848 case ISD::SETULE:
47849 // Converting this to a min would handle both negative zeros and NaNs
47850 // incorrectly, but we can swap the operands to fix both.
47851 std::swap(LHS, RHS);
47852 [[fallthrough]];
47853 case ISD::SETOLT:
47854 case ISD::SETLT:
47855 case ISD::SETLE:
47856 Opcode = X86ISD::FMIN;
47857 break;
47858
47859 case ISD::SETOGE:
47860 // Converting this to a max would handle comparisons between positive
47861 // and negative zero incorrectly.
47862 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47863 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47864 break;
47865 Opcode = X86ISD::FMAX;
47866 break;
47867 case ISD::SETUGT:
47868 // Converting this to a max would handle NaNs incorrectly, and swapping
47869 // the operands would cause it to handle comparisons between positive
47870 // and negative zero incorrectly.
47871 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47872 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47873 !(DAG.isKnownNeverZeroFloat(LHS) ||
47874 DAG.isKnownNeverZeroFloat(RHS)))
47875 break;
47876 std::swap(LHS, RHS);
47877 }
47878 Opcode = X86ISD::FMAX;
47879 break;
47880 case ISD::SETUGE:
47881 // Converting this to a max would handle both negative zeros and NaNs
47882 // incorrectly, but we can swap the operands to fix both.
47883 std::swap(LHS, RHS);
47884 [[fallthrough]];
47885 case ISD::SETOGT:
47886 case ISD::SETGT:
47887 case ISD::SETGE:
47888 Opcode = X86ISD::FMAX;
47889 break;
47890 }
47891 // Check for x CC y ? y : x -- a min/max with reversed arms.
47892 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47893 switch (CC) {
47894 default: break;
47895 case ISD::SETOGE:
47896 // Converting this to a min would handle comparisons between positive
47897 // and negative zero incorrectly, and swapping the operands would
47898 // cause it to handle NaNs incorrectly.
47899 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47900 !(DAG.isKnownNeverZeroFloat(LHS) ||
47901 DAG.isKnownNeverZeroFloat(RHS))) {
47902 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47903 break;
47904 std::swap(LHS, RHS);
47905 }
47906 Opcode = X86ISD::FMIN;
47907 break;
47908 case ISD::SETUGT:
47909 // Converting this to a min would handle NaNs incorrectly.
47910 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47911 break;
47912 Opcode = X86ISD::FMIN;
47913 break;
47914 case ISD::SETUGE:
47915 // Converting this to a min would handle both negative zeros and NaNs
47916 // incorrectly, but we can swap the operands to fix both.
47917 std::swap(LHS, RHS);
47918 [[fallthrough]];
47919 case ISD::SETOGT:
47920 case ISD::SETGT:
47921 case ISD::SETGE:
47922 Opcode = X86ISD::FMIN;
47923 break;
47924
47925 case ISD::SETULT:
47926 // Converting this to a max would handle NaNs incorrectly.
47927 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47928 break;
47929 Opcode = X86ISD::FMAX;
47930 break;
47931 case ISD::SETOLE:
47932 // Converting this to a max would handle comparisons between positive
47933 // and negative zero incorrectly, and swapping the operands would
47934 // cause it to handle NaNs incorrectly.
47935 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47936 !DAG.isKnownNeverZeroFloat(LHS) &&
47937 !DAG.isKnownNeverZeroFloat(RHS)) {
47938 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47939 break;
47940 std::swap(LHS, RHS);
47941 }
47942 Opcode = X86ISD::FMAX;
47943 break;
47944 case ISD::SETULE:
47945 // Converting this to a max would handle both negative zeros and NaNs
47946 // incorrectly, but we can swap the operands to fix both.
47947 std::swap(LHS, RHS);
47948 [[fallthrough]];
47949 case ISD::SETOLT:
47950 case ISD::SETLT:
47951 case ISD::SETLE:
47952 Opcode = X86ISD::FMAX;
47953 break;
47954 }
47955 }
47956
47957 if (Opcode) {
47958 if (IsStrict) {
47959 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47960 : X86ISD::STRICT_FMAX,
47961 DL, {N->getValueType(0), MVT::Other},
47962 {Cond.getOperand(0), LHS, RHS});
47963 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47964 return Ret;
47965 }
47966 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47967 }
47968 }
47969
47970 // Some mask scalar intrinsics rely on checking if only one bit is set
47971 // and implement it in C code like this:
47972 // A[0] = (U & 1) ? A[0] : W[0];
47973 // This creates some redundant instructions that break pattern matching.
47974 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47975 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47976 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47977 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47978 SDValue AndNode = Cond.getOperand(0);
47979 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47980 isNullConstant(Cond.getOperand(1)) &&
47981 isOneConstant(AndNode.getOperand(1))) {
47982 // LHS and RHS swapped due to
47983 // setcc outputting 1 when AND resulted in 0 and vice versa.
47984 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47985 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47986 }
47987 }
47988
47989 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47990 // lowering on KNL. In this case we convert it to
47991 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47992 // The same situation applies to all vectors of i8 and i16 without BWI.
47993 // Make sure we extend these even before type legalization gets a chance to
47994 // split wide vectors.
47995 // Since SKX these selects have a proper lowering.
47996 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47997 CondVT.getVectorElementType() == MVT::i1 &&
47998 (VT.getVectorElementType() == MVT::i8 ||
47999 VT.getVectorElementType() == MVT::i16)) {
48000 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
48001 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
48002 }
48003
48004 // AVX512 - Extend select to merge with target shuffle.
48005 // select(mask, extract_subvector(shuffle(x)), y) -->
48006 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
48007 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
48008 if (Subtarget.hasAVX512() && CondVT.isVector() &&
48009 CondVT.getVectorElementType() == MVT::i1) {
48010 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48011 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48012 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48013 isNullConstant(Op.getOperand(1)) &&
48014 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48015 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48016 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48017 ISD::isBuildVectorAllZeros(Alt.getNode()));
48018 };
48019
48020 bool SelectableLHS = SelectableOp(LHS, RHS);
48021 bool SelectableRHS = SelectableOp(RHS, LHS);
48022 if (SelectableLHS || SelectableRHS) {
48023 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48024 : RHS.getOperand(0).getValueType();
48025 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48026 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48027 VT.getSizeInBits());
48028 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48029 VT.getSizeInBits());
48030 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48031 DAG.getUNDEF(SrcCondVT), Cond,
48032 DAG.getVectorIdxConstant(0, DL));
48033 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48034 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48035 }
48036 }
48037
48038 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48039 return V;
48040
48041 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48042 Cond.hasOneUse()) {
48043 EVT CondVT = Cond.getValueType();
48044 SDValue Cond0 = Cond.getOperand(0);
48045 SDValue Cond1 = Cond.getOperand(1);
48046 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48047
48048 // Canonicalize min/max:
48049 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48050 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48051 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48052 // the need for an extra compare against zero. e.g.
48053 // ((a - b) > 0 ? (a - b) : 0) -> ((a - b) >= 0 ? (a - b) : 0)
48054 // subl %esi, %edi
48055 // testl %edi, %edi
48056 // movl $0, %eax
48057 // cmovgl %edi, %eax
48058 // =>
48059 // xorl %eax, %eax
48060 // subl %esi, $edi
48061 // cmovsl %eax, %edi
48062 //
48063 // We can also canonicalize
48064 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48065 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48066 // This allows the use of a test instruction for the compare.
48067 if (LHS == Cond0 && RHS == Cond1) {
48068 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48069 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48070 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48071 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48072 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48073 }
48074 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48075 ISD::CondCode NewCC = ISD::SETUGE;
48076 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48077 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48078 }
48079 }
48080
48081 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48082 // fold eq + gt/lt nested selects into ge/le selects
48083 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48084 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48085 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48086 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48087 // .. etc ..
48088 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48089 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48090 SDValue InnerSetCC = RHS.getOperand(0);
48091 ISD::CondCode InnerCC =
48092 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48093 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48094 Cond0 == InnerSetCC.getOperand(0) &&
48095 Cond1 == InnerSetCC.getOperand(1)) {
48096 ISD::CondCode NewCC;
48097 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48098 // clang-format off
48099 case ISD::SETGT: NewCC = ISD::SETGE; break;
48100 case ISD::SETLT: NewCC = ISD::SETLE; break;
48101 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48102 case ISD::SETULT: NewCC = ISD::SETULE; break;
48103 default: NewCC = ISD::SETCC_INVALID; break;
48104 // clang-format on
48105 }
48106 if (NewCC != ISD::SETCC_INVALID) {
48107 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48108 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48109 }
48110 }
48111 }
48112 }
48113
48114 // Check if the first operand is all zeros and Cond type is vXi1.
48115 // If this an avx512 target we can improve the use of zero masking by
48116 // swapping the operands and inverting the condition.
48117 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48118 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48119 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48120 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48121 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48122 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48123 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48124 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48125 }
48126
48127 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48128 // get split by legalization.
48129 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48130 CondVT.getVectorElementType() == MVT::i1 &&
48131 TLI.isTypeLegal(VT.getScalarType())) {
48132 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48133 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48134 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48135 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48136 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48137 }
48138 }
48139
48140 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48141 // with out-of-bounds clamping.
48142
48143 // Unlike the general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV give
48144 // defined results for shift amounts that equal or exceed the element
48145 // bitwidth: each such lane produces zero. That makes the explicit
48146 // out-of-bounds clamping select (amt < bitwidth ? shift : 0) redundant for
48147 // these unsigned shifts.
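// For example (v8i32, illustrative): (vselect (setult amt, 32), (shl x, amt), 0)
// can be emitted directly as X86ISD::VSHLV (VPSLLVD), since lanes with
// amt >= 32 already produce zero.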
48148 if (N->getOpcode() == ISD::VSELECT) {
48149 using namespace llvm::SDPatternMatch;
48150 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48151 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48152 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48153 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48154 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
48155 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48156 m_SpecificInt(VT.getScalarSizeInBits()),
48157 m_SpecificCondCode(ISD::SETULT)))) {
48158 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48159 : X86ISD::VSHLV,
48160 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48161 }
48162 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48163 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48164 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48165 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48166 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
48167 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48168 m_SpecificInt(VT.getScalarSizeInBits()),
48169 m_SpecificCondCode(ISD::SETUGE)))) {
48170 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48171 : X86ISD::VSHLV,
48172 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48173 }
48174 }
48175
48176 // Early exit check
48177 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48178 return SDValue();
48179
48180 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48181 return V;
48182
48183 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48184 return V;
48185
48186 // select(~Cond, X, Y) -> select(Cond, Y, X)
48187 if (CondVT.getScalarType() != MVT::i1) {
48188 if (SDValue CondNot = IsNOT(Cond, DAG))
48189 return DAG.getNode(N->getOpcode(), DL, VT,
48190 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48191
48192 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48193 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48194 Cond.getOperand(0).getOpcode() == ISD::AND &&
48195 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48196 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48197 Cond.getScalarValueSizeInBits(),
48198 /*AllowUndefs=*/true) &&
48199 Cond.hasOneUse()) {
48200 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48201 Cond.getOperand(0).getOperand(1));
48202 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48203 }
48204
48205 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48206 // signbit.
48207 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48208 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48209 Cond.hasOneUse()) {
48210 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48211 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48212 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48213 }
48214 }
48215
48216 // Try to optimize vXi1 selects if both operands are either all constants or
48217 // bitcasts from scalar integer type. In that case we can convert the operands
48218 // to integer and use an integer select which will be converted to a CMOV.
48219 // We need to take a little bit of care to avoid creating an i64 type after
48220 // type legalization.
48221 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48222 VT.getVectorElementType() == MVT::i1 &&
48223 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48224 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48225 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48226 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48227 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48228
48229 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48230 LHS.getOperand(0).getValueType() == IntVT)) &&
48231 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48232 RHS.getOperand(0).getValueType() == IntVT))) {
48233 if (LHSIsConst)
48234 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48235 else
48236 LHS = LHS.getOperand(0);
48237
48238 if (RHSIsConst)
48239 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48240 else
48241 RHS = RHS.getOperand(0);
48242
48243 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48244 return DAG.getBitcast(VT, Select);
48245 }
48246 }
48247 }
48248
48249 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48250 // single bits, then invert the predicate and swap the select operands.
48251 // This can lower using a vector shift bit-hack rather than mask and compare.
48252 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48253 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48254 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48255 Cond.getOperand(0).getOpcode() == ISD::AND &&
48256 isNullOrNullSplat(Cond.getOperand(1)) &&
48257 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48258 Cond.getOperand(0).getValueType() == VT) {
48259 // The 'and' mask must be composed of power-of-2 constants.
48260 SDValue And = Cond.getOperand(0);
48261 auto *C = isConstOrConstSplat(And.getOperand(1));
48262 if (C && C->getAPIntValue().isPowerOf2()) {
48263 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48264 SDValue NotCond =
48265 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48266 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48267 }
48268
48269 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48270 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48271 // 16-bit lacks a proper blendv.
48272 unsigned EltBitWidth = VT.getScalarSizeInBits();
48273 bool CanShiftBlend =
48274 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48275 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48276 (Subtarget.hasXOP()));
48277 if (CanShiftBlend &&
48278 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48279 return C->getAPIntValue().isPowerOf2();
48280 })) {
48281 // Create a left-shift constant to get the mask bits over to the sign-bit.
48282 SDValue Mask = And.getOperand(1);
48283 SmallVector<int, 32> ShlVals;
48284 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48285 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48286 ShlVals.push_back(EltBitWidth - 1 -
48287 MaskVal->getAPIntValue().exactLogBase2());
48288 }
48289 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48290 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48291 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48292 SDValue NewCond =
48293 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48294 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48295 }
48296 }
48297
48298 return SDValue();
48299}
48300
48301/// Combine:
48302/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48303/// to:
48304/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48305/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48306/// Note that this is only legal for some op/cc combinations.
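// For example (illustrative): for "if (atomic_fetch_add(&v, 1) < 0)" a naive
// lowering tests COND_S on a separate compare of the loaded value; reusing the
// EFLAGS of the LOCK ADD with COND_LE ("v + 1 <= 0", with overflow accounted
// for by the condition code) gives the same answer without a second compare.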
48307static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48308 SelectionDAG &DAG,
48309 const X86Subtarget &Subtarget) {
48310 // This combine only operates on CMP-like nodes.
48311 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48312 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48313 return SDValue();
48314
48315 // Can't replace the cmp if it has more uses than the one we're looking at.
48316 // FIXME: We would like to be able to handle this, but would need to make sure
48317 // all uses were updated.
48318 if (!Cmp.hasOneUse())
48319 return SDValue();
48320
48321 // This only applies to variations of the common case:
48322 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48323 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48324 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48325 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48326 // Using the proper condcodes (see below), overflow is checked for.
48327
48328 // FIXME: We can generalize both constraints:
48329 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48330 // - LHS != 1
48331 // if the result is compared.
48332
48333 SDValue CmpLHS = Cmp.getOperand(0);
48334 SDValue CmpRHS = Cmp.getOperand(1);
48335 EVT CmpVT = CmpLHS.getValueType();
48336
48337 if (!CmpLHS.hasOneUse())
48338 return SDValue();
48339
48340 unsigned Opc = CmpLHS.getOpcode();
48341 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48342 return SDValue();
48343
48344 SDValue OpRHS = CmpLHS.getOperand(2);
48345 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48346 if (!OpRHSC)
48347 return SDValue();
48348
48349 APInt Addend = OpRHSC->getAPIntValue();
48350 if (Opc == ISD::ATOMIC_LOAD_SUB)
48351 Addend = -Addend;
48352
48353 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48354 if (!CmpRHSC)
48355 return SDValue();
48356
48357 APInt Comparison = CmpRHSC->getAPIntValue();
48358 APInt NegAddend = -Addend;
48359
48360 // See if we can adjust the CC to make the comparison match the negated
48361 // addend.
48362 if (Comparison != NegAddend) {
48363 APInt IncComparison = Comparison + 1;
48364 if (IncComparison == NegAddend) {
48365 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48366 Comparison = IncComparison;
48367 CC = X86::COND_AE;
48368 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48369 Comparison = IncComparison;
48370 CC = X86::COND_L;
48371 }
48372 }
48373 APInt DecComparison = Comparison - 1;
48374 if (DecComparison == NegAddend) {
48375 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48376 Comparison = DecComparison;
48377 CC = X86::COND_A;
48378 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48379 Comparison = DecComparison;
48380 CC = X86::COND_LE;
48381 }
48382 }
48383 }
48384
48385 // If the addend is the negation of the comparison value, then we can do
48386 // a full comparison by emitting the atomic arithmetic as a locked sub.
48387 if (Comparison == NegAddend) {
48388 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48389 // atomic sub.
48390 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48391 auto AtomicSub = DAG.getAtomic(
48392 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48393 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48394 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48395 AN->getMemOperand());
48396 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48397 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48398 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48399 return LockOp;
48400 }
48401
48402 // We can handle comparisons with zero in a number of cases by manipulating
48403 // the CC used.
48404 if (!Comparison.isZero())
48405 return SDValue();
48406
48407 if (CC == X86::COND_S && Addend == 1)
48408 CC = X86::COND_LE;
48409 else if (CC == X86::COND_NS && Addend == 1)
48410 CC = X86::COND_G;
48411 else if (CC == X86::COND_G && Addend == -1)
48412 CC = X86::COND_GE;
48413 else if (CC == X86::COND_LE && Addend == -1)
48414 CC = X86::COND_L;
48415 else
48416 return SDValue();
48417
48418 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48419 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48420 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48421 return LockOp;
48422}
48423
48424// Check whether we're just testing the signbit, and whether we can simplify
48425// this by tracking where the signbit came from.
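// For example, (cmp (sra X, 31), 0) with COND_S only depends on the sign bit of
// X, so it can be rewritten as a test of X's MSB with COND_NE (see the mask
// construction below).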
48426static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48427 SelectionDAG &DAG) {
48428 if (CC != X86::COND_S && CC != X86::COND_NS)
48429 return SDValue();
48430
48431 if (!Cmp.hasOneUse())
48432 return SDValue();
48433
48434 SDValue Src;
48435 if (Cmp.getOpcode() == X86ISD::CMP) {
48436 // CMP(X,0) -> signbit test
48437 if (!isNullConstant(Cmp.getOperand(1)))
48438 return SDValue();
48439 Src = Cmp.getOperand(0);
48440 // Peek through a SRA node as we just need the signbit.
48441 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48442 // TODO: Use SimplifyDemandedBits instead of just SRA?
48443 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48444 return SDValue();
48445 Src = Src.getOperand(0);
48446 } else if (Cmp.getOpcode() == X86ISD::OR) {
48447 // OR(X,Y) -> see if only one operand contributes to the signbit.
48448 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48449 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48450 Src = Cmp.getOperand(1);
48451 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48452 Src = Cmp.getOperand(0);
48453 else
48454 return SDValue();
48455 } else {
48456 return SDValue();
48457 }
48458
48459 // Replace with a TEST on the MSB.
48460 SDLoc DL(Cmp);
48461 MVT SrcVT = Src.getSimpleValueType();
48462 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48463
48464 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48465 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48466 if (Src.getOpcode() == ISD::SHL) {
48467 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48468 Src = Src.getOperand(0);
48469 BitMask.lshrInPlace(*ShiftAmt);
48470 }
48471 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48472 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48473 Src = Src.getOperand(0);
48474 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48475 }
48476
48477 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48478 DAG.getConstant(BitMask, DL, SrcVT));
48479 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48480 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48481 DAG.getConstant(0, DL, SrcVT));
48482}
48483
48484// Check whether a boolean test is testing a boolean value generated by
48485// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48486// code.
48487//
48488// Simplify the following patterns:
48489// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48490// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48491// to (Op EFLAGS Cond)
48492//
48493// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48494// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48495// to (Op EFLAGS !Cond)
48496//
48497// where Op could be BRCOND or CMOV.
48498//
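// For example, with Cond == COND_E:
//   (brcond (CMP (SETCC COND_E EFLAGS) 0) NEQ)
// merely re-tests the bit that SETE already derived from EFLAGS, so it folds to
//   (brcond EFLAGS COND_E)
// (an illustrative instance of the patterns above).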
48499static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48500 // This combine only operates on CMP-like nodes.
48501 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48502 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48503 return SDValue();
48504
48505 // Quit if not used as a boolean value.
48506 if (CC != X86::COND_E && CC != X86::COND_NE)
48507 return SDValue();
48508
48509 // Check CMP operands. One of them should be 0 or 1 and the other should be
48510 // an SetCC or extended from it.
48511 SDValue Op1 = Cmp.getOperand(0);
48512 SDValue Op2 = Cmp.getOperand(1);
48513
48514 SDValue SetCC;
48515 const ConstantSDNode* C = nullptr;
48516 bool needOppositeCond = (CC == X86::COND_E);
48517 bool checkAgainstTrue = false; // Is it a comparison against 1?
48518
48519 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48520 SetCC = Op2;
48521 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48522 SetCC = Op1;
48523 else // Quit if all operands are not constants.
48524 return SDValue();
48525
48526 if (C->getZExtValue() == 1) {
48527 needOppositeCond = !needOppositeCond;
48528 checkAgainstTrue = true;
48529 } else if (C->getZExtValue() != 0)
48530 // Quit if the constant is neither 0 nor 1.
48531 return SDValue();
48532
48533 bool truncatedToBoolWithAnd = false;
48534 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48535 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48536 SetCC.getOpcode() == ISD::TRUNCATE ||
48537 SetCC.getOpcode() == ISD::AND) {
48538 if (SetCC.getOpcode() == ISD::AND) {
48539 int OpIdx = -1;
48540 if (isOneConstant(SetCC.getOperand(0)))
48541 OpIdx = 1;
48542 if (isOneConstant(SetCC.getOperand(1)))
48543 OpIdx = 0;
48544 if (OpIdx < 0)
48545 break;
48546 SetCC = SetCC.getOperand(OpIdx);
48547 truncatedToBoolWithAnd = true;
48548 } else
48549 SetCC = SetCC.getOperand(0);
48550 }
48551
48552 switch (SetCC.getOpcode()) {
48553 case X86ISD::SETCC_CARRY:
48554 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48555 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48556 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48557 // truncated to i1 using 'and'.
48558 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48559 break;
48561 "Invalid use of SETCC_CARRY!");
48562 [[fallthrough]];
48563 case X86ISD::SETCC:
48564 // Set the condition code or opposite one if necessary.
48565 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48566 if (needOppositeCond)
48567 CC = X86::GetOppositeBranchCondition(CC);
48568 return SetCC.getOperand(1);
48569 case X86ISD::CMOV: {
48570 // Check whether false/true value has canonical one, i.e. 0 or 1.
48571 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48572 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48573 // Quit if true value is not a constant.
48574 if (!TVal)
48575 return SDValue();
48576 // Quit if false value is not a constant.
48577 if (!FVal) {
48578 SDValue Op = SetCC.getOperand(0);
48579 // Skip 'zext' or 'trunc' node.
48580 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48581 Op.getOpcode() == ISD::TRUNCATE)
48582 Op = Op.getOperand(0);
48583 // A special case for rdrand/rdseed, where 0 is set if false cond is
48584 // found.
48585 if ((Op.getOpcode() != X86ISD::RDRAND &&
48586 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48587 return SDValue();
48588 }
48589 // Quit if false value is not the constant 0 or 1.
48590 bool FValIsFalse = true;
48591 if (FVal && FVal->getZExtValue() != 0) {
48592 if (FVal->getZExtValue() != 1)
48593 return SDValue();
48594 // If FVal is 1, opposite cond is needed.
48595 needOppositeCond = !needOppositeCond;
48596 FValIsFalse = false;
48597 }
48598 // Quit if TVal is not the constant opposite of FVal.
48599 if (FValIsFalse && TVal->getZExtValue() != 1)
48600 return SDValue();
48601 if (!FValIsFalse && TVal->getZExtValue() != 0)
48602 return SDValue();
48603 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48604 if (needOppositeCond)
48605 CC = X86::GetOppositeBranchCondition(CC);
48606 return SetCC.getOperand(3);
48607 }
48608 }
48609
48610 return SDValue();
48611}
48612
48613/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48614/// Match:
48615/// (X86or (X86setcc) (X86setcc))
48616/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48617static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48618 X86::CondCode &CC1, SDValue &Flags,
48619 bool &isAnd) {
48620 if (Cond->getOpcode() == X86ISD::CMP) {
48621 if (!isNullConstant(Cond->getOperand(1)))
48622 return false;
48623
48624 Cond = Cond->getOperand(0);
48625 }
48626
48627 isAnd = false;
48628
48629 SDValue SetCC0, SetCC1;
48630 switch (Cond->getOpcode()) {
48631 default: return false;
48632 case ISD::AND:
48633 case X86ISD::AND:
48634 isAnd = true;
48635 [[fallthrough]];
48636 case ISD::OR:
48637 case X86ISD::OR:
48638 SetCC0 = Cond->getOperand(0);
48639 SetCC1 = Cond->getOperand(1);
48640 break;
48641 };
48642
48643 // Make sure we have SETCC nodes, using the same flags value.
48644 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48645 SetCC1.getOpcode() != X86ISD::SETCC ||
48646 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48647 return false;
48648
48649 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48650 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48651 Flags = SetCC0->getOperand(1);
48652 return true;
48653}
48654
48655// When legalizing carry, we create carries via add X, -1
48656// If that comes from an actual carry, via setcc, we use the
48657// carry directly.
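// For example, (add (zext (setcc COND_B, flags)), -1) produces CF equal to the
// original 0/1 carry value, so a user of that CF can consume the original flags
// value directly instead (illustrative of the peek-through loop below).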
48658static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48659 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48660 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48661 bool FoundAndLSB = false;
48662 SDValue Carry = EFLAGS.getOperand(0);
48663 while (Carry.getOpcode() == ISD::TRUNCATE ||
48664 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48665 (Carry.getOpcode() == ISD::AND &&
48666 isOneConstant(Carry.getOperand(1)))) {
48667 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48668 Carry = Carry.getOperand(0);
48669 }
48670 if (Carry.getOpcode() == X86ISD::SETCC ||
48671 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48672 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48673 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48674 SDValue CarryOp1 = Carry.getOperand(1);
48675 if (CarryCC == X86::COND_B)
48676 return CarryOp1;
48677 if (CarryCC == X86::COND_A) {
48678 // Try to convert COND_A into COND_B in an attempt to facilitate
48679 // materializing "setb reg".
48680 //
48681 // Do not flip "e > c", where "c" is a constant, because Cmp
48682 // instruction cannot take an immediate as its first operand.
48683 //
48684 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48685 CarryOp1.getNode()->hasOneUse() &&
48686 CarryOp1.getValueType().isInteger() &&
48687 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48688 SDValue SubCommute =
48689 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48690 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48691 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48692 }
48693 }
48694 // If this is a check of the z flag of an add with 1, switch to the
48695 // C flag.
48696 if (CarryCC == X86::COND_E &&
48697 CarryOp1.getOpcode() == X86ISD::ADD &&
48698 isOneConstant(CarryOp1.getOperand(1)))
48699 return CarryOp1;
48700 } else if (FoundAndLSB) {
48701 SDLoc DL(Carry);
48702 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48703 if (Carry.getOpcode() == ISD::SRL) {
48704 BitNo = Carry.getOperand(1);
48705 Carry = Carry.getOperand(0);
48706 }
48707 return getBT(Carry, BitNo, DL, DAG);
48708 }
48709 }
48710 }
48711
48712 return SDValue();
48713}
48714
48715/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48716/// to avoid the inversion.
48717static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48718 SelectionDAG &DAG,
48719 const X86Subtarget &Subtarget) {
48720 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48721 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48722 EFLAGS.getOpcode() != X86ISD::TESTP)
48723 return SDValue();
48724
48725 // PTEST/TESTP sets EFLAGS as:
48726 // TESTZ: ZF = (Op0 & Op1) == 0
48727 // TESTC: CF = (~Op0 & Op1) == 0
48728 // TESTNZC: ZF == 0 && CF == 0
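// For example, with CC == COND_B (testing CF): TEST*(~X, Y) computes
// CF = (~~X & Y) == 0 = (X & Y) == 0, which is exactly the ZF that TEST*(X, Y)
// produces, so the NOT can be dropped by switching to COND_E below.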
48729 MVT VT = EFLAGS.getSimpleValueType();
48730 SDValue Op0 = EFLAGS.getOperand(0);
48731 SDValue Op1 = EFLAGS.getOperand(1);
48732 MVT OpVT = Op0.getSimpleValueType();
48733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48734
48735 // TEST*(~X,Y) == TEST*(X,Y)
48736 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48737 X86::CondCode InvCC;
48738 switch (CC) {
48739 case X86::COND_B:
48740 // testc -> testz.
48741 InvCC = X86::COND_E;
48742 break;
48743 case X86::COND_AE:
48744 // !testc -> !testz.
48745 InvCC = X86::COND_NE;
48746 break;
48747 case X86::COND_E:
48748 // testz -> testc.
48749 InvCC = X86::COND_B;
48750 break;
48751 case X86::COND_NE:
48752 // !testz -> !testc.
48753 InvCC = X86::COND_AE;
48754 break;
48755 case X86::COND_A:
48756 case X86::COND_BE:
48757 // testnzc -> testnzc (no change).
48758 InvCC = CC;
48759 break;
48760 default:
48761 InvCC = X86::COND_INVALID;
48762 break;
48763 }
48764
48765 if (InvCC != X86::COND_INVALID) {
48766 CC = InvCC;
48767 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48768 DAG.getBitcast(OpVT, NotOp0), Op1);
48769 }
48770 }
48771
48772 if (CC == X86::COND_B || CC == X86::COND_AE) {
48773 // TESTC(X,~X) == TESTC(X,-1)
48774 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48775 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48776 SDLoc DL(EFLAGS);
48777 return DAG.getNode(
48778 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48779 DAG.getBitcast(OpVT,
48780 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48781 }
48782 }
48783 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48784 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48785 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48786 SDValue BC0 = peekThroughBitcasts(Op0);
48787 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48788 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48789 SDLoc DL(EFLAGS);
48790 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48791 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48792 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48793 }
48794 }
48795 }
48796
48797 if (CC == X86::COND_E || CC == X86::COND_NE) {
48798 // TESTZ(X,~Y) == TESTC(Y,X)
48799 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48800 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48801 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48802 DAG.getBitcast(OpVT, NotOp1), Op0);
48803 }
48804
48805 if (Op0 == Op1) {
48806 SDValue BC = peekThroughBitcasts(Op0);
48807 EVT BCVT = BC.getValueType();
48808
48809 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48810 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48811 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48812 DAG.getBitcast(OpVT, BC.getOperand(0)),
48813 DAG.getBitcast(OpVT, BC.getOperand(1)));
48814 }
48815
48816 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48817 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48818 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48819 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48820 DAG.getBitcast(OpVT, BC.getOperand(0)),
48821 DAG.getBitcast(OpVT, BC.getOperand(1)));
48822 }
48823
48824 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48825 // to more efficiently extract the sign bits and compare that.
48826 // TODO: Handle TESTC with comparison inversion.
48827 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48828 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48829 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48830 unsigned EltBits = BCVT.getScalarSizeInBits();
48831 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48832 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48833 APInt SignMask = APInt::getSignMask(EltBits);
48834 if (SDValue Res =
48835 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48836 // For vXi16 cases we need to use pmovmskb and extract every other
48837 // sign bit.
48838 SDLoc DL(EFLAGS);
48839 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48840 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48841 MVT FloatVT =
48842 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48843 Res = DAG.getBitcast(FloatVT, Res);
48844 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48845 } else if (EltBits == 16) {
48846 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48847 Res = DAG.getBitcast(MovmskVT, Res);
48848 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48849 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48850 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48851 } else {
48852 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48853 }
48854 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48855 DAG.getConstant(0, DL, MVT::i32));
48856 }
48857 }
48858 }
48859 }
48860
48861 // TESTZ(-1,X) == TESTZ(X,X)
48862 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48863 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48864
48865 // TESTZ(X,-1) == TESTZ(X,X)
48866 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48867 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48868
48869 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48870 // TODO: Add COND_NE handling?
48871 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48872 SDValue Src0 = peekThroughBitcasts(Op0);
48873 SDValue Src1 = peekThroughBitcasts(Op1);
48874 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48875 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48876 peekThroughBitcasts(Src0.getOperand(1)), true);
48877 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48878 peekThroughBitcasts(Src1.getOperand(1)), true);
48879 if (Src0 && Src1) {
48880 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48881 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48882 DAG.getBitcast(OpVT2, Src0),
48883 DAG.getBitcast(OpVT2, Src1));
48884 }
48885 }
48886 }
48887 }
48888
48889 return SDValue();
48890}
48891
48892// Attempt to simplify the MOVMSK input based on the comparison type.
48893static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48894 SelectionDAG &DAG,
48895 const X86Subtarget &Subtarget) {
48896 // Handle eq/ne against zero (any_of).
48897 // Handle eq/ne against -1 (all_of).
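// For example (illustrative, v4f32 input): after "movmskps %xmm0, %eax",
// comparing %eax against 0 is an any_of test (is any sign bit set?), while
// comparing it against 0xf is an all_of test (are all four sign bits set?).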
48898 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48899 return SDValue();
48900 if (EFLAGS.getValueType() != MVT::i32)
48901 return SDValue();
48902 unsigned CmpOpcode = EFLAGS.getOpcode();
48903 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48904 return SDValue();
48905 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48906 if (!CmpConstant)
48907 return SDValue();
48908 const APInt &CmpVal = CmpConstant->getAPIntValue();
48909
48910 SDValue CmpOp = EFLAGS.getOperand(0);
48911 unsigned CmpBits = CmpOp.getValueSizeInBits();
48912 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48913
48914 // Peek through any truncate.
48915 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48916 CmpOp = CmpOp.getOperand(0);
48917
48918 // Bail if we don't find a MOVMSK.
48919 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48920 return SDValue();
48921
48922 SDValue Vec = CmpOp.getOperand(0);
48923 MVT VecVT = Vec.getSimpleValueType();
48924 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48925 "Unexpected MOVMSK operand");
48926 unsigned NumElts = VecVT.getVectorNumElements();
48927 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48928
48929 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48930 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48931 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48932 if (!IsAnyOf && !IsAllOf)
48933 return SDValue();
48934
48935 // TODO: Check for more cases where this combine can be applied.
48936 // We check the use count of the MOVMSK operand to decide whether to combine.
48937 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
48938 // below are restricted to the single-use case.
48939 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48940
48941 // See if we can peek through to a vector with a wider element type, if the
48942 // signbits extend down to all the sub-elements as well.
48943 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48944 // potential SimplifyDemandedBits/Elts cases.
48945 // If we looked through a truncate that discard bits, we can't do this
48946 // transform.
48947 // FIXME: We could do this transform for truncates that discarded bits by
48948 // inserting an AND mask between the new MOVMSK and the CMP.
48949 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48950 SDValue BC = peekThroughBitcasts(Vec);
48951 MVT BCVT = BC.getSimpleValueType();
48952 unsigned BCNumElts = BCVT.getVectorNumElements();
48953 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48954 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48955 BCNumEltBits > NumEltBits &&
48956 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48957 SDLoc DL(EFLAGS);
48958 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48959 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48960 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48961 DAG.getConstant(CmpMask, DL, MVT::i32));
48962 }
48963 }
48964
48965 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48966 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48967 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48968 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48969 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48970 SmallVector<SDValue> Ops;
48971 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48972 Ops.size() == 2) {
48973 SDLoc DL(EFLAGS);
48974 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48975 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48976 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48977 DAG.getBitcast(SubVT, Ops[0]),
48978 DAG.getBitcast(SubVT, Ops[1]));
48979 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48980 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48981 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48982 DAG.getConstant(CmpMask, DL, MVT::i32));
48983 }
48984 }
48985
48986 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48987 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48988 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48989 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48990 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48991 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48992 SDValue BC = peekThroughBitcasts(Vec);
48993 // Ensure MOVMSK was testing every signbit of BC.
48994 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48995 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48996 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48997 BC.getOperand(0), BC.getOperand(1));
48998 V = DAG.getBitcast(TestVT, V);
48999 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49000 }
49001 // Check for 256-bit split vector cases.
49002 if (BC.getOpcode() == ISD::AND &&
49003 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
49004 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
49005 SDValue LHS = BC.getOperand(0);
49006 SDValue RHS = BC.getOperand(1);
49007 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
49008 LHS.getOperand(0), LHS.getOperand(1));
49009 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
49010 RHS.getOperand(0), RHS.getOperand(1));
49011 LHS = DAG.getBitcast(TestVT, LHS);
49012 RHS = DAG.getBitcast(TestVT, RHS);
49013 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49014 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49015 }
49016 }
49017 }
49018
49019 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49020 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49021 // sign bits prior to the comparison with zero unless we know that
49022 // the vXi16 splats the sign bit down to the lower i8 half.
49023 // TODO: Handle all_of patterns.
49024 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49025 SDValue VecOp0 = Vec.getOperand(0);
49026 SDValue VecOp1 = Vec.getOperand(1);
49027 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49028 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49029 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49030 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49031 SDLoc DL(EFLAGS);
49032 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49033 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49034 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49035 if (!SignExt0) {
49036 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49037 DAG.getConstant(0xAAAA, DL, MVT::i16));
49038 }
49039 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49040 DAG.getConstant(0, DL, MVT::i16));
49041 }
49042 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49043 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49044 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49045 (IsAnyOf || (SignExt0 && SignExt1))) {
49046 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49047 SDLoc DL(EFLAGS);
49048 SDValue Result = peekThroughBitcasts(Src);
49049 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49050 Result.getValueType().getVectorNumElements() <= NumElts) {
49051 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49052 Result.getOperand(0), Result.getOperand(1));
49053 V = DAG.getBitcast(MVT::v4i64, V);
49054 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49055 }
49056 Result = DAG.getBitcast(MVT::v32i8, Result);
49057 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49058 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49059 if (!SignExt0 || !SignExt1) {
49060 assert(IsAnyOf &&
49061 "Only perform v16i16 signmasks for any_of patterns");
49062 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49063 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49064 }
49065 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49066 DAG.getConstant(CmpMask, DL, MVT::i32));
49067 }
49068 }
49069 }
49070
49071 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49072 // Since we peek through a bitcast, we need to be careful if the base vector
49073 // type has smaller elements than the MOVMSK type. In that case, even if
49074 // all the elements are demanded by the shuffle mask, only the "high"
49075 // elements which have highbits that align with highbits in the MOVMSK vec
49076 // elements are actually demanded. Simplification of spurious operations
49077 // on the "low" elements takes place during other simplifications.
49078 //
49079 // For example:
49080 // In MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) all the elements are demanded,
49081 // yet the result can still change because the shuffle swaps elements around.
49082 //
49083 // To address this, we check that we can scale the shuffle mask to MOVMSK
49084 // element width (this will ensure "high" elements match). It's slightly overly
49085 // conservative, but fine for an edge case fold.
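// For example (illustrative): for a MOVMSK of v2i64 fed by a v4i32 shuffle,
// mask (2,3,0,1) scales to the v2i64 mask (1,0) and is accepted, whereas
// (1,0,3,2) cannot be scaled to v2i64 elements and the fold is skipped.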
49086 SmallVector<int, 32> ShuffleMask;
49087 SmallVector<SDValue, 2> ShuffleInputs;
49088 if (NumElts <= CmpBits &&
49089 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49090 ShuffleMask, DAG) &&
49091 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49092 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49093 canScaleShuffleElements(ShuffleMask, NumElts)) {
49094 SDLoc DL(EFLAGS);
49095 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49096 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49097 Result =
49098 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49099 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49100 }
49101
49102 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49103 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49104 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49105 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49106 // iff every element is referenced.
49107 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49108 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49109 (NumEltBits == 32 || NumEltBits == 64)) {
49110 SDLoc DL(EFLAGS);
49111 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49112 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49113 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49114 SDValue LHS = Vec;
49115 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49116 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49117 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49118 DAG.getBitcast(FloatVT, LHS),
49119 DAG.getBitcast(FloatVT, RHS));
49120 }
49121
49122 return SDValue();
49123}
49124
49125/// Optimize an EFLAGS definition used according to the condition code \p CC
49126/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49127/// uses of chain values.
49128static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49129 SelectionDAG &DAG,
49130 const X86Subtarget &Subtarget) {
49131 if (CC == X86::COND_B)
49132 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49133 return Flags;
49134
49135 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49136 return R;
49137
49138 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49139 return R;
49140
49141 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49142 return R;
49143
49144 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49145 return R;
49146
49147 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49148}
49149
49150/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49151static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49152 TargetLowering::DAGCombinerInfo &DCI,
49153 const X86Subtarget &Subtarget) {
49154 SDLoc DL(N);
49155 EVT VT = N->getValueType(0);
49156 SDValue FalseOp = N->getOperand(0);
49157 SDValue TrueOp = N->getOperand(1);
49158 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49159 SDValue Cond = N->getOperand(3);
49160
49161 // cmov X, X, ?, ? --> X
49162 if (TrueOp == FalseOp)
49163 return TrueOp;
49164
49165 // Try to simplify the EFLAGS and condition code operands.
49166 // We can't always do this as FCMOV only supports a subset of X86 cond.
49167 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49168 if (!(FalseOp.getValueType() == MVT::f80 ||
49169 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49170 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49171 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49172 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49173 Flags};
49174 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49175 }
49176 }
49177
49178 // If this is a select between two integer constants, try to do some
49179 // optimizations. Note that the operands are ordered the opposite of SELECT
49180 // operands.
49181 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49182 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49183 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49184 // larger than FalseC (the false value).
49185 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49186 CC = X86::GetOppositeBranchCondition(CC);
49187 std::swap(TrueC, FalseC);
49188 std::swap(TrueOp, FalseOp);
49189 }
49190
49191 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49192 // This is efficient for any integer data type (including i8/i16) and
49193 // shift amount.
49194 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49195 Cond = getSETCC(CC, Cond, DL, DAG);
49196
49197 // Zero extend the condition if needed.
49198 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49199
49200 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49201 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49202 DAG.getConstant(ShAmt, DL, MVT::i8));
49203 return Cond;
49204 }
49205
49206 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49207 // for any integer data type, including i8/i16.
49208 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49209 Cond = getSETCC(CC, Cond, DL, DAG);
49210
49211 // Zero extend the condition if needed.
49212 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49213 FalseC->getValueType(0), Cond);
49214 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49215 SDValue(FalseC, 0));
49216 return Cond;
49217 }
49218
49219 // Optimize cases that will turn into an LEA instruction. This requires
49220 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49221 if (VT == MVT::i32 || VT == MVT::i64) {
49222 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49223 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49224 "Implicit constant truncation");
49225
49226 bool isFastMultiplier = false;
49227 if (Diff.ult(10)) {
49228 switch (Diff.getZExtValue()) {
49229 default: break;
49230 case 1: // result = add base, cond
49231 case 2: // result = lea base( , cond*2)
49232 case 3: // result = lea base(cond, cond*2)
49233 case 4: // result = lea base( , cond*4)
49234 case 5: // result = lea base(cond, cond*4)
49235 case 8: // result = lea base( , cond*8)
49236 case 9: // result = lea base(cond, cond*8)
49237 isFastMultiplier = true;
49238 break;
49239 }
49240 }
49241
49242 if (isFastMultiplier) {
49243 Cond = getSETCC(CC, Cond, DL, DAG);
49244 // Zero extend the condition if needed.
49245 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49246 Cond);
49247 // Scale the condition by the difference.
49248 if (Diff != 1)
49249 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49250 DAG.getConstant(Diff, DL, Cond.getValueType()));
49251
49252 // Add the base if non-zero.
49253 if (FalseC->getAPIntValue() != 0)
49254 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49255 SDValue(FalseC, 0));
49256 return Cond;
49257 }
49258 }
49259 }
49260 }
49261
49262 // Handle these cases:
49263 // (select (x != c), e, c) -> (select (x != c), e, x),
49264 // (select (x == c), c, e) -> (select (x == c), x, e)
49265 // where the c is an integer constant, and the "select" is the combination
49266 // of CMOV and CMP.
49267 //
49268 // The rationale for this change is that the conditional-move from a constant
49269 // needs two instructions, however, conditional-move from a register needs
49270 // only one instruction.
49271 //
49272 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49273 // some instruction-combining opportunities. This opt needs to be
49274 // postponed as late as possible.
49275 //
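  // For example, (cmov e, 42, COND_E, CMP(x, 42)) can become
  // (cmov e, x, COND_E, CMP(x, 42)): whenever the constant arm is selected we
  // already know x == 42, and 'x' is in a register while '42' would need an
  // extra materializing instruction.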
49276 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49277 // the DCI.xxxx conditions are provided to postpone the optimization as
49278 // late as possible.
49279
49280 ConstantSDNode *CmpAgainst = nullptr;
49281 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49282 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49283 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49284
49285 if (CC == X86::COND_NE &&
49286 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49287 CC = X86::COND_E;
49288 std::swap(TrueOp, FalseOp);
49289 }
49290
49291 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49292 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49293 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49294 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49295 }
49296 }
49297 }
49298
49299 // Transform:
49300 //
49301 // (cmov 1 T (uge T 2))
49302 //
49303 // to:
49304 //
49305 // (adc T 0 (sub T 1))
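  // e.g. T=0: (0 uge 2) is false so the cmov gives 1; (sub 0, 1) sets CF and
  //      (adc 0, 0, CF) also gives 1.
  //      T=5: (5 uge 2) is true so the cmov gives 5; (sub 5, 1) clears CF and
  //      (adc 5, 0, CF) also gives 5.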
49306 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49307 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49308 SDValue Cond0 = Cond.getOperand(0);
49309 if (Cond0.getOpcode() == ISD::TRUNCATE)
49310 Cond0 = Cond0.getOperand(0);
49311 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49312 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49313 EVT CondVT = Cond->getValueType(0);
49314 // Subtract 1 and generate a carry.
49315 SDValue NewSub =
49316 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49317 DAG.getConstant(1, DL, CondVT));
49318 SDValue EFLAGS(NewSub.getNode(), 1);
49319 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49320 DAG.getConstant(0, DL, VT), EFLAGS);
49321 }
49322 }
49323
49324 // Fold and/or of setcc's to double CMOV:
49325 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49326 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49327 //
49328 // This combine lets us generate:
49329 // cmovcc1 (jcc1 if we don't have CMOV)
49330 // cmovcc2 (same)
49331 // instead of:
49332 // setcc1
49333 // setcc2
49334 // and/or
49335 // cmovne (jne if we don't have CMOV)
49336 // When we can't use the CMOV instruction, it might increase branch
49337 // mispredicts.
49338 // When we can use CMOV, or when there is no mispredict, this improves
49339 // throughput and reduces register pressure.
49340 //
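  // For the AND case we swap the arms and invert both conditions (De Morgan):
  // selecting F whenever either inverted condition holds is the same as
  // selecting T only when both original conditions hold.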
49341 if (CC == X86::COND_NE) {
49342 SDValue Flags;
49343 X86::CondCode CC0, CC1;
49344 bool isAndSetCC;
49345 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49346 if (isAndSetCC) {
49347 std::swap(FalseOp, TrueOp);
49348 CC0 = X86::GetOppositeBranchCondition(CC0);
49349 CC1 = X86::GetOppositeBranchCondition(CC1);
49350 }
49351
49352 SDValue LOps[] = {FalseOp, TrueOp,
49353 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49354 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49355 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49356 Flags};
49357 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49358 return CMOV;
49359 }
49360 }
49361
49362 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49363 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49364 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49365 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49366 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49367 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
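  // The C1-C2 rebasing works because when X == 0 the new cmov selects C1-C2
  // and the trailing ADD restores (C1-C2)+C2 == C1, while for X != 0 it
  // produces cttz(X)+C2 exactly as before.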
49368 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49369 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49370 SDValue Add = TrueOp;
49371 SDValue Const = FalseOp;
49372 // Canonicalize the condition code for easier matching and output.
49373 if (CC == X86::COND_E)
49374 std::swap(Add, Const);
49375
49376 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49377 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49378 Add.getResNo() == 0 && Add.hasOneUse() &&
49379 Add.getOperand(1) == Cond.getOperand(0)) {
49380 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49381 Add.getOperand(1));
49382 }
49383
49384 // We might have replaced the constant in the cmov with the LHS of the
49385 // compare. If so change it to the RHS of the compare.
49386 if (Const == Cond.getOperand(0))
49387 Const = Cond.getOperand(1);
49388
49389 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49390 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49391 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49392 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49393 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49394 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49395 // This should constant fold.
49396 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49397 SDValue CMov =
49398 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49399 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49400 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49401 }
49402 }
49403
49404 return SDValue();
49405}
49406
49407/// Different mul shrinking modes.
49408 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49409
49410 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49411 EVT VT = N->getOperand(0).getValueType();
49412 if (VT.getScalarSizeInBits() != 32)
49413 return false;
49414
49415 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49416 unsigned SignBits[2] = {1, 1};
49417 bool IsPositive[2] = {false, false};
49418 for (unsigned i = 0; i < 2; i++) {
49419 SDValue Opd = N->getOperand(i);
49420
49421 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49422 IsPositive[i] = DAG.SignBitIsZero(Opd);
49423 }
49424
49425 bool AllPositive = IsPositive[0] && IsPositive[1];
49426 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49427 // When ranges are from -128 ~ 127, use MULS8 mode.
49428 if (MinSignBits >= 25)
49429 Mode = ShrinkMode::MULS8;
49430 // When ranges are from 0 ~ 255, use MULU8 mode.
49431 else if (AllPositive && MinSignBits >= 24)
49432 Mode = ShrinkMode::MULU8;
49433 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49434 else if (MinSignBits >= 17)
49435 Mode = ShrinkMode::MULS16;
49436 // When ranges are from 0 ~ 65535, use MULU16 mode.
49437 else if (AllPositive && MinSignBits >= 16)
49438 Mode = ShrinkMode::MULU16;
49439 else
49440 return false;
49441 return true;
49442}
49443
49444/// When the operands of vector mul are extended from smaller size values,
49445 /// like i8 and i16, the type of the mul may be shrunk to generate more
49446/// efficient code. Two typical patterns are handled:
49447/// Pattern1:
49448/// %2 = sext/zext <N x i8> %1 to <N x i32>
49449/// %4 = sext/zext <N x i8> %3 to <N x i32>
49450 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49451/// %5 = mul <N x i32> %2, %4
49452///
49453/// Pattern2:
49454/// %2 = zext/sext <N x i16> %1 to <N x i32>
49455/// %4 = zext/sext <N x i16> %3 to <N x i32>
49456/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49457/// %5 = mul <N x i32> %2, %4
49458///
49459/// There are four mul shrinking modes:
49460/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49461 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49462/// generate pmullw+sext32 for it (MULS8 mode).
49463/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49464/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49465/// generate pmullw+zext32 for it (MULU8 mode).
49466/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49467/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49468/// generate pmullw+pmulhw for it (MULS16 mode).
49469/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49470/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49471/// generate pmullw+pmulhuw for it (MULU16 mode).
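/// For example (MULU8 mode): if %1 and %3 are <8 x i8> values,
///   %2 = zext <8 x i8> %1 to <8 x i32>
///   %4 = zext <8 x i8> %3 to <8 x i32>
///   %5 = mul <8 x i32> %2, %4
/// every product fits in 16 bits, so %5 can be computed as a 16-bit pmullw
/// followed by a zero extension back to <8 x i32>.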
49472 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49473 const X86Subtarget &Subtarget) {
49474 // Check for legality
49475 // pmullw/pmulhw are not supported by SSE.
49476 if (!Subtarget.hasSSE2())
49477 return SDValue();
49478
49479 // Check for profitability
49480 // pmulld is supported since SSE41. It is better to use pmulld
49481 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49482 // the expansion.
49483 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49484 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49485 return SDValue();
49486
49487 ShrinkMode Mode;
49488 if (!canReduceVMulWidth(N, DAG, Mode))
49489 return SDValue();
49490
49491 SDValue N0 = N->getOperand(0);
49492 SDValue N1 = N->getOperand(1);
49493 EVT VT = N->getOperand(0).getValueType();
49494 unsigned NumElts = VT.getVectorNumElements();
49495 if ((NumElts % 2) != 0)
49496 return SDValue();
49497
49498 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49499
49500 // Shrink the operands of mul.
49501 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49502 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49503
49504 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49505 // lower part is needed.
49506 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49507 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49508 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49509 : ISD::SIGN_EXTEND,
49510 DL, VT, MulLo);
49511
49512 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49513 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49514 // the higher part is also needed.
49515 SDValue MulHi =
49516 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49517 ReducedVT, NewN0, NewN1);
49518
49519 // Repack the lower part and higher part result of mul into a wider
49520 // result.
49521 // Generate shuffle functioning as punpcklwd.
49522 SmallVector<int, 16> ShuffleMask(NumElts);
49523 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49524 ShuffleMask[2 * i] = i;
49525 ShuffleMask[2 * i + 1] = i + NumElts;
49526 }
49527 SDValue ResLo =
49528 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49529 ResLo = DAG.getBitcast(ResVT, ResLo);
49530 // Generate shuffle functioning as punpckhwd.
49531 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49532 ShuffleMask[2 * i] = i + NumElts / 2;
49533 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49534 }
49535 SDValue ResHi =
49536 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49537 ResHi = DAG.getBitcast(ResVT, ResHi);
49538 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49539}
49540
49541 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49542 EVT VT, const SDLoc &DL) {
49543
49544 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49545 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49546 DAG.getConstant(Mult, DL, VT));
49547 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49548 DAG.getConstant(Shift, DL, MVT::i8));
49549 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49550 N->getOperand(0));
49551 return Result;
49552 };
49553
49554 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49555 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49556 DAG.getConstant(Mul1, DL, VT));
49557 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49558 DAG.getConstant(Mul2, DL, VT));
49559 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49560 N->getOperand(0));
49561 return Result;
49562 };
49563
49564 switch (MulAmt) {
49565 default:
49566 break;
49567 case 11:
49568 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49569 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49570 case 21:
49571 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49572 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49573 case 41:
49574 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49575 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49576 case 22:
49577 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49578 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49579 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49580 case 19:
49581 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49582 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49583 case 37:
49584 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49585 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49586 case 73:
49587 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49588 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49589 case 13:
49590 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49591 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49592 case 23:
49593 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49594 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49595 case 26:
49596 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49597 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49598 case 28:
49599 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49600 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49601 case 29:
49602 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49603 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49604 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49605 }
49606
49607 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
49608 // by a single LEA.
49609 // First check if this is a sum of two powers of 2 because that's easy. Then
49610 // count the trailing zeros to find the smaller power of 2.
49611 // TODO: We can do this even without LEA at a cost of two shifts and an add.
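  // e.g. mul x, 34: 34 = 32 + 2, so emit (x << 5) + (x << 1); the add of a
  // 2-scaled index can typically be selected as a single LEA.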
49612 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49613 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49614 if (ScaleShift >= 1 && ScaleShift < 4) {
49615 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49616 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49617 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49618 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49619 DAG.getConstant(ScaleShift, DL, MVT::i8));
49620 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49621 }
49622 }
49623
49624 return SDValue();
49625}
49626
49627 // If the upper 17 bits of either operand's elements are zero and the other
49628 // operand's elements are all sign-extended from i16, then we can use PMADDWD,
49629 // which is always at least as quick as PMULLD, except on KNL.
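// PMADDWD multiplies adjacent pairs of signed 16-bit elements and adds each
// pair into a 32-bit lane; if the high 16 bits of every lane of one operand
// are zero, the second product of each pair is zero and the node reduces to a
// 16x16->32 multiply of the low halves, which matches the original i32
// multiply under the sign-bit constraints checked above.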
49630 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49631 SelectionDAG &DAG,
49632 const X86Subtarget &Subtarget) {
49633 if (!Subtarget.hasSSE2())
49634 return SDValue();
49635
49636 if (Subtarget.isPMADDWDSlow())
49637 return SDValue();
49638
49639 EVT VT = N->getValueType(0);
49640
49641 // Only support vXi32 vectors.
49642 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49643 return SDValue();
49644
49645 // Make sure the type is legal or can split/widen to a legal type.
49646 // With AVX512 but without BWI, we would need to split v32i16.
49647 unsigned NumElts = VT.getVectorNumElements();
49648 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49649 return SDValue();
49650
49651 // With AVX512 but without BWI, we would need to split v32i16.
49652 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49653 return SDValue();
49654
49655 SDValue N0 = N->getOperand(0);
49656 SDValue N1 = N->getOperand(1);
49657
49658 // If we are zero/sign extending two steps without SSE4.1, it's better to
49659 // reduce the vmul width instead.
49660 if (!Subtarget.hasSSE41() &&
49661 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49662 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49663 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49664 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49665 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49666 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49667 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49668 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49669 return SDValue();
49670
49671 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49672 // the vmul width instead.
49673 if (!Subtarget.hasSSE41() &&
49674 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49675 N0.getOperand(0).getValueSizeInBits() > 128) &&
49676 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49677 N1.getOperand(0).getValueSizeInBits() > 128))
49678 return SDValue();
49679
49680 // Sign bits must extend down to the lowest i16.
49681 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49682 DAG.ComputeMaxSignificantBits(N0) > 16)
49683 return SDValue();
49684
49685 // At least one of the elements must be zero in the upper 17 bits, or can be
49686 // safely made zero without altering the final result.
49687 auto GetZeroableOp = [&](SDValue Op) {
49688 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49689 if (DAG.MaskedValueIsZero(Op, Mask17))
49690 return Op;
49691 // Mask off upper 16-bits of sign-extended constants.
49692 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49693 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49694 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49695 SDValue Src = Op.getOperand(0);
49696 // Convert sext(vXi16) to zext(vXi16).
49697 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49698 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49699 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49700 // which will expand the extension.
49701 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49702 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49703 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49704 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49705 }
49706 }
49707 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49708 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49709 N->isOnlyUserOf(Op.getNode())) {
49710 SDValue Src = Op.getOperand(0);
49711 if (Src.getScalarValueSizeInBits() == 16)
49712 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49713 }
49714 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49715 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49716 N->isOnlyUserOf(Op.getNode())) {
49717 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49718 Op.getOperand(1));
49719 }
49720 return SDValue();
49721 };
49722 SDValue ZeroN0 = GetZeroableOp(N0);
49723 SDValue ZeroN1 = GetZeroableOp(N1);
49724 if (!ZeroN0 && !ZeroN1)
49725 return SDValue();
49726 N0 = ZeroN0 ? ZeroN0 : N0;
49727 N1 = ZeroN1 ? ZeroN1 : N1;
49728
49729 // Use SplitOpsAndApply to handle AVX splitting.
49730 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49731 ArrayRef<SDValue> Ops) {
49732 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49733 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49734 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49735 DAG.getBitcast(OpVT, Ops[0]),
49736 DAG.getBitcast(OpVT, Ops[1]));
49737 };
49738 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49739}
49740
49741 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49742 const X86Subtarget &Subtarget) {
49743 if (!Subtarget.hasSSE2())
49744 return SDValue();
49745
49746 EVT VT = N->getValueType(0);
49747
49748 // Only support vXi64 vectors.
49749 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49750 VT.getVectorNumElements() < 2 ||
49751 !isPowerOf2_32(VT.getVectorNumElements()))
49752 return SDValue();
49753
49754 SDValue N0 = N->getOperand(0);
49755 SDValue N1 = N->getOperand(1);
49756
49757 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49758 // 32-bits. We can lower with this if the sign bits stretch that far.
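  // With more than 32 sign bits, each i64 operand equals the sign extension of
  // its low 32 bits, so the full 64-bit product is exactly what PMULDQ computes
  // from the low halves.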
49759 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49760 DAG.ComputeNumSignBits(N1) > 32) {
49761 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49762 ArrayRef<SDValue> Ops) {
49763 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49764 };
49765 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49766 /*CheckBWI*/ false);
49767 }
49768
49769 // If the upper bits are zero we can use a single pmuludq.
49770 APInt Mask = APInt::getHighBitsSet(64, 32);
49771 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49772 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49773 ArrayRef<SDValue> Ops) {
49774 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49775 };
49776 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49777 /*CheckBWI*/ false);
49778 }
49779
49780 return SDValue();
49781}
49782
49783 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49784 TargetLowering::DAGCombinerInfo &DCI,
49785 const X86Subtarget &Subtarget) {
49786 EVT VT = N->getValueType(0);
49787 SDLoc DL(N);
49788
49789 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49790 return V;
49791
49792 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49793 return V;
49794
49795 if (DCI.isBeforeLegalize() && VT.isVector())
49796 return reduceVMULWidth(N, DL, DAG, Subtarget);
49797
49798 if (VT != MVT::i64 && VT != MVT::i32 &&
49799 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49800 return SDValue();
49801
49802 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49803 if (!Known1.isConstant())
49804 return SDValue();
49805
49806 const APInt &C = Known1.getConstant();
49807 if (C.isZero())
49808 return DAG.getConstant(0, DL, VT);
49809
49810 if (C.isAllOnes())
49811 return DAG.getNegative(N->getOperand(0), DL, VT);
49812
49813 if (isPowerOf2_64(C.getZExtValue()))
49814 return SDValue();
49815
49816 // Optimize a single multiply with constant into two operations in order to
49817 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49818 if (!MulConstantOptimization)
49819 return SDValue();
49820
49821 // An imul is usually smaller than the alternative sequence.
49822 if (DAG.getMachineFunction().getFunction().hasMinSize())
49823 return SDValue();
49824
49825 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49826 return SDValue();
49827
49828 int64_t SignMulAmt = C.getSExtValue();
49829 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49830 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49831
49832 SDValue NewMul = SDValue();
49833 if (VT == MVT::i64 || VT == MVT::i32) {
49834 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49835 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49836 DAG.getConstant(AbsMulAmt, DL, VT));
49837 if (SignMulAmt < 0)
49838 NewMul = DAG.getNegative(NewMul, DL, VT);
49839
49840 return NewMul;
49841 }
49842
49843 uint64_t MulAmt1 = 0;
49844 uint64_t MulAmt2 = 0;
49845 if ((AbsMulAmt % 9) == 0) {
49846 MulAmt1 = 9;
49847 MulAmt2 = AbsMulAmt / 9;
49848 } else if ((AbsMulAmt % 5) == 0) {
49849 MulAmt1 = 5;
49850 MulAmt2 = AbsMulAmt / 5;
49851 } else if ((AbsMulAmt % 3) == 0) {
49852 MulAmt1 = 3;
49853 MulAmt2 = AbsMulAmt / 3;
49854 }
49855
49856 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49857 if (MulAmt2 &&
49858 (isPowerOf2_64(MulAmt2) ||
49859 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49860
49861 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49862 N->user_begin()->getOpcode() == ISD::ADD))
49863 // If the second multiplier is pow2, issue it first. We want the multiply
49864 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49865 // use is an add. Only do this for positive multiply amounts since the
49866 // negate would prevent it from being used as an address mode anyway.
49867 std::swap(MulAmt1, MulAmt2);
49868
49869 if (isPowerOf2_64(MulAmt1))
49870 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49871 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49872 else
49873 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49874 DAG.getConstant(MulAmt1, DL, VT));
49875
49876 if (isPowerOf2_64(MulAmt2))
49877 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49878 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49879 else
49880 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49881 DAG.getConstant(MulAmt2, DL, VT));
49882
49883 // Negate the result.
49884 if (SignMulAmt < 0)
49885 NewMul = DAG.getNegative(NewMul, DL, VT);
49886 } else if (!Subtarget.slowLEA())
49887 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49888 }
49889 if (!NewMul) {
49890 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49891 if (isPowerOf2_64(AbsMulAmt - 1)) {
49892 // (mul x, 2^N + 1) => (add (shl x, N), x)
49893 NewMul = DAG.getNode(
49894 ISD::ADD, DL, VT, N->getOperand(0),
49895 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49896 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49897 if (SignMulAmt < 0)
49898 NewMul = DAG.getNegative(NewMul, DL, VT);
49899 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49900 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49901 NewMul =
49902 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49903 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49904 // To negate, reverse the operands of the subtract.
49905 if (SignMulAmt < 0)
49906 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49907 else
49908 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49909 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49910 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49911 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49912 NewMul =
49913 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49914 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49915 NewMul = DAG.getNode(
49916 ISD::ADD, DL, VT, NewMul,
49917 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49918 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49919 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49920 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49921 NewMul =
49922 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49923 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49924 NewMul = DAG.getNode(
49925 ISD::SUB, DL, VT, NewMul,
49926 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49927 } else if (SignMulAmt >= 0 && VT.isVector() &&
49928 Subtarget.fastImmVectorShift()) {
49929 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49930 uint64_t ShiftAmt1;
49931 std::optional<unsigned> Opc;
49932 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49933 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49934 Opc = ISD::ADD;
49935 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49936 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49937 Opc = ISD::SUB;
49938 }
49939
49940 if (Opc) {
49941 SDValue Shift1 =
49942 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49943 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49944 SDValue Shift2 =
49945 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49946 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49947 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49948 }
49949 }
49950 }
49951
49952 return NewMul;
49953}
49954
49955// Try to form a MULHU or MULHS node by looking for
49956// (srl (mul ext, ext), 16)
49957// TODO: This is X86 specific because we want to be able to handle wide types
49958// before type legalization. But we can only do it if the vector will be
49959// legalized via widening/splitting. Type legalization can't handle promotion
49960// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49961// combiner.
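// e.g. (srl (mul (zext vXi16 a), (zext vXi16 b)), 16): the full 32-bit product
// of two 16-bit values has its high half equal to mulhu(a, b), so the whole
// expression is just (zext (mulhu a, b)).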
49962 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49963 const SDLoc &DL,
49964 const X86Subtarget &Subtarget) {
49965 using namespace SDPatternMatch;
49966 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49967 "SRL or SRA node is required here!");
49968
49969 if (!Subtarget.hasSSE2())
49970 return SDValue();
49971
49972 // Input type should be at least vXi32.
49973 EVT VT = N->getValueType(0);
49974 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49975 return SDValue();
49976
49977 // The operation must be a multiply shifted right by 16.
49978 SDValue LHS, RHS;
49979 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49980 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49981 return SDValue();
49982
49983 unsigned ExtOpc = LHS.getOpcode();
49984 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49985 RHS.getOpcode() != ExtOpc)
49986 return SDValue();
49987
49988 // Peek through the extends.
49989 LHS = LHS.getOperand(0);
49990 RHS = RHS.getOperand(0);
49991
49992 // Ensure the input types match.
49993 EVT MulVT = LHS.getValueType();
49994 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49995 return SDValue();
49996
49997 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49998 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49999
50000 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50001 return DAG.getNode(ExtOpc, DL, VT, Mulh);
50002}
50003
50004 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
50005 const X86Subtarget &Subtarget) {
50006 using namespace llvm::SDPatternMatch;
50007 SDValue N0 = N->getOperand(0);
50008 SDValue N1 = N->getOperand(1);
50009 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
50010 EVT VT = N0.getValueType();
50011 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50012 SDLoc DL(N);
50013
50014 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50015 // with out-of-bounds clamping.
50016 if (N0.getOpcode() == ISD::VSELECT &&
50017 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50018 SDValue Cond = N0.getOperand(0);
50019 SDValue N00 = N0.getOperand(1);
50020 SDValue N01 = N0.getOperand(2);
50021 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50022 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50023 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50024 m_SpecificCondCode(ISD::SETULT)))) {
50025 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50026 }
50027 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50028 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50029 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50030 m_SpecificCondCode(ISD::SETUGE)))) {
50031 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50032 }
50033 }
50034
50035 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50036 // since the result of setcc_c is all zero's or all ones.
50037 if (VT.isInteger() && !VT.isVector() &&
50038 N1C && N0.getOpcode() == ISD::AND &&
50039 N0.getOperand(1).getOpcode() == ISD::Constant) {
50040 SDValue N00 = N0.getOperand(0);
50041 APInt Mask = N0.getConstantOperandAPInt(1);
50042 Mask <<= N1C->getAPIntValue();
50043 bool MaskOK = false;
50044 // We can handle cases concerning bit-widening nodes containing setcc_c if
50045 // we carefully interrogate the mask to make sure the transform is
50046 // semantics-preserving.
50047 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50048 // of the underlying setcc_c operation if the setcc_c was zero extended.
50049 // Consider the following example:
50050 // zext(setcc_c) -> i32 0x0000FFFF
50051 // c1 -> i32 0x0000FFFF
50052 // c2 -> i32 0x00000001
50053 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50054 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50055 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50056 MaskOK = true;
50057 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50058 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50059 MaskOK = true;
50060 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50061 N00.getOpcode() == ISD::ANY_EXTEND) &&
50062 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50063 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50064 }
50065 if (MaskOK && Mask != 0)
50066 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50067 }
50068
50069 return SDValue();
50070}
50071
50072 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50073 const X86Subtarget &Subtarget) {
50074 using namespace llvm::SDPatternMatch;
50075 SDValue N0 = N->getOperand(0);
50076 SDValue N1 = N->getOperand(1);
50077 EVT VT = N0.getValueType();
50078 unsigned Size = VT.getSizeInBits();
50079 SDLoc DL(N);
50080
50081 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50082 return V;
50083
50084 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50085 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50086 SDValue ShrAmtVal;
50087 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50088 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50089 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50090 }
50091
50092 // fold (SRA (SHL X, ShlConst), SraConst)
50093 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50094 // or (sext_in_reg X)
50095 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50096 // depending on relation between SraConst and ShlConst.
50097 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50098 // us to do the sext_in_reg from the corresponding bit.
50099
50100 // sexts in X86 are MOVs. The MOVs have the same code size
50101 // as the above SHIFTs (only a SHIFT by 1 has lower code size).
50102 // However the MOVs have two advantages over a SHIFT:
50103 // 1. MOVs can write to a register that differs from the source.
50104 // 2. MOVs accept memory operands.
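  // e.g. for i32: (sra (shl x, 24), 27) becomes (sra (sext_in_reg x, i8), 3),
  // since Size - ShlConst == 8 and SraConst > ShlConst.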
50105
50106 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50107 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50108 N0.getOperand(1).getOpcode() != ISD::Constant)
50109 return SDValue();
50110
50111 SDValue N00 = N0.getOperand(0);
50112 SDValue N01 = N0.getOperand(1);
50113 APInt ShlConst = N01->getAsAPIntVal();
50114 APInt SraConst = N1->getAsAPIntVal();
50115 EVT CVT = N1.getValueType();
50116
50117 if (CVT != N01.getValueType())
50118 return SDValue();
50119 if (SraConst.isNegative())
50120 return SDValue();
50121
50122 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50123 unsigned ShiftSize = SVT.getSizeInBits();
50124 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50125 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50126 continue;
50127 SDValue NN =
50128 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50129 if (SraConst.eq(ShlConst))
50130 return NN;
50131 if (SraConst.ult(ShlConst))
50132 return DAG.getNode(ISD::SHL, DL, VT, NN,
50133 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50134 return DAG.getNode(ISD::SRA, DL, VT, NN,
50135 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50136 }
50137 return SDValue();
50138}
50139
50140 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50141 TargetLowering::DAGCombinerInfo &DCI,
50142 const X86Subtarget &Subtarget) {
50143 using namespace llvm::SDPatternMatch;
50144 SDValue N0 = N->getOperand(0);
50145 SDValue N1 = N->getOperand(1);
50146 EVT VT = N0.getValueType();
50147 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50148 SDLoc DL(N);
50149
50150 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50151 return V;
50152
50153 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50154 // with out-of-bounds clamping.
50155 if (N0.getOpcode() == ISD::VSELECT &&
50156 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50157 SDValue Cond = N0.getOperand(0);
50158 SDValue N00 = N0.getOperand(1);
50159 SDValue N01 = N0.getOperand(2);
50160 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50161 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50162 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50163 m_SpecificCondCode(ISD::SETULT)))) {
50164 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50165 }
50166 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50167 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50168 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50169 m_SpecificCondCode(ISD::SETUGE)))) {
50170 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50171 }
50172 }
50173
50174 // Only do this on the last DAG combine as it can interfere with other
50175 // combines.
50176 if (!DCI.isAfterLegalizeDAG())
50177 return SDValue();
50178
50179 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50180 // TODO: This is a generic DAG combine that became an x86-only combine to
50181 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50182 // and-not ('andn').
50183 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50184 return SDValue();
50185
50186 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50187 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50188 if (!ShiftC || !AndC)
50189 return SDValue();
50190
50191 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50192 // transform should reduce code size. It may also enable secondary transforms
50193 // from improved known-bits analysis or instruction selection.
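  // e.g. (srl (and X, 0x7F0), 4) -> (and (srl X, 4), 0x7F): the shifted mask
  // now fits in a sign-extended 8-bit immediate, giving a shorter encoding.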
50194 APInt MaskVal = AndC->getAPIntValue();
50195
50196 // If this can be matched by a zero extend, don't optimize.
50197 if (MaskVal.isMask()) {
50198 unsigned TO = MaskVal.countr_one();
50199 if (TO >= 8 && isPowerOf2_32(TO))
50200 return SDValue();
50201 }
50202
50203 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50204 unsigned OldMaskSize = MaskVal.getSignificantBits();
50205 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50206 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50207 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50208 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50209 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50210 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50211 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50212 }
50213 return SDValue();
50214}
50215
50216 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50217 const X86Subtarget &Subtarget) {
50218 unsigned Opcode = N->getOpcode();
50219 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50220
50221 SDLoc DL(N);
50222 EVT VT = N->getValueType(0);
50223 SDValue N0 = N->getOperand(0);
50224 SDValue N1 = N->getOperand(1);
50225 EVT SrcVT = N0.getValueType();
50226
50227 SDValue BC0 =
50228 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50229 SDValue BC1 =
50230 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50231
50232 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50233 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50234 // truncation trees that help us avoid lane crossing shuffles.
50235 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50236 // TODO: We don't handle vXf64 shuffles yet.
50237 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50238 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50239 SmallVector<SDValue> ShuffleOps;
50240 SmallVector<int> ShuffleMask, ScaledMask;
50241 SDValue Vec = peekThroughBitcasts(BCSrc);
50242 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50244 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50245 // shuffle to a v4X64 width - we can probably relax this in the future.
50246 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50247 ShuffleOps[0].getValueType().is256BitVector() &&
50248 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50249 SDValue Lo, Hi;
50250 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50251 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50252 Lo = DAG.getBitcast(SrcVT, Lo);
50253 Hi = DAG.getBitcast(SrcVT, Hi);
50254 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50255 Res = DAG.getBitcast(ShufVT, Res);
50256 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50257 return DAG.getBitcast(VT, Res);
50258 }
50259 }
50260 }
50261 }
50262
50263 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50264 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50265 // If either/both ops are a shuffle that can scale to v2x64,
50266 // then see if we can perform this as a v4x32 post shuffle.
50267 SmallVector<SDValue> Ops0, Ops1;
50268 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50269 bool IsShuf0 =
50270 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50271 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50272 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50273 bool IsShuf1 =
50274 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50275 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50276 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50277 if (IsShuf0 || IsShuf1) {
50278 if (!IsShuf0) {
50279 Ops0.assign({BC0});
50280 ScaledMask0.assign({0, 1});
50281 }
50282 if (!IsShuf1) {
50283 Ops1.assign({BC1});
50284 ScaledMask1.assign({0, 1});
50285 }
50286
50287 SDValue LHS, RHS;
50288 int PostShuffle[4] = {-1, -1, -1, -1};
50289 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50290 if (M < 0)
50291 return true;
50292 Idx = M % 2;
50293 SDValue Src = Ops[M / 2];
50294 if (!LHS || LHS == Src) {
50295 LHS = Src;
50296 return true;
50297 }
50298 if (!RHS || RHS == Src) {
50299 Idx += 2;
50300 RHS = Src;
50301 return true;
50302 }
50303 return false;
50304 };
50305 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50306 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50307 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50308 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50309 LHS = DAG.getBitcast(SrcVT, LHS);
50310 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50311 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50312 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50313 Res = DAG.getBitcast(ShufVT, Res);
50314 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50315 return DAG.getBitcast(VT, Res);
50316 }
50317 }
50318 }
50319
50320 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50321 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50322 SmallVector<int> Mask0, Mask1;
50323 SmallVector<SDValue> Ops0, Ops1;
50324 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50325 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50326 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50327 !Ops0.empty() && !Ops1.empty() &&
50328 all_of(Ops0,
50329 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50330 all_of(Ops1,
50331 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50332 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50333 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50334 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50335 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50336 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50337 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50338 if ((Op00 == Op11) && (Op01 == Op10)) {
50339 std::swap(Op10, Op11);
50340 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50341 }
50342 if ((Op00 == Op10) && (Op01 == Op11)) {
50343 const int Map[4] = {0, 2, 1, 3};
50344 SmallVector<int, 4> ShuffleMask(
50345 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50346 Map[ScaledMask1[1]]});
50347 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50348 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50349 DAG.getBitcast(SrcVT, Op01));
50350 Res = DAG.getBitcast(ShufVT, Res);
50351 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50352 return DAG.getBitcast(VT, Res);
50353 }
50354 }
50355 }
50356
50357 return SDValue();
50358}
50359
50360 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50361 TargetLowering::DAGCombinerInfo &DCI,
50362 const X86Subtarget &Subtarget) {
50363 unsigned Opcode = N->getOpcode();
50364 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50365 "Unexpected pack opcode");
50366
50367 EVT VT = N->getValueType(0);
50368 SDValue N0 = N->getOperand(0);
50369 SDValue N1 = N->getOperand(1);
50370 unsigned NumDstElts = VT.getVectorNumElements();
50371 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50372 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50373 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50374 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50375 "Unexpected PACKSS/PACKUS input type");
50376
50377 bool IsSigned = (X86ISD::PACKSS == Opcode);
50378
50379 // Constant Folding.
50380 APInt UndefElts0, UndefElts1;
50381 SmallVector<APInt, 32> EltBits0, EltBits1;
50382 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50383 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50384 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50385 /*AllowWholeUndefs*/ true,
50386 /*AllowPartialUndefs*/ true) &&
50387 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50388 /*AllowWholeUndefs*/ true,
50389 /*AllowPartialUndefs*/ true)) {
50390 unsigned NumLanes = VT.getSizeInBits() / 128;
50391 unsigned NumSrcElts = NumDstElts / 2;
50392 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50393 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50394
50395 APInt Undefs(NumDstElts, 0);
50396 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50397 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50398 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50399 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50400 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50401 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50402
50403 if (UndefElts[SrcIdx]) {
50404 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50405 continue;
50406 }
50407
50408 APInt &Val = EltBits[SrcIdx];
50409 if (IsSigned) {
50410 // PACKSS: Truncate signed value with signed saturation.
50411 // Source values less than dst minint are saturated to minint.
50412 // Source values greater than dst maxint are saturated to maxint.
50413 Val = Val.truncSSat(DstBitsPerElt);
50414 } else {
50415 // PACKUS: Truncate signed value with unsigned saturation.
50416 // Source values less than zero are saturated to zero.
50417 // Source values greater than dst maxuint are saturated to maxuint.
50418 // NOTE: This is different from APInt::truncUSat.
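        // e.g. 0x0180 (+384) saturates to 0xFF and 0xFF80 (-128) saturates to
        // 0x00, whereas APInt::truncUSat would treat 0xFF80 as +65408 and
        // produce 0xFF.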
50419 if (Val.isIntN(DstBitsPerElt))
50420 Val = Val.trunc(DstBitsPerElt);
50421 else if (Val.isNegative())
50422 Val = APInt::getZero(DstBitsPerElt);
50423 else
50424 Val = APInt::getAllOnes(DstBitsPerElt);
50425 }
50426 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50427 }
50428 }
50429
50430 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50431 }
50432
50433 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50434 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50435 return V;
50436
50437 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50438 // Currently limit this to allsignbits cases only.
50439 if (IsSigned &&
50440 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50441 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50442 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50443 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50444 if (Not0 && Not1) {
50445 SDLoc DL(N);
50446 MVT SrcVT = N0.getSimpleValueType();
50447 SDValue Pack =
50448 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50449 DAG.getBitcast(SrcVT, Not1));
50450 return DAG.getNOT(DL, Pack, VT);
50451 }
50452 }
50453
50454 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50455 // truncate to create a larger truncate.
50456 if (Subtarget.hasAVX512() &&
50457 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50458 N0.getOperand(0).getValueType() == MVT::v8i32) {
50459 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50460 (!IsSigned &&
50461 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50462 if (Subtarget.hasVLX())
50463 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50464
50465 // Widen input to v16i32 so we can truncate that.
50466 SDLoc dl(N);
50467 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50468 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50469 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50470 }
50471 }
50472
50473 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50474 if (VT.is128BitVector()) {
50475 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50476 SDValue Src0, Src1;
50477 if (N0.getOpcode() == ExtOpc &&
50479 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50480 Src0 = N0.getOperand(0);
50481 }
50482 if (N1.getOpcode() == ExtOpc &&
50484 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50485 Src1 = N1.getOperand(0);
50486 }
50487 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50488 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50489 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50490 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50491 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50492 }
50493
50494 // Try again with pack(*_extend_vector_inreg, undef).
50495 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50496 : ISD::ZERO_EXTEND_VECTOR_INREG;
50497 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50498 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50499 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50500 DAG);
50501 }
50502
50503 // Attempt to combine as shuffle.
50504 SDValue Op(N, 0);
50505 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50506 return Res;
50507
50508 return SDValue();
50509}
50510
50511 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50512 TargetLowering::DAGCombinerInfo &DCI,
50513 const X86Subtarget &Subtarget) {
50514 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50515 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50516 "Unexpected horizontal add/sub opcode");
50517
50518 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50519 MVT VT = N->getSimpleValueType(0);
50520 SDValue LHS = N->getOperand(0);
50521 SDValue RHS = N->getOperand(1);
50522
50523 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50524 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50525 LHS.getOpcode() == RHS.getOpcode() &&
50526 LHS.getValueType() == RHS.getValueType() &&
50527 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50528 SDValue LHS0 = LHS.getOperand(0);
50529 SDValue LHS1 = LHS.getOperand(1);
50530 SDValue RHS0 = RHS.getOperand(0);
50531 SDValue RHS1 = RHS.getOperand(1);
50532 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50533 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50534 SDLoc DL(N);
50535 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50536 LHS0.isUndef() ? LHS1 : LHS0,
50537 RHS0.isUndef() ? RHS1 : RHS0);
50538 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50539 Res = DAG.getBitcast(ShufVT, Res);
50540 SDValue NewLHS =
50541 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50542 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50543 SDValue NewRHS =
50544 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50545 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50546 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50547 DAG.getBitcast(VT, NewRHS));
50548 }
50549 }
50550 }
50551
50552 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50553 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50554 return V;
50555
50556 return SDValue();
50557}
50558
50559 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50560 TargetLowering::DAGCombinerInfo &DCI,
50561 const X86Subtarget &Subtarget) {
50562 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50563 X86ISD::VSRL == N->getOpcode()) &&
50564 "Unexpected shift opcode");
50565 EVT VT = N->getValueType(0);
50566 SDValue N0 = N->getOperand(0);
50567 SDValue N1 = N->getOperand(1);
50568
50569 // Shift zero -> zero.
50570 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50571 return DAG.getConstant(0, SDLoc(N), VT);
50572
50573 // Detect constant shift amounts.
50574 APInt UndefElts;
50575 SmallVector<APInt, 32> EltBits;
50576 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50577 /*AllowWholeUndefs*/ true,
50578 /*AllowPartialUndefs*/ false)) {
50579 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50580 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50581 EltBits[0].getZExtValue(), DAG);
50582 }
50583
50584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50585 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50586 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50587 return SDValue(N, 0);
50588
50589 return SDValue();
50590}
50591
50592 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50593 TargetLowering::DAGCombinerInfo &DCI,
50594 const X86Subtarget &Subtarget) {
50595 unsigned Opcode = N->getOpcode();
50596 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50597 X86ISD::VSRLI == Opcode) &&
50598 "Unexpected shift opcode");
50599 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50600 EVT VT = N->getValueType(0);
50601 SDValue N0 = N->getOperand(0);
50602 SDValue N1 = N->getOperand(1);
50603 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50604 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50605 "Unexpected value type");
50606 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50607
50608 // (shift undef, X) -> 0
50609 if (N0.isUndef())
50610 return DAG.getConstant(0, SDLoc(N), VT);
50611
50612 // Out of range logical bit shifts are guaranteed to be zero.
50613 // Out of range arithmetic bit shifts splat the sign bit.
50614 unsigned ShiftVal = N->getConstantOperandVal(1);
50615 if (ShiftVal >= NumBitsPerElt) {
50616 if (LogicalShift)
50617 return DAG.getConstant(0, SDLoc(N), VT);
50618 ShiftVal = NumBitsPerElt - 1;
50619 }
50620
50621 // (shift X, 0) -> X
50622 if (!ShiftVal)
50623 return N0;
50624
50625 // (shift 0, C) -> 0
50626 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50627 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50628 // result are all zeros, not undef.
50629 return DAG.getConstant(0, SDLoc(N), VT);
50630
50631 // (VSRAI -1, C) -> -1
50632 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50633 // N0 is all ones or undef. We guarantee that the bits shifted into the
50634 // result are all ones, not undef.
50635 return DAG.getAllOnesConstant(SDLoc(N), VT);
50636
50637 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50638 unsigned NewShiftVal = Amt0 + Amt1;
50639 if (NewShiftVal >= NumBitsPerElt) {
50640 // Out of range logical bit shifts are guaranteed to be zero.
50641 // Out of range arithmetic bit shifts splat the sign bit.
50642 if (LogicalShift)
50643 return DAG.getConstant(0, SDLoc(N), VT);
50644 NewShiftVal = NumBitsPerElt - 1;
50645 }
50646 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50647 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50648 };
50649
50650 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50651 if (Opcode == N0.getOpcode())
50652 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50653
50654 // (shl (add X, X), C) -> (shl X, (C + 1))
50655 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50656 N0.getOperand(0) == N0.getOperand(1))
50657 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50658
50659 // We can decode 'whole byte' logical bit shifts as shuffles.
50660 if (LogicalShift && (ShiftVal % 8) == 0) {
50661 SDValue Op(N, 0);
50662 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50663 return Res;
50664 }
50665
50666 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50667 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50668 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50669 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
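// Both forms splat bit 0 of each 64-bit element across that element; the
// rewritten form only needs 32-bit shifts plus a shuffle of the low halves.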
50670 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50671 N0.getOpcode() == X86ISD::PSHUFD &&
50672 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50673 N0->hasOneUse()) {
50674 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50675 if (BC.getOpcode() == X86ISD::VSHLI &&
50676 BC.getScalarValueSizeInBits() == 64 &&
50677 BC.getConstantOperandVal(1) == 63) {
50678 SDLoc DL(N);
50679 SDValue Src = BC.getOperand(0);
50680 Src = DAG.getBitcast(VT, Src);
50681 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50682 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50683 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50684 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50685 return Src;
50686 }
50687 }
50688
50689 auto TryConstantFold = [&](SDValue V) {
50690 APInt UndefElts;
50691 SmallVector<APInt, 32> EltBits;
50692 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50693 /*AllowWholeUndefs*/ true,
50694 /*AllowPartialUndefs*/ true))
50695 return SDValue();
50696 assert(EltBits.size() == VT.getVectorNumElements() &&
50697 "Unexpected shift value type");
50698 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50699 // created an undef input due to no input bits being demanded, but user
50700 // still expects 0 in other bits.
50701 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50702 APInt &Elt = EltBits[i];
50703 if (UndefElts[i])
50704 Elt = 0;
50705 else if (X86ISD::VSHLI == Opcode)
50706 Elt <<= ShiftVal;
50707 else if (X86ISD::VSRAI == Opcode)
50708 Elt.ashrInPlace(ShiftVal);
50709 else
50710 Elt.lshrInPlace(ShiftVal);
50711 }
50712 // Reset undef elements since they were zeroed above.
50713 UndefElts = 0;
50714 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50715 };
50716
50717 // Constant Folding.
50718 if (N->isOnlyUserOf(N0.getNode())) {
50719 if (SDValue C = TryConstantFold(N0))
50720 return C;
50721
50722 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50723 // Don't break NOT patterns.
50724 SDValue BC = peekThroughOneUseBitcasts(N0);
50725 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50726 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50727 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50728 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50729 SDLoc DL(N);
50730 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50731 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50732 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50733 }
50734 }
50735 }
50736
50737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50738 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50739 DCI))
50740 return SDValue(N, 0);
50741
50742 return SDValue();
50743}
50744
50745 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50746 TargetLowering::DAGCombinerInfo &DCI,
50747 const X86Subtarget &Subtarget) {
50748 EVT VT = N->getValueType(0);
50749 unsigned Opcode = N->getOpcode();
50750 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50751 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50752 Opcode == ISD::INSERT_VECTOR_ELT) &&
50753 "Unexpected vector insertion");
50754
50755 SDValue Vec = N->getOperand(0);
50756 SDValue Scl = N->getOperand(1);
50757 SDValue Idx = N->getOperand(2);
50758
50759 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50760 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50761 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50762
50763 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50764 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50765 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50766 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50767 APInt::getAllOnes(NumBitsPerElt), DCI))
50768 return SDValue(N, 0);
50769 }
50770
50771 // Attempt to combine insertion patterns to a shuffle.
50772 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50773 SDValue Op(N, 0);
50774 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50775 return Res;
50776 }
50777
50778 return SDValue();
50779}
50780
50781/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50782/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50783 /// OR -> CMPNEQSS.
50784 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50785 TargetLowering::DAGCombinerInfo &DCI,
50786 const X86Subtarget &Subtarget) {
50787 unsigned opcode;
50788
50789 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50790 // we're requiring SSE2 for both.
50791 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50792 SDValue N0 = N->getOperand(0);
50793 SDValue N1 = N->getOperand(1);
50794 SDValue CMP0 = N0.getOperand(1);
50795 SDValue CMP1 = N1.getOperand(1);
50796 SDLoc DL(N);
50797
50798 // The SETCCs should both refer to the same CMP.
50799 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50800 return SDValue();
50801
50802 SDValue CMP00 = CMP0->getOperand(0);
50803 SDValue CMP01 = CMP0->getOperand(1);
50804 EVT VT = CMP00.getValueType();
50805
50806 if (VT == MVT::f32 || VT == MVT::f64 ||
50807 (VT == MVT::f16 && Subtarget.hasFP16())) {
50808 bool ExpectingFlags = false;
50809 // Check for any users that want flags:
50810 for (const SDNode *U : N->users()) {
50811 if (ExpectingFlags)
50812 break;
50813
50814 switch (U->getOpcode()) {
50815 default:
50816 case ISD::BR_CC:
50817 case ISD::BRCOND:
50818 case ISD::SELECT:
50819 ExpectingFlags = true;
50820 break;
50821 case ISD::CopyToReg:
50822 case ISD::SIGN_EXTEND:
50823 case ISD::ZERO_EXTEND:
50824 case ISD::ANY_EXTEND:
50825 break;
50826 }
50827 }
50828
50829 if (!ExpectingFlags) {
50830 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50831 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50832
50833 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50834 X86::CondCode tmp = cc0;
50835 cc0 = cc1;
50836 cc1 = tmp;
50837 }
50838
50839 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50840 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50841 // FIXME: need symbolic constants for these magic numbers.
50842 // See X86ATTInstPrinter.cpp:printSSECC().
50843 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50844 if (Subtarget.hasAVX512()) {
50845 SDValue FSetCC =
50846 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50847 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50848 // Need to fill with zeros to ensure the bitcast will produce zeroes
50849 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50850 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50851 DAG.getConstant(0, DL, MVT::v16i1),
50852 FSetCC, DAG.getVectorIdxConstant(0, DL));
50853 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50854 N->getSimpleValueType(0));
50855 }
50856 SDValue OnesOrZeroesF =
50857 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50858 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50859
50860 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50861 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50862
50863 if (is64BitFP && !Subtarget.is64Bit()) {
50864 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50865 // 64-bit integer, since that's not a legal type. Since
50866 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50867 // bits, but can do this little dance to extract the lowest 32 bits
50868 // and work with those going forward.
50869 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50870 MVT::v2f64, OnesOrZeroesF);
50871 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50872 OnesOrZeroesF =
50873 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50874 DAG.getVectorIdxConstant(0, DL));
50875 IntVT = MVT::i32;
50876 }
50877
50878 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50879 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50880 DAG.getConstant(1, DL, IntVT));
50881 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50882 ANDed);
50883 return OneBitOfTruth;
50884 }
50885 }
50886 }
50887 }
50888 return SDValue();
50889}
50890
50891 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50892 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50893 SelectionDAG &DAG) {
50894 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50895
50896 MVT VT = N->getSimpleValueType(0);
50897 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50898 return SDValue();
50899
50900 SDValue X, Y;
50901 SDValue N0 = N->getOperand(0);
50902 SDValue N1 = N->getOperand(1);
50903
50904 if (SDValue Not = IsNOT(N0, DAG)) {
50905 X = Not;
50906 Y = N1;
50907 } else if (SDValue Not = IsNOT(N1, DAG)) {
50908 X = Not;
50909 Y = N0;
50910 } else
50911 return SDValue();
50912
50913 X = DAG.getBitcast(VT, X);
50914 Y = DAG.getBitcast(VT, Y);
50915 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50916}
50917
50918/// Try to fold:
50919/// and (vector_shuffle<Z,...,Z>
50920/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50921/// ->
50922/// andnp (vector_shuffle<Z,...,Z>
50923 /// (insert_vector_elt undef, X, Z), undef), Y
50924 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50925 const X86Subtarget &Subtarget) {
50926 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50927
50928 EVT VT = N->getValueType(0);
50929 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50930 // value and require extra moves.
50931 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50932 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50933 return SDValue();
50934
50935 auto GetNot = [&DAG](SDValue V) {
50936 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50937 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50938 // end-users are ISD::AND including cases
50939 // (and(extract_vector_element(SVN), Y)).
50940 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50941 !SVN->getOperand(1).isUndef()) {
50942 return SDValue();
50943 }
50944 SDValue IVEN = SVN->getOperand(0);
50945 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50946 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50947 return SDValue();
50948 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50949 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50950 return SDValue();
50951 SDValue Src = IVEN.getOperand(1);
50952 if (SDValue Not = IsNOT(Src, DAG)) {
50953 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50954 SDValue NotIVEN =
50955 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50956 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50957 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50958 SVN->getOperand(1), SVN->getMask());
50959 }
50960 return SDValue();
50961 };
50962
50963 SDValue X, Y;
50964 SDValue N0 = N->getOperand(0);
50965 SDValue N1 = N->getOperand(1);
50966 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50967
50968 if (SDValue Not = GetNot(N0)) {
50969 X = Not;
50970 Y = N1;
50971 } else if (SDValue Not = GetNot(N1)) {
50972 X = Not;
50973 Y = N0;
50974 } else
50975 return SDValue();
50976
50977 X = DAG.getBitcast(VT, X);
50978 Y = DAG.getBitcast(VT, Y);
50979 SDLoc DL(N);
50980
50981 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50982 // AVX2.
50983 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50984 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50985 SDValue LoX, HiX;
50986 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50987 SDValue LoY, HiY;
50988 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50989 EVT SplitVT = LoX.getValueType();
50990 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50991 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50992 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50993 }
50994
50995 if (TLI.isTypeLegal(VT))
50996 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50997
50998 return SDValue();
50999}
51000
51001// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
51002// logical operations, like in the example below.
51003// or (and (truncate x, truncate y)),
51004// (xor (truncate z, build_vector (constants)))
51005// Given a target type \p VT, we generate
51006// or (and x, y), (xor z, zext(build_vector (constants)))
51007// given x, y and z are of type \p VT. We can do so, if operands are either
51008// truncates from VT types, the second operand is a vector of constants, can
51009 // be recursively promoted or is an existing extension we can extend further.
51010 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51011 SelectionDAG &DAG,
51012 const X86Subtarget &Subtarget,
51013 unsigned Depth) {
51014 // Limit recursion to avoid excessive compile times.
51015 if (Depth >= SelectionDAG::MaxRecursionDepth)
51016 return SDValue();
51017
51018 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51019 return SDValue();
51020
51021 SDValue N0 = N.getOperand(0);
51022 SDValue N1 = N.getOperand(1);
51023
51024 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51025 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51026 return SDValue();
51027
51028 if (SDValue NN0 =
51029 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51030 N0 = NN0;
51031 else {
51032 // The left side has to be a 'trunc'.
51033 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51034 N0.getOperand(0).getValueType() == VT;
51035 if (LHSTrunc)
51036 N0 = N0.getOperand(0);
51037 else
51038 return SDValue();
51039 }
51040
51041 if (SDValue NN1 =
51042 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51043 N1 = NN1;
51044 else {
51045 // The right side has to be a 'trunc', a (foldable) constant or an
51046 // existing extension we can extend further.
51047 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51048 N1.getOperand(0).getValueType() == VT;
51049 if (RHSTrunc)
51050 N1 = N1.getOperand(0);
51051 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51052 Subtarget.hasInt256() && N1.hasOneUse())
51053 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51054 else if (SDValue Cst =
51055 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51056 N1 = Cst;
51057 else
51058 return SDValue();
51059 }
51060
51061 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51062}
51063
51064// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51065// register. In most cases we actually compare or select YMM-sized registers
51066// and mixing the two types creates horrible code. This method optimizes
51067// some of the transition sequences.
51068// Even with AVX-512 this is still useful for removing casts around logical
51069 // operations on vXi1 mask types.
51070 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51071 SelectionDAG &DAG,
51072 const X86Subtarget &Subtarget) {
51073 EVT VT = N.getValueType();
51074 assert(VT.isVector() && "Expected vector type");
51075 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51076 N.getOpcode() == ISD::ZERO_EXTEND ||
51077 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51078
51079 SDValue Narrow = N.getOperand(0);
51080 EVT NarrowVT = Narrow.getValueType();
51081
51082 // Generate the wide operation.
51083 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51084 if (!Op)
51085 return SDValue();
51086 switch (N.getOpcode()) {
51087 default: llvm_unreachable("Unexpected opcode");
51088 case ISD::ANY_EXTEND:
51089 return Op;
51090 case ISD::ZERO_EXTEND:
51091 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51092 case ISD::SIGN_EXTEND:
51093 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51094 Op, DAG.getValueType(NarrowVT));
51095 }
51096}
51097
51098static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51099 unsigned FPOpcode;
51100 switch (Opcode) {
51101 // clang-format off
51102 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51103 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51104 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51105 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51106 // clang-format on
51107 }
51108 return FPOpcode;
51109}
51110
51111/// If both input operands of a logic op are being cast from floating-point
51112/// types or FP compares, try to convert this into a floating-point logic node
51113/// to avoid unnecessary moves from SSE to integer registers.
51114static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51115 SDValue N0, SDValue N1,
51116 SelectionDAG &DAG,
51117 TargetLowering::DAGCombinerInfo &DCI,
51118 const X86Subtarget &Subtarget) {
51119 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51120 "Unexpected bit opcode");
51121
51122 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51123 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51124 return SDValue();
51125
51126 SDValue N00 = N0.getOperand(0);
51127 SDValue N10 = N1.getOperand(0);
51128 EVT N00Type = N00.getValueType();
51129 EVT N10Type = N10.getValueType();
51130
51131 // Ensure that both types are the same and are legal scalar fp types.
51132 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51133 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51134 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51135 return SDValue();
51136
51137 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51138 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51139 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51140 return DAG.getBitcast(VT, FPLogic);
51141 }
51142
51143 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51144 !N1.hasOneUse())
51145 return SDValue();
51146
51147 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51148 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51149
51150 // The vector ISA for FP predicates is incomplete before AVX, so converting
51151 // COMIS* to CMPS* may not be a win before AVX.
51152 if (!Subtarget.hasAVX() &&
51153 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51154 return SDValue();
51155
51156 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51157 // and vector logic:
51158 // logic (setcc N00, N01), (setcc N10, N11) -->
51159 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51160 unsigned NumElts = 128 / N00Type.getSizeInBits();
51161 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51162 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51163 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51164 SDValue N01 = N0.getOperand(1);
51165 SDValue N11 = N1.getOperand(1);
51166 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51167 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51168 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51169 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51170 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51171 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51172 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51173 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51174}
51175
51176// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51177// to reduce XMM->GPR traffic.
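// e.g. (and (movmsk X), (movmsk Y)) -> (movmsk (and X, Y)), so the masks are
// combined in the vector domain before a single transfer to a GPR.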
51178static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51179 SDValue N1, SelectionDAG &DAG) {
51180 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51181 "Unexpected bit opcode");
51182
51183 // Both operands must be single use MOVMSK.
51184 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51185 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51186 return SDValue();
51187
51188 SDValue Vec0 = N0.getOperand(0);
51189 SDValue Vec1 = N1.getOperand(0);
51190 EVT VecVT0 = Vec0.getValueType();
51191 EVT VecVT1 = Vec1.getValueType();
51192
51193 // Both MOVMSK operands must be from vectors of the same size and same element
51194 // size, but it's OK for an fp/int diff.
51195 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51196 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51197 return SDValue();
51198
51199 unsigned VecOpc =
51200 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51201 SDValue Result =
51202 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51203 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51204}
51205
51206// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51207// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51208// handles in InstCombine.
51209static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51210 SDValue N0, SDValue N1,
51211 SelectionDAG &DAG) {
51212 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51213 "Unexpected bit opcode");
51214
51215 // Both operands must be single use.
51216 if (!N0.hasOneUse() || !N1.hasOneUse())
51217 return SDValue();
51218
51219 // Search for matching shifts.
51220 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51221 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51222
51223 unsigned BCOpc = BC0.getOpcode();
51224 EVT BCVT = BC0.getValueType();
51225 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51226 return SDValue();
51227
51228 switch (BCOpc) {
51229 case X86ISD::VSHLI:
51230 case X86ISD::VSRLI:
51231 case X86ISD::VSRAI: {
51232 if (BC0.getOperand(1) != BC1.getOperand(1))
51233 return SDValue();
51234 SDValue BitOp =
51235 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51236 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51237 return DAG.getBitcast(VT, Shift);
51238 }
51239 }
51240
51241 return SDValue();
51242}
51243
51244// Attempt to fold:
51245// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51246 // TODO: Handle PACKUS as well.
51247static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51248 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51249 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51250 "Unexpected bit opcode");
51251
51252 // Both operands must be single use.
51253 if (!N0.hasOneUse() || !N1.hasOneUse())
51254 return SDValue();
51255
51256 // Search for matching packs.
51257 N0 = peekThroughOneUseBitcasts(N0);
51258 N1 = peekThroughOneUseBitcasts(N1);
51259
51260 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51261 return SDValue();
51262
51263 MVT DstVT = N0.getSimpleValueType();
51264 if (DstVT != N1.getSimpleValueType())
51265 return SDValue();
51266
51267 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51268 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51269
51270 // Limit to allsignbits packing.
51271 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51272 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51273 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51274 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51275 return SDValue();
51276
51277 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51278 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51279 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51280}
51281
51282/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51283 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51284 /// with a shift-right to eliminate loading the vector constant mask value.
51285 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51286 SelectionDAG &DAG,
51287 const X86Subtarget &Subtarget) {
51288 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51289 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51290 EVT VT = Op0.getValueType();
51291 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51292 return SDValue();
51293
51294 // Try to convert an "is positive" signbit masking operation into arithmetic
51295 // shift and "andn". This saves a materialization of a -1 vector constant.
51296 // The "is negative" variant should be handled more generally because it only
51297 // requires "and" rather than "andn":
51298 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51299 //
51300 // This is limited to the original type to avoid producing even more bitcasts.
51301 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51302 // will be profitable.
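// For v4i32 this rewrites 'and (pcmpgtd X, -1), Y' as 'pandn (psrad X, 31), Y':
// X > -1 exactly when its sign bit is clear, so inverting the sign-splat
// reproduces the compare mask without materializing the -1 vector.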
51303 if (N->getValueType(0) == VT &&
51304 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51305 SDValue X, Y;
51306 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51307 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51308 X = Op1.getOperand(0);
51309 Y = Op0;
51310 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51311 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51312 X = Op0.getOperand(0);
51313 Y = Op1;
51314 }
51315 if (X && Y) {
51316 SDValue Sra =
51317 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51318 VT.getScalarSizeInBits() - 1, DAG);
51319 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51320 }
51321 }
51322
51323 APInt SplatVal;
51324 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51325 return SDValue();
51326
51327 // Don't prevent creation of ANDN.
51328 if (isBitwiseNot(Op0))
51329 return SDValue();
51330
51331 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51332 return SDValue();
51333
51334 unsigned EltBitWidth = VT.getScalarSizeInBits();
51335 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51336 return SDValue();
51337
51338 unsigned ShiftVal = SplatVal.countr_one();
51339 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51340 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51341 return DAG.getBitcast(N->getValueType(0), Shift);
51342}
51343
51344// Get the index node from the lowered DAG of a GEP IR instruction with one
51345 // indexing dimension.
51346 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51347 if (Ld->isIndexed())
51348 return SDValue();
51349
51350 SDValue Base = Ld->getBasePtr();
51351 if (Base.getOpcode() != ISD::ADD)
51352 return SDValue();
51353
51354 SDValue ShiftedIndex = Base.getOperand(0);
51355 if (ShiftedIndex.getOpcode() != ISD::SHL)
51356 return SDValue();
51357
51358 return ShiftedIndex.getOperand(0);
51359}
51360
51361static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51362 return Subtarget.hasBMI2() &&
51363 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51364}
51365
51366/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51367 /// This undoes the inverse fold performed in InstCombine.
51368 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51369 SelectionDAG &DAG) {
51370 using namespace llvm::SDPatternMatch;
51371 MVT VT = N->getSimpleValueType(0);
51372 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51373 return SDValue();
51374
51375 SDValue X, Y, Z;
51376 if (sd_match(N, m_And(m_Value(X),
51377 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51378 // Don't fold if Y or Z are constants to prevent infinite loops.
51379 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51380 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51381 return DAG.getNode(
51382 ISD::AND, DL, VT, X,
51383 DAG.getNOT(
51384 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51385 }
51386
51387 return SDValue();
51388}
51389
51390 // This function recognizes cases where the X86 bzhi instruction can replace an
51391// 'and-load' sequence.
51392// In case of loading integer value from an array of constants which is defined
51393// as follows:
51394//
51395// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51396//
51397// then applying a bitwise and on the result with another input.
51398// It's equivalent to performing bzhi (zero high bits) on the input, with the
51399// same index of the load.
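// e.g. for i32 with idx == 5: array[5] == 0x1F, so the load+and computes
// x & ((1 << 5) - 1), which is exactly bzhi(x, 5) (built below as an srl of -1).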
51400 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51401 const X86Subtarget &Subtarget) {
51402 MVT VT = Node->getSimpleValueType(0);
51403 SDLoc dl(Node);
51404
51405 // Check if subtarget has BZHI instruction for the node's type
51406 if (!hasBZHI(Subtarget, VT))
51407 return SDValue();
51408
51409 // Try matching the pattern for both operands.
51410 for (unsigned i = 0; i < 2; i++) {
51411 // continue if the operand is not a load instruction
51412 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51413 if (!Ld)
51414 continue;
51415 const Value *MemOp = Ld->getMemOperand()->getValue();
51416 if (!MemOp)
51417 continue;
51418 // Get the Node which indexes into the array.
51419 SDValue Index = getIndexFromUnindexedLoad(Ld);
51420 if (!Index)
51421 continue;
51422
51423 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51424 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51425 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51426 Constant *Init = GV->getInitializer();
51427 Type *Ty = Init->getType();
51428 if (!isa<ConstantDataArray>(Init) ||
51429 !Ty->getArrayElementType()->isIntegerTy() ||
51430 Ty->getArrayElementType()->getScalarSizeInBits() !=
51431 VT.getSizeInBits() ||
51432 Ty->getArrayNumElements() >
51433 Ty->getArrayElementType()->getScalarSizeInBits())
51434 continue;
51435
51436 // Check if the array's constant elements are suitable to our case.
51437 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51438 bool ConstantsMatch = true;
51439 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51440 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51441 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51442 ConstantsMatch = false;
51443 break;
51444 }
51445 }
51446 if (!ConstantsMatch)
51447 continue;
51448
51449 // Do the transformation (For 32-bit type):
51450 // -> (and (load arr[idx]), inp)
51451 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51452 // that will be replaced with one bzhi instruction.
51453 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51454 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51455
51456 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51457 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51458 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51459
51460 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51461 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51462 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51463 }
51464 }
51465 }
51466 }
51467 return SDValue();
51468}
51469
51470// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51471// Where C is a mask containing the same number of bits as the setcc and
51472// where the setcc will freely 0 upper bits of k-register. We can replace the
51473// undef in the concat with 0s and remove the AND. This mainly helps with
51474 // v2i1/v4i1 setcc being cast to scalar.
51475 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51476 const X86Subtarget &Subtarget) {
51477 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51478
51479 EVT VT = N->getValueType(0);
51480
51481 // Make sure this is an AND with constant. We will check the value of the
51482 // constant later.
51483 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51484 if (!C1)
51485 return SDValue();
51486
51487 // This is implied by the ConstantSDNode.
51488 assert(!VT.isVector() && "Expected scalar VT!");
51489
51490 SDValue Src = N->getOperand(0);
51491 if (!Src.hasOneUse())
51492 return SDValue();
51493
51494 // (Optionally) peek through any_extend().
51495 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51496 if (!Src.getOperand(0).hasOneUse())
51497 return SDValue();
51498 Src = Src.getOperand(0);
51499 }
51500
51501 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51502 return SDValue();
51503
51504 Src = Src.getOperand(0);
51505 EVT SrcVT = Src.getValueType();
51506
51507 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51508 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51509 !TLI.isTypeLegal(SrcVT))
51510 return SDValue();
51511
51512 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51513 return SDValue();
51514
51515 // We only care about the first subvector of the concat, we expect the
51516 // other subvectors to be ignored due to the AND if we make the change.
51517 SDValue SubVec = Src.getOperand(0);
51518 EVT SubVecVT = SubVec.getValueType();
51519
51520 // The RHS of the AND should be a mask with as many bits as SubVec.
51521 if (!TLI.isTypeLegal(SubVecVT) ||
51522 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51523 return SDValue();
51524
51525 // First subvector should be a setcc with a legal result type or a
51526 // AND containing at least one setcc with a legal result type.
51527 auto IsLegalSetCC = [&](SDValue V) {
51528 if (V.getOpcode() != ISD::SETCC)
51529 return false;
51530 EVT SetccVT = V.getOperand(0).getValueType();
51531 if (!TLI.isTypeLegal(SetccVT) ||
51532 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51533 return false;
51534 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51535 return false;
51536 return true;
51537 };
51538 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51539 (IsLegalSetCC(SubVec.getOperand(0)) ||
51540 IsLegalSetCC(SubVec.getOperand(1))))))
51541 return SDValue();
51542
51543 // We passed all the checks. Rebuild the concat_vectors with zeroes
51544 // and cast it back to VT.
51545 SDLoc dl(N);
51546 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51547 DAG.getConstant(0, dl, SubVecVT));
51548 Ops[0] = SubVec;
51549 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51550 Ops);
51551 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51552 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51553}
51554
51555 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51556 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51557 // We don't want to go crazy with the recursion here. This isn't a super
51558 // important optimization.
51559 static constexpr unsigned kMaxDepth = 2;
51560
51561 // Only do this re-ordering if op has one use.
51562 if (!Op.hasOneUse())
51563 return SDValue();
51564
51565 SDLoc DL(Op);
51566 // If we hit another associative op, recurse further.
51567 if (Op.getOpcode() == Opc) {
51568 // Done recursing.
51569 if (Depth++ >= kMaxDepth)
51570 return SDValue();
51571
51572 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51573 if (SDValue R =
51574 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51575 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51576 Op.getOperand(1 - OpIdx));
51577
51578 } else if (Op.getOpcode() == ISD::SUB) {
51579 if (Opc == ISD::AND) {
51580 // BLSI: (and x, (sub 0, x))
51581 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51582 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51583 }
51584 // Opc must be ISD::AND or ISD::XOR
51585 // BLSR: (and x, (sub x, 1))
51586 // BLSMSK: (xor x, (sub x, 1))
51587 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51588 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51589
51590 } else if (Op.getOpcode() == ISD::ADD) {
51591 // Opc must be ISD::AND or ISD::XOR
51592 // BLSR: (and x, (add x, -1))
51593 // BLSMSK: (xor x, (add x, -1))
51594 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51595 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51596 }
51597 return SDValue();
51598}
51599
51600 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51601 const X86Subtarget &Subtarget) {
51602 EVT VT = N->getValueType(0);
51603 // Make sure this node is a candidate for BMI instructions.
51604 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51605 (VT != MVT::i32 && VT != MVT::i64))
51606 return SDValue();
51607
51608 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51609
51610 // Try and match LHS and RHS.
51611 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51612 if (SDValue OpMatch =
51613 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51614 N->getOperand(1 - OpIdx), 0))
51615 return OpMatch;
51616 return SDValue();
51617}
51618
51619/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
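/// XOR(X, NEG(X)) is the bitwise NOT of BLSMSK(X) = XOR(X, SUB(X, 1)): e.g. for
/// X = 0b0100, BLSMSK(X) = 0b0111 while XOR(X, NEG(X)) = ...11111000, so the
/// AND with the latter is an ANDN with the former.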
51620 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51621 SelectionDAG &DAG,
51622 const X86Subtarget &Subtarget) {
51623 using namespace llvm::SDPatternMatch;
51624
51625 EVT VT = And->getValueType(0);
51626 // Make sure this node is a candidate for BMI instructions.
51627 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51628 return SDValue();
51629
51630 SDValue X;
51631 SDValue Y;
51632 if (!sd_match(And,
51633 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51634 m_Value(Y))))
51635 return SDValue();
51636
51637 SDValue BLSMSK =
51638 DAG.getNode(ISD::XOR, DL, VT, X,
51639 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51640 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51641 return AndN;
51642}
51643
51644 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51645 SelectionDAG &DAG,
51646 TargetLowering::DAGCombinerInfo &DCI,
51647 const X86Subtarget &ST) {
51648 // cmp(setcc(cc, X), 0)
51649 // brcond ne
51650 // ->
51651 // X
51652 // brcond cc
51653
51654 // sub(setcc(cc, X), 1)
51655 // brcond ne
51656 // ->
51657 // X
51658 // brcond ~cc
51659 //
51660 // if only flag has users
51661
51662 SDValue SetCC = N->getOperand(0);
51663
51664 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51665 return SDValue();
51666
51667 // Check the only user of flag is `brcond ne`.
51668 SDNode *BrCond = *Flag->user_begin();
51669 if (BrCond->getOpcode() != X86ISD::BRCOND)
51670 return SDValue();
51671 unsigned CondNo = 2;
51672 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51673 X86::COND_NE)
51674 return SDValue();
51675
51676 SDValue X = SetCC.getOperand(1);
51677 // sub has two results while X only have one. DAG combine assumes the value
51678 // type matches.
51679 if (N->getOpcode() == X86ISD::SUB)
51680 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51681
51682 SDValue CCN = SetCC.getOperand(0);
51683 X86::CondCode CC =
51684 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51685 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51686 // Update CC for the consumer of the flag.
51687 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51688 // checking if the second condition evaluates to true. When comparing the
51689 // result with 1, we are checking if the second condition evaluates to false.
51690 SmallVector<SDValue> Ops(BrCond->op_values());
51691 if (isNullConstant(N->getOperand(1)))
51692 Ops[CondNo] = CCN;
51693 else if (isOneConstant(N->getOperand(1)))
51694 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51695 else
51696 llvm_unreachable("expect constant 0 or 1");
51697
51698 SDValue NewBrCond =
51699 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51700 // Avoid self-assign error b/c CC1 can be `e/ne`.
51701 if (BrCond != NewBrCond.getNode())
51702 DCI.CombineTo(BrCond, NewBrCond);
51703 return X;
51704}
51705
51706 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51707 TargetLowering::DAGCombinerInfo &DCI,
51708 const X86Subtarget &ST) {
51709 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51710 // ->
51711 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51712
51713 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51714 // ->
51715 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51716 //
51717 // where cflags is determined by cc1.
51718
51719 if (!ST.hasCCMP())
51720 return SDValue();
51721
51722 SDValue SetCC0 = N->getOperand(0);
51723 SDValue SetCC1 = N->getOperand(1);
51724 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51725 SetCC1.getOpcode() != X86ISD::SETCC)
51726 return SDValue();
51727
51728 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51729 SDValue Op = V.getOperand(1);
51730 unsigned Opc = Op.getOpcode();
51731 if (Opc == X86ISD::SUB)
51732 return X86ISD::CCMP;
51733 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51734 return X86ISD::CTEST;
51735 return 0U;
51736 };
51737
51738 unsigned NewOpc = 0;
51739
51740 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51741 // appear on the right.
51742 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51743 std::swap(SetCC0, SetCC1);
51744 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51745 return SDValue();
51746 }
51747
51748 X86::CondCode CC0 =
51749 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51750 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51751 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51752 return SDValue();
51753
51754 bool IsOR = N->getOpcode() == ISD::OR;
51755
51756 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51757 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51758 // operator is OR. Similar for CC1.
51759 SDValue SrcCC =
51760 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51761 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51762 : SetCC0.getOperand(0);
51763 SDValue CC1N = SetCC1.getOperand(0);
51764 X86::CondCode CC1 =
51765 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51766 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51767 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51768 SDLoc DL(N);
51769 SDValue CFlags = DAG.getTargetConstant(
51770 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51771 SDValue Sub = SetCC1.getOperand(1);
51772
51773 // Replace any uses of the old flag produced by SUB/CMP with the new one
51774 // produced by CCMP/CTEST.
51775 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51776 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51777 {Sub.getOperand(0), Sub.getOperand(1),
51778 CFlags, SrcCC, SetCC0.getOperand(1)})
51779 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51780 {Sub.getOperand(0), Sub.getOperand(0),
51781 CFlags, SrcCC, SetCC0.getOperand(1)});
51782
51783 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51784}
51785
51786 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51787 TargetLowering::DAGCombinerInfo &DCI,
51788 const X86Subtarget &Subtarget) {
51789 using namespace SDPatternMatch;
51790
51791 SDValue N0 = N->getOperand(0);
51792 SDValue N1 = N->getOperand(1);
51793 EVT VT = N->getValueType(0);
51794 SDLoc dl(N);
51795 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51796
51797 // If this is SSE1 only convert to FAND to avoid scalarization.
51798 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51799 return DAG.getBitcast(MVT::v4i32,
51800 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51801 DAG.getBitcast(MVT::v4f32, N0),
51802 DAG.getBitcast(MVT::v4f32, N1)));
51803 }
51804
51805 // Use a 32-bit and+zext if upper bits known zero.
51806 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51807 APInt HiMask = APInt::getHighBitsSet(64, 32);
51808 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51809 DAG.MaskedValueIsZero(N0, HiMask)) {
51810 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51811 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51812 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51813 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51814 }
51815 }
51816
51817 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51818 // TODO: Support multiple SrcOps.
51819 if (VT == MVT::i1) {
51820 SmallVector<SDValue, 2> SrcOps;
51821 SmallVector<APInt, 2> SrcPartials;
51822 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51823 SrcOps.size() == 1) {
51824 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51825 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51826 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51827 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51828 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51829 if (Mask) {
51830 assert(SrcPartials[0].getBitWidth() == NumElts &&
51831 "Unexpected partial reduction mask");
51832 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51833 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51834 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51835 }
51836 }
51837 }
51838
51839 // InstCombine converts:
51840 // `(-x << C0) & C1`
51841 // to
51842 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51843 // This saves an IR instruction but on x86 the neg/shift version is preferable
51844 // so undo the transform.
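// e.g. with C0 = 2 and C1 = 0xF8, InstCombine emits (x * 252) & 0xF8
// (252 == 0x100 - (1 << 2)); this combine rebuilds (-x << 2) & 0xF8 because
// negate+shift is cheaper than the multiply on x86.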
51845
51846 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51847 // TODO: We don't actually need a splat for this, we just need the checks to
51848 // hold for each element.
51849 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51850 /*AllowTruncation*/ false);
51851 ConstantSDNode *N01C =
51852 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51853 /*AllowTruncation*/ false);
51854 if (N1C && N01C) {
51855 const APInt &MulC = N01C->getAPIntValue();
51856 const APInt &AndC = N1C->getAPIntValue();
51857 APInt MulCLowBit = MulC & (-MulC);
51858 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51859 (MulCLowBit + MulC).isPowerOf2()) {
51860 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51861 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51862 assert(MulCLowBitLog != -1 &&
51863 "Isolated lowbit is somehow not a power of 2!");
51864 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51865 DAG.getConstant(MulCLowBitLog, dl, VT));
51866 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51867 }
51868 }
51869 }
51870
51871 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51872 return SetCC;
51873
51874 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51875 return V;
51876
51877 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51878 return R;
51879
51880 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51881 return R;
51882
51883 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51884 return R;
51885
51886 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51887 DAG, DCI, Subtarget))
51888 return FPLogic;
51889
51890 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51891 return R;
51892
51893 if (DCI.isBeforeLegalizeOps())
51894 return SDValue();
51895
51896 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51897 return R;
51898
51899 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51900 return R;
51901
51902 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51903 return ShiftRight;
51904
51905 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51906 return R;
51907
51908 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51909 return R;
51910
51911 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51912 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51913 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
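// Each lane of c2 is all-ones or all-zero, so applying the mask to c1 before
// the multiply gives the same lanes as masking the product afterwards.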
51914 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51915 unsigned Opc0 = N0.getOpcode();
51916 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51917 getTargetConstantFromNode(N0.getOperand(1)) &&
51918 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51919 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51920 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51921 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51922 }
51923 }
51924
51925 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51926 // to make use of predicated selects.
51927 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51928 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51929 SDValue X, Y;
51930 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51931 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51932 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51933 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51934 sd_match(N, m_And(m_Value(X),
51935 m_OneUse(m_SExt(m_AllOf(
51936 m_Value(Y), m_SpecificVT(CondVT),
51937 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51938 return DAG.getSelect(dl, VT, Y, X,
51939 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51940 }
51941 }
51942
51943 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51944 // avoids slow variable shift (moving shift amount to ECX etc.)
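// e.g. '(x >> y) & 1' becomes 'bt x, y' followed by setb, so the variable
// shift amount never needs to be moved into CL.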
51945 if (isOneConstant(N1) && N0->hasOneUse()) {
51946 SDValue Src = N0;
51947 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51948 Src.getOpcode() == ISD::TRUNCATE) &&
51949 Src.getOperand(0)->hasOneUse())
51950 Src = Src.getOperand(0);
51951 bool ContainsNOT = false;
51952 X86::CondCode X86CC = X86::COND_B;
51953 // Peek through AND(NOT(SRL(X,Y)),1).
51954 if (isBitwiseNot(Src)) {
51955 Src = Src.getOperand(0);
51956 X86CC = X86::COND_AE;
51957 ContainsNOT = true;
51958 }
51959 if (Src.getOpcode() == ISD::SRL &&
51960 !isa<ConstantSDNode>(Src.getOperand(1))) {
51961 SDValue BitNo = Src.getOperand(1);
51962 Src = Src.getOperand(0);
51963 // Peek through AND(SRL(NOT(X),Y),1).
51964 if (isBitwiseNot(Src)) {
51965 Src = Src.getOperand(0);
51966 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51967 ContainsNOT = true;
51968 }
51969 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51970 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51971 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51972 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51973 }
51974 }
51975
51976 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51977 // Attempt to recursively combine a bitmask AND with shuffles.
51978 SDValue Op(N, 0);
51979 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51980 return Res;
51981
51982 // If either operand is a constant mask, then only the elements that aren't
51983 // zero are actually demanded by the other operand.
51984 auto GetDemandedMasks = [&](SDValue Op) {
51985 APInt UndefElts;
51986 SmallVector<APInt> EltBits;
51987 int NumElts = VT.getVectorNumElements();
51988 int EltSizeInBits = VT.getScalarSizeInBits();
51989 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51990 APInt DemandedElts = APInt::getAllOnes(NumElts);
51991 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51992 EltBits)) {
51993 DemandedBits.clearAllBits();
51994 DemandedElts.clearAllBits();
51995 for (int I = 0; I != NumElts; ++I) {
51996 if (UndefElts[I]) {
51997 // We can't assume an undef src element gives an undef dst - the
51998 // other src might be zero.
51999 DemandedBits.setAllBits();
52000 DemandedElts.setBit(I);
52001 } else if (!EltBits[I].isZero()) {
52002 DemandedBits |= EltBits[I];
52003 DemandedElts.setBit(I);
52004 }
52005 }
52006 }
52007 return std::make_pair(DemandedBits, DemandedElts);
52008 };
52009 APInt Bits0, Elts0;
52010 APInt Bits1, Elts1;
52011 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52012 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52013
52014 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52015 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52016 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52017 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52018 if (N->getOpcode() != ISD::DELETED_NODE)
52019 DCI.AddToWorklist(N);
52020 return SDValue(N, 0);
52021 }
52022
52023 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52024 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52025 if (NewN0 || NewN1)
52026 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52027 NewN1 ? NewN1 : N1);
52028 }
52029
52030 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52031 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52032 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52033 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52034 SDValue BitMask = N1;
52035 SDValue SrcVec = N0.getOperand(0);
52036 EVT SrcVecVT = SrcVec.getValueType();
52037
52038 // Check that the constant bitmask masks whole bytes.
52039 APInt UndefElts;
52040 SmallVector<APInt, 64> EltBits;
52041 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52042 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52043 llvm::all_of(EltBits, [](const APInt &M) {
52044 return M.isZero() || M.isAllOnes();
52045 })) {
52046 unsigned NumElts = SrcVecVT.getVectorNumElements();
52047 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52048 unsigned Idx = N0.getConstantOperandVal(1);
52049
52050 // Create a root shuffle mask from the byte mask and the extracted index.
52051 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52052 for (unsigned i = 0; i != Scale; ++i) {
52053 if (UndefElts[i])
52054 continue;
52055 int VecIdx = Scale * Idx + i;
52056 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52057 }
52058
52059 if (SDValue Shuffle = combineX86ShufflesRecursively(
52060 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52061 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52062 /*AllowVariableCrossLaneMask=*/true,
52063 /*AllowVariablePerLaneMask=*/true,
52064 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52065 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52066 N0.getOperand(1));
52067 }
52068 }
52069
52070 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52071 return R;
52072
52073 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52074 return R;
52075
52076 return SDValue();
52077}
52078
52079 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52080 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52081 SelectionDAG &DAG,
52082 const X86Subtarget &Subtarget) {
52083 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52084
52085 MVT VT = N->getSimpleValueType(0);
52086 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52087 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52088 return SDValue();
52089
52090 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52091 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52092 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52093 return SDValue();
52094
52095 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52096 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52097 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52098 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52099 return SDValue();
52100
52101 // Attempt to extract constant byte masks.
52102 APInt UndefElts0, UndefElts1;
52103 SmallVector<APInt, 32> EltBits0, EltBits1;
52104 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52105 /*AllowWholeUndefs*/ false,
52106 /*AllowPartialUndefs*/ false))
52107 return SDValue();
52108 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52109 /*AllowWholeUndefs*/ false,
52110 /*AllowPartialUndefs*/ false))
52111 return SDValue();
52112
52113 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52114 // TODO - add UNDEF elts support.
52115 if (UndefElts0[i] || UndefElts1[i])
52116 return SDValue();
52117 if (EltBits0[i] != ~EltBits1[i])
52118 return SDValue();
52119 }
52120
52121 if (useVPTERNLOG(Subtarget, VT)) {
52122 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52123 // VPTERNLOG is only available as vXi32/64-bit types.
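// Bit i of the immediate is the output for the input bit triple (A,B,C) == i,
// so 0xCA == 0b11001010 evaluates to (A & B) | (~A & C), a bitwise select on A.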
52124 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52125 MVT OpVT =
52126 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52127 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52128 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52129 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52130 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52131 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52132 DAG, Subtarget);
52133 return DAG.getBitcast(VT, Res);
52134 }
52135
52136 SDValue X = N->getOperand(0);
52137 SDValue Y =
52138 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52139 DAG.getBitcast(VT, N1.getOperand(0)));
52140 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52141}
52142
52143// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52144// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52145// Waiting for ANDNP combine allows other combines to happen that prevent
52146// matching.
52147static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52148 using namespace SDPatternMatch;
52149 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52150 m_And(m_Deferred(Mask), m_Value(Y))));
52151}
52152
52153// Try to fold:
52154// (or (and (m, y), (pandn m, x)))
52155// into:
52156// (vselect m, x, y)
52157// As a special case, try to fold:
52158// (or (and (m, (sub 0, x)), (pandn m, x)))
52159// into:
52160// (sub (xor X, M), M)
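// PBLENDVB picks each byte according to the sign bit of the corresponding mask
// byte, which is why the mask must be known to be all-sign-bits per element.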
52161 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52162 SelectionDAG &DAG,
52163 const X86Subtarget &Subtarget) {
52164 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52165
52166 EVT VT = N->getValueType(0);
52167 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52168 (VT.is256BitVector() && Subtarget.hasInt256())))
52169 return SDValue();
52170
52171 SDValue X, Y, Mask;
52172 if (!matchLogicBlend(N, X, Y, Mask))
52173 return SDValue();
52174
52175 // Validate that X, Y, and Mask are bitcasts, and see through them.
52176 Mask = peekThroughBitcasts(Mask);
52177 X = peekThroughBitcasts(X);
52178 Y = peekThroughBitcasts(Y);
52179
52180 EVT MaskVT = Mask.getValueType();
52181 unsigned EltBits = MaskVT.getScalarSizeInBits();
52182
52183 // TODO: Attempt to handle floating point cases as well?
52184 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52185 return SDValue();
52186
52187 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52188 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52189 DAG, Subtarget))
52190 return Res;
52191
52192 // PBLENDVB is only available on SSE 4.1.
52193 if (!Subtarget.hasSSE41())
52194 return SDValue();
52195
52196 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52197 if (Subtarget.hasVLX())
52198 return SDValue();
52199
52200 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52201
52202 X = DAG.getBitcast(BlendVT, X);
52203 Y = DAG.getBitcast(BlendVT, Y);
52204 Mask = DAG.getBitcast(BlendVT, Mask);
52205 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52206 return DAG.getBitcast(VT, Mask);
52207}
52208
52209// Helper function for combineOrCmpEqZeroToCtlzSrl
52210// Transforms:
52211// seteq(cmp x, 0)
52212// into:
52213// srl(ctlz x), log2(bitsize(x))
52214// Input pattern is checked by caller.
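// For a 32-bit x, ctlz(x) equals 32 only when x == 0 (this combine requires
// a ctlz that is defined at zero), so shifting the count right by
// log2(32) = 5 gives 1 for x == 0 and 0 otherwise - the same value as
// seteq(cmp x, 0).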
52215 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52216 SDValue Cmp = Op.getOperand(1);
52217 EVT VT = Cmp.getOperand(0).getValueType();
52218 unsigned Log2b = Log2_32(VT.getSizeInBits());
52219 SDLoc dl(Op);
52220 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52221 // The result of the shift is true or false, and on X86, the 32-bit
52222 // encoding of shr and lzcnt is more desirable.
52223 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52224 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52225 DAG.getConstant(Log2b, dl, MVT::i8));
52226 return Scc;
52227}
52228
52229// Try to transform:
52230// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52231// into:
52232 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52233// Will also attempt to match more generic cases, eg:
52234// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52235// Only applies if the target supports the FastLZCNT feature.
52236 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52237 TargetLowering::DAGCombinerInfo &DCI,
52238 const X86Subtarget &Subtarget) {
52239 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52240 return SDValue();
52241
52242 auto isORCandidate = [](SDValue N) {
52243 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52244 };
52245
52246 // Check the zero extend is extending to 32-bit or more. The code generated by
52247 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52248 // instructions to clear the upper bits.
52249 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52250 !isORCandidate(N->getOperand(0)))
52251 return SDValue();
52252
52253 // Check the node matches: setcc(eq, cmp 0)
52254 auto isSetCCCandidate = [](SDValue N) {
52255 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52256 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52257 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52258 isNullConstant(N->getOperand(1).getOperand(1)) &&
52259 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52260 };
52261
52262 SDNode *OR = N->getOperand(0).getNode();
52263 SDValue LHS = OR->getOperand(0);
52264 SDValue RHS = OR->getOperand(1);
52265
52266 // Save nodes matching or(or, setcc(eq, cmp 0)).
52267 SmallVector<SDNode *, 2> ORNodes;
52268 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52269 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52270 ORNodes.push_back(OR);
52271 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52272 LHS = OR->getOperand(0);
52273 RHS = OR->getOperand(1);
52274 }
52275
52276 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52277 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52278 !isORCandidate(SDValue(OR, 0)))
52279 return SDValue();
52280
52281 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52282 // to
52283 // or(srl(ctlz),srl(ctlz)).
52284 // The dag combiner can then fold it into:
52285 // srl(or(ctlz, ctlz)).
52286 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52287 SDValue Ret, NewRHS;
52288 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52289 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52290
52291 if (!Ret)
52292 return SDValue();
52293
52294 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52295 while (!ORNodes.empty()) {
52296 OR = ORNodes.pop_back_val();
52297 LHS = OR->getOperand(0);
52298 RHS = OR->getOperand(1);
52299 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52300 if (RHS->getOpcode() == ISD::OR)
52301 std::swap(LHS, RHS);
52302 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52303 if (!NewRHS)
52304 return SDValue();
52305 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52306 }
52307
52308 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52309}
52310
52311/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52312/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52313/// with CMP+{ADC, SBB}.
52314/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
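/// For example, "x + (a < b ? 1 : 0)" with an unsigned compare becomes
/// CMP a, b followed by ADC x, 0: the compare leaves its result in the carry
/// flag, which ADC folds into the addition without materializing a SETB.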
52315static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52316 SDValue X, SDValue Y,
52317 SelectionDAG &DAG,
52318 bool ZeroSecondOpOnly = false) {
52319 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52320 return SDValue();
52321
52322 // Look through a one-use zext.
52323 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52324 Y = Y.getOperand(0);
52325
52326 X86::CondCode CC;
52327 SDValue EFLAGS;
52328 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52329 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52330 EFLAGS = Y.getOperand(1);
52331 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52332 Y.hasOneUse()) {
52333 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52334 }
52335
52336 if (!EFLAGS)
52337 return SDValue();
52338
52339 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52340 // the general case below.
52341 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52342 if (ConstantX && !ZeroSecondOpOnly) {
52343 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52344 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52345 // This is a complicated way to get -1 or 0 from the carry flag:
52346 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52347 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52348 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52349 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52350 EFLAGS);
52351 }
52352
52353 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52354 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52355 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52356 EFLAGS.getValueType().isInteger() &&
52357 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52358 // Swap the operands of a SUB, and we have the same pattern as above.
52359 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52360 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52361 SDValue NewSub = DAG.getNode(
52362 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52363 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52364 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52365 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52366 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52367 NewEFLAGS);
52368 }
52369 }
52370 }
52371
52372 if (CC == X86::COND_B) {
52373 // X + SETB Z --> adc X, 0
52374 // X - SETB Z --> sbb X, 0
52375 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52376 DAG.getVTList(VT, MVT::i32), X,
52377 DAG.getConstant(0, DL, VT), EFLAGS);
52378 }
52379
52380 if (ZeroSecondOpOnly)
52381 return SDValue();
52382
52383 if (CC == X86::COND_A) {
52384 // Try to convert COND_A into COND_B in an attempt to facilitate
52385 // materializing "setb reg".
52386 //
52387 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52388 // cannot take an immediate as its first operand.
52389 //
52390 // If EFLAGS is from a CMP that compares the same operands as the earlier
52391 // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with
52392 // SBB/ADC without creating a flipped SUB.
52393 if (EFLAGS.getOpcode() == X86ISD::CMP &&
52394 EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) {
52395 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52396 DAG.getVTList(VT, MVT::i32), X,
52397 DAG.getConstant(0, DL, VT), EFLAGS);
52398 }
52399
52400 if (EFLAGS.getOpcode() == X86ISD::SUB &&
52401 EFLAGS.getValueType().isInteger() &&
52402 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52403 // Only create NewSub if we know one of the folds will succeed to avoid
52404 // introducing a temporary node that may persist and affect one-use checks
52405 // below.
52406 if (EFLAGS.getNode()->hasOneUse()) {
52407 SDValue NewSub = DAG.getNode(
52408 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52409 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52410 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52411 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52412 DAG.getVTList(VT, MVT::i32), X,
52413 DAG.getConstant(0, DL, VT), NewEFLAGS);
52414 }
52415
52416 if (IsSub && X == EFLAGS.getValue(0)) {
52417 SDValue NewSub = DAG.getNode(
52418 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52419 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52420 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52421 return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32),
52422 EFLAGS.getOperand(0), EFLAGS.getOperand(1),
52423 NewEFLAGS);
52424 }
52425 }
52426 }
52427
52428 if (CC == X86::COND_AE) {
52429 // X + SETAE --> sbb X, -1
52430 // X - SETAE --> adc X, -1
52431 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52432 DAG.getVTList(VT, MVT::i32), X,
52433 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52434 }
52435
52436 if (CC == X86::COND_BE) {
52437 // X + SETBE --> sbb X, -1
52438 // X - SETBE --> adc X, -1
52439 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52440 // materializing "setae reg".
52441 //
52442 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52443 // cannot take an immediate as its first operand.
52444 //
52445 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52446 EFLAGS.getValueType().isInteger() &&
52447 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52448 SDValue NewSub =
52449 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52450 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52451 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52452 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52453 DAG.getVTList(VT, MVT::i32), X,
52454 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52455 }
52456 }
52457
52458 if (CC != X86::COND_E && CC != X86::COND_NE)
52459 return SDValue();
52460
52461 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52462 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52463 !EFLAGS.getOperand(0).getValueType().isInteger())
52464 return SDValue();
52465
52466 SDValue Z = EFLAGS.getOperand(0);
52467 EVT ZVT = Z.getValueType();
52468
52469 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52470 // the general case below.
52471 if (ConstantX) {
52472 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52473 // fake operands:
52474 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52475 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52476 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52477 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52478 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52479 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52480 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52481 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52482 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52483 SDValue(Neg.getNode(), 1));
52484 }
52485
52486 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52487 // with fake operands:
52488 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52489 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52490 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52491 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52492 SDValue One = DAG.getConstant(1, DL, ZVT);
52493 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52494 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52495 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52496 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52497 Cmp1.getValue(1));
52498 }
52499 }
52500
52501 // (cmp Z, 1) sets the carry flag if Z is 0.
52502 SDValue One = DAG.getConstant(1, DL, ZVT);
52503 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52504 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52505
52506 // Add the flags type for ADC/SBB nodes.
52507 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52508
52509 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52510 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52511 if (CC == X86::COND_NE)
52512 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52513 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52514
52515 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52516 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52517 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52518 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52519}
52520
52521/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52522/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52523/// with CMP+{ADC, SBB}.
52524 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52525 SelectionDAG &DAG) {
52526 bool IsSub = N->getOpcode() == ISD::SUB;
52527 SDValue X = N->getOperand(0);
52528 SDValue Y = N->getOperand(1);
52529 EVT VT = N->getValueType(0);
52530
52531 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52532 return ADCOrSBB;
52533
52534 // Commute and try again (negate the result for subtracts).
52535 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52536 if (IsSub)
52537 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52538 return ADCOrSBB;
52539 }
52540
52541 return SDValue();
52542}
52543
52544static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52545 SDValue N0, SDValue N1,
52546 SelectionDAG &DAG) {
52547 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52548
52549 // Delegate to combineAddOrSubToADCOrSBB if we have:
52550 //
52551 // (xor/or (zero_extend (setcc)) imm)
52552 //
52553 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52554 // equivalent to a SUB/ADD, respectively.
52555 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52556 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52557 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52558 bool IsSub = Opc == ISD::XOR;
52559 bool N1COdd = N1C->getZExtValue() & 1;
52560 if (IsSub ? N1COdd : !N1COdd)
52561 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52562 return R;
52563 }
52564 }
52565
52566 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52567 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52568 N0.getOperand(0).getOpcode() == ISD::AND &&
52569 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52570 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52571 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52572 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52573 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52574 N0.getOperand(0).getOperand(1));
52575 }
52576
52577 return SDValue();
52578}
52579
52580 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52581 TargetLowering::DAGCombinerInfo &DCI,
52582 const X86Subtarget &Subtarget) {
52583 SDValue N0 = N->getOperand(0);
52584 SDValue N1 = N->getOperand(1);
52585 EVT VT = N->getValueType(0);
52586 SDLoc dl(N);
52587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52588
52589 // If this is SSE1 only convert to FOR to avoid scalarization.
52590 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52591 return DAG.getBitcast(MVT::v4i32,
52592 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52593 DAG.getBitcast(MVT::v4f32, N0),
52594 DAG.getBitcast(MVT::v4f32, N1)));
52595 }
52596
52597 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52598 // TODO: Support multiple SrcOps.
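// For example, OR'ing together the four lanes of a v4i1 compare result can
// become a MOVMSK of the compared vector followed by a test of the relevant
// mask bits against zero.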
52599 if (VT == MVT::i1) {
52600 SmallVector<SDValue, 2> SrcOps;
52601 SmallVector<APInt, 2> SrcPartials;
52602 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52603 SrcOps.size() == 1) {
52604 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52605 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52606 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52607 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52608 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52609 if (Mask) {
52610 assert(SrcPartials[0].getBitWidth() == NumElts &&
52611 "Unexpected partial reduction mask");
52612 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52613 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52614 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52615 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52616 }
52617 }
52618 }
52619
52620 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52621 return SetCC;
52622
52623 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52624 return R;
52625
52626 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52627 return R;
52628
52629 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52630 return R;
52631
52632 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52633 DAG, DCI, Subtarget))
52634 return FPLogic;
52635
52636 if (DCI.isBeforeLegalizeOps())
52637 return SDValue();
52638
52639 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52640 return R;
52641
52642 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52643 return R;
52644
52645 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52646 return R;
52647
52648 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52649 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52650 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52651 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52652 uint64_t Val = CN->getZExtValue();
52653 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52654 Val == 8) {
52655 SDValue NotCond;
52656 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52657 N0.getOperand(1).hasOneUse()) {
52658 X86::CondCode CC = X86::CondCode(N0.getConstantOperandVal(0));
52659 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52660 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52661 } else if (N0.getOpcode() == ISD::SUB &&
52662 isNullConstant(N0.getOperand(0))) {
52663 SDValue Cond = N0.getOperand(1);
52664 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52665 Cond = Cond.getOperand(0);
52666 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52667 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52668 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52669 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52670 }
52671 }
52672
52673 if (NotCond) {
52674 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52675 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52676 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52677 return R;
52678 }
52679 }
52680 }
52681 }
52682
52683 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52684 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52685 // iff the upper elements of the non-shifted arg are zero.
52686 // KUNPCK requires 16+ bool vector elements.
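// For example, with v32i1 masks: OR(X, KSHIFTL(Y, 16)) where the upper 16
// lanes of X are known zero is CONCAT_VECTORS(X[0..15], Y[0..15]), which can
// be selected as a single KUNPCKWD.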
52687 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52688 unsigned NumElts = VT.getVectorNumElements();
52689 unsigned HalfElts = NumElts / 2;
52690 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52691 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52692 N1.getConstantOperandAPInt(1) == HalfElts &&
52693 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52694 return DAG.getNode(
52695 ISD::CONCAT_VECTORS, dl, VT,
52696 extractSubVector(N0, 0, DAG, dl, HalfElts),
52697 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52698 }
52699 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52700 N0.getConstantOperandAPInt(1) == HalfElts &&
52701 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52702 return DAG.getNode(
52703 ISD::CONCAT_VECTORS, dl, VT,
52704 extractSubVector(N1, 0, DAG, dl, HalfElts),
52705 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52706 }
52707 }
52708
52709 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52710 // Attempt to recursively combine an OR of shuffles.
52711 SDValue Op(N, 0);
52712 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52713 return Res;
52714
52715 // If either operand is a constant mask, then only the elements that aren't
52716 // allones are actually demanded by the other operand.
52717 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52718 APInt UndefElts;
52719 SmallVector<APInt> EltBits;
52720 int NumElts = VT.getVectorNumElements();
52721 int EltSizeInBits = VT.getScalarSizeInBits();
52722 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52723 return false;
52724
52725 APInt DemandedElts = APInt::getZero(NumElts);
52726 for (int I = 0; I != NumElts; ++I)
52727 if (!EltBits[I].isAllOnes())
52728 DemandedElts.setBit(I);
52729
52730 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52731 };
52732 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52733 if (N->getOpcode() != ISD::DELETED_NODE)
52734 DCI.AddToWorklist(N);
52735 return SDValue(N, 0);
52736 }
52737 }
52738
52739 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52740 return R;
52741
52742 return SDValue();
52743}
52744
52745/// Try to turn tests against the signbit in the form of:
52746/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52747/// into:
52748/// SETGT(X, -1)
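/// For a 32-bit X, SRL(X, 31) is 1 exactly when X is negative, so XOR'ing the
/// truncated bit with 1 yields "X is non-negative", i.e. SETGT(X, -1).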
52749 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52750 SelectionDAG &DAG) {
52751 // This is only worth doing if the output type is i8 or i1.
52752 EVT ResultType = N->getValueType(0);
52753 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52754 return SDValue();
52755
52756 SDValue N0 = N->getOperand(0);
52757 SDValue N1 = N->getOperand(1);
52758
52759 // We should be performing an xor against a truncated shift.
52760 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52761 return SDValue();
52762
52763 // Make sure we are performing an xor against one.
52764 if (!isOneConstant(N1))
52765 return SDValue();
52766
52767 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52768 SDValue Shift = N0.getOperand(0);
52769 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52770 return SDValue();
52771
52772 // Make sure we are truncating from one of i16, i32 or i64.
52773 EVT ShiftTy = Shift.getValueType();
52774 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52775 return SDValue();
52776
52777 // Make sure the shift amount extracts the sign bit.
52778 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52779 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52780 return SDValue();
52781
52782 // Create a greater-than comparison against -1.
52783 // N.B. Using SETGE against 0 works but we want a canonical looking
52784 // comparison; using SETGT matches up with what TranslateX86CC expects.
52785 SDValue ShiftOp = Shift.getOperand(0);
52786 EVT ShiftOpTy = ShiftOp.getValueType();
52787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52788 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52789 *DAG.getContext(), ResultType);
52790 SDValue Cond =
52791 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52792 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52793 if (SetCCResultType != ResultType)
52794 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52795 return Cond;
52796}
52797
52798/// Turn vector tests of the signbit in the form of:
52799/// xor (sra X, elt_size(X)-1), -1
52800/// into:
52801/// pcmpgt X, -1
52802///
52803/// This should be called before type legalization because the pattern may not
52804/// persist after that.
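/// The arithmetic shift smears the sign bit across each lane (all-ones for
/// negative elements, zero otherwise), so inverting that result is the same
/// as testing "element > -1", which PCMPGT against an all-ones vector gives.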
52805 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52806 const X86Subtarget &Subtarget) {
52807 EVT VT = N->getValueType(0);
52808 if (!VT.isSimple())
52809 return SDValue();
52810
52811 switch (VT.getSimpleVT().SimpleTy) {
52812 // clang-format off
52813 default: return SDValue();
52814 case MVT::v16i8:
52815 case MVT::v8i16:
52816 case MVT::v4i32:
52817 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52818 case MVT::v32i8:
52819 case MVT::v16i16:
52820 case MVT::v8i32:
52821 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52822 // clang-format on
52823 }
52824
52825 // There must be a shift right algebraic before the xor, and the xor must be a
52826 // 'not' operation.
52827 SDValue Shift = N->getOperand(0);
52828 SDValue Ones = N->getOperand(1);
52829 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52830 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52831 return SDValue();
52832
52833 // The shift should be smearing the sign bit across each vector element.
52834 auto *ShiftAmt =
52835 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52836 if (!ShiftAmt ||
52837 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52838 return SDValue();
52839
52840 // Create a greater-than comparison against -1. We don't use the more obvious
52841 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52842 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52843}
52844
52845/// Detect patterns of truncation with unsigned saturation:
52846///
52847/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52848/// Return the source value x to be truncated or SDValue() if the pattern was
52849/// not matched.
52850///
52851/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52852/// where C1 >= 0 and C2 is unsigned max of destination type.
52853///
52854/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52855/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52856///
52857/// These two patterns are equivalent to:
52858/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52859/// So return the smax(x, C1) value to be truncated or SDValue() if the
52860/// pattern was not matched.
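/// For an i32 -> i8 truncation, pattern 1 is umin(x, 255); pattern 2 is
/// smin(smax(x, C1), 255) with C1 >= 0, whose result always lies in the
/// unsigned 8-bit range [0, 255].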
52861 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52862 const SDLoc &DL) {
52863 using namespace llvm::SDPatternMatch;
52864 EVT InVT = In.getValueType();
52865
52866 // Saturation with truncation. We truncate from InVT to VT.
52867 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52868 "Unexpected types for truncate operation");
52869
52870 APInt C1, C2;
52871 SDValue UMin, SMin, SMax;
52872
52873 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52874 // the element size of the destination type.
52875 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52876 C2.isMask(VT.getScalarSizeInBits()))
52877 return UMin;
52878
52879 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52880 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52881 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52882 return SMin;
52883
52884 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52885 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52886 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52887 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52888
52889 return SDValue();
52890}
52891
52892/// Detect patterns of truncation with signed saturation:
52893/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52894/// signed_max_of_dest_type)) to dest_type)
52895/// or:
52896/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52897/// signed_min_of_dest_type)) to dest_type).
52898/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52899/// Return the source value to be truncated or SDValue() if the pattern was not
52900/// matched.
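/// For an i16 destination, SignedMax/SignedMin below are 32767/-32768 (sign
/// extended to the source width); with MatchPackUS they are 65535/0, the
/// range a PACKUS-style unsigned-saturating pack can produce.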
52901static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52902 using namespace llvm::SDPatternMatch;
52903 unsigned NumDstBits = VT.getScalarSizeInBits();
52904 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52905 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52906
52907 APInt SignedMax, SignedMin;
52908 if (MatchPackUS) {
52909 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52910 SignedMin = APInt::getZero(NumSrcBits);
52911 } else {
52912 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52913 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52914 }
52915
52916 SDValue SMin, SMax;
52917 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52918 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52919 return SMax;
52920
52921 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52922 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52923 return SMin;
52924
52925 return SDValue();
52926}
52927
52928 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52929 SelectionDAG &DAG,
52930 const X86Subtarget &Subtarget) {
52931 if (!Subtarget.hasSSE2() || !VT.isVector())
52932 return SDValue();
52933
52934 EVT SVT = VT.getVectorElementType();
52935 EVT InVT = In.getValueType();
52936 EVT InSVT = InVT.getVectorElementType();
52937
52938 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52939 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52940 // and concatenate at the same time. Then we can use a final vpmovuswb to
52941 // clip to 0-255.
52942 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52943 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52944 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52945 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52946 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52947 DL, DAG, Subtarget);
52948 assert(Mid && "Failed to pack!");
52949 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52950 }
52951 }
52952
52953 // vXi32 truncate instructions are available with AVX512F.
52954 // vXi16 truncate instructions are only available with AVX512BW.
52955 // For 256-bit or smaller vectors, we require VLX.
52956 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52957 // If the result type is 256-bits or larger and we have disabled 512-bit
52958 // registers, we should go ahead and use the pack instructions if possible.
52959 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52960 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52961 (InVT.getSizeInBits() > 128) &&
52962 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52963 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52964
52965 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52966 isPowerOf2_32(VT.getVectorNumElements()) &&
52967 (SVT == MVT::i8 || SVT == MVT::i16) &&
52968 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52969 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52970 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52971 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52972 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52973 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52974 DAG, Subtarget);
52975 assert(Mid && "Failed to pack!");
52976 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52977 Subtarget);
52978 assert(V && "Failed to pack!");
52979 return V;
52980 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52981 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52982 Subtarget);
52983 }
52984 if (SDValue SSatVal = detectSSatPattern(In, VT))
52985 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52986 Subtarget);
52987 }
52988
52989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52990 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52991 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52992 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52993 unsigned TruncOpc = 0;
52994 SDValue SatVal;
52995 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52996 SatVal = SSatVal;
52997 TruncOpc = X86ISD::VTRUNCS;
52998 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52999 SatVal = USatVal;
53000 TruncOpc = X86ISD::VTRUNCUS;
53001 }
53002 if (SatVal) {
53003 unsigned ResElts = VT.getVectorNumElements();
53004 // If the input type is less than 512 bits and we don't have VLX, we need
53005 // to widen to 512 bits.
53006 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
53007 unsigned NumConcats = 512 / InVT.getSizeInBits();
53008 ResElts *= NumConcats;
53009 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
53010 ConcatOps[0] = SatVal;
53011 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
53012 NumConcats * InVT.getVectorNumElements());
53013 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
53014 }
53015 // Widen the result if its narrower than 128 bits.
53016 if (ResElts * SVT.getSizeInBits() < 128)
53017 ResElts = 128 / SVT.getSizeInBits();
53018 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
53019 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
53020 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
53021 DAG.getVectorIdxConstant(0, DL));
53022 }
53023 }
53024
53025 return SDValue();
53026}
53027
53028 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
53029 SelectionDAG &DAG,
53030 TargetLowering::DAGCombinerInfo &DCI,
53031 const X86Subtarget &Subtarget) {
53032 auto *Ld = cast<LoadSDNode>(N);
53033 EVT RegVT = Ld->getValueType(0);
53034 SDValue Ptr = Ld->getBasePtr();
53035 SDValue Chain = Ld->getChain();
53036 ISD::LoadExtType Ext = Ld->getExtensionType();
53037
53038 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53039 return SDValue();
53040
53041 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53042 return SDValue();
53043
53044 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53045 if (!LdC)
53046 return SDValue();
53047
53048 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53049 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53050 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53051 if (Undefs[I])
53052 continue;
53053 if (UserUndefs[I] || Bits[I] != UserBits[I])
53054 return false;
53055 }
53056 return true;
53057 };
53058
53059 // Look through all other loads/broadcasts in the chain for another constant
53060 // pool entry.
53061 for (SDNode *User : Chain->users()) {
53062 auto *UserLd = dyn_cast<MemSDNode>(User);
53063 if (User != N && UserLd &&
53064 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53065 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53066 ISD::isNormalLoad(User)) &&
53067 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53068 User->getValueSizeInBits(0).getFixedValue() >
53069 RegVT.getFixedSizeInBits()) {
53070 EVT UserVT = User->getValueType(0);
53071 SDValue UserPtr = UserLd->getBasePtr();
53072 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53073
53074 // See if we are loading a constant that matches in the lower
53075 // bits of a longer constant (but from a different constant pool ptr).
53076 if (UserC && UserPtr != Ptr) {
53077 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53078 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53079 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53080 APInt Undefs, UserUndefs;
53081 SmallVector<APInt> Bits, UserBits;
53082 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53083 UserVT.getScalarSizeInBits());
53084 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53085 Bits) &&
53086 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53087 UserUndefs, UserBits)) {
53088 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53090 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53091 RegVT.getSizeInBits());
53092 Extract = DAG.getBitcast(RegVT, Extract);
53093 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53094 }
53095 }
53096 }
53097 }
53098 }
53099 }
53100
53101 return SDValue();
53102}
53103
53104 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53105 TargetLowering::DAGCombinerInfo &DCI,
53106 const X86Subtarget &Subtarget) {
53107 auto *Ld = cast<LoadSDNode>(N);
53108 EVT RegVT = Ld->getValueType(0);
53109 EVT MemVT = Ld->getMemoryVT();
53110 SDLoc dl(Ld);
53111 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53112
53113 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53114 // into two 16-byte operations. Also split non-temporal aligned loads on
53115 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53116 ISD::LoadExtType Ext = Ld->getExtensionType();
53117 unsigned Fast;
53118 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53119 Ext == ISD::NON_EXTLOAD &&
53120 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53121 Ld->getAlign() >= Align(16)) ||
53122 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53123 *Ld->getMemOperand(), &Fast) &&
53124 !Fast))) {
53125 unsigned NumElems = RegVT.getVectorNumElements();
53126 if (NumElems < 2)
53127 return SDValue();
53128
53129 unsigned HalfOffset = 16;
53130 SDValue Ptr1 = Ld->getBasePtr();
53131 SDValue Ptr2 =
53132 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53133 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53134 NumElems / 2);
53135 SDValue Load1 =
53136 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53137 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53138 SDValue Load2 =
53139 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53140 Ld->getPointerInfo().getWithOffset(HalfOffset),
53141 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53142 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53143 Load1.getValue(1), Load2.getValue(1));
53144
53145 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53146 return DCI.CombineTo(N, NewVec, TF, true);
53147 }
53148
53149 // Bool vector load - attempt to cast to an integer, as we have good
53150 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53151 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53152 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53153 unsigned NumElts = RegVT.getVectorNumElements();
53154 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53155 if (TLI.isTypeLegal(IntVT)) {
53156 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53157 Ld->getPointerInfo(), Ld->getBaseAlign(),
53158 Ld->getMemOperand()->getFlags());
53159 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53160 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53161 }
53162 }
53163
53164 // If we also broadcast this vector to a wider type, then just extract the
53165 // lowest subvector.
53166 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53167 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53168 SDValue Ptr = Ld->getBasePtr();
53169 SDValue Chain = Ld->getChain();
53170 for (SDNode *User : Chain->users()) {
53171 auto *UserLd = dyn_cast<MemSDNode>(User);
53172 if (User != N && UserLd &&
53173 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53174 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53175 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53176 User->hasAnyUseOfValue(0) &&
53177 User->getValueSizeInBits(0).getFixedValue() >
53178 RegVT.getFixedSizeInBits()) {
53180 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53181 RegVT.getSizeInBits());
53182 Extract = DAG.getBitcast(RegVT, Extract);
53183 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53184 }
53185 }
53186 }
53187
53188 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53189 return V;
53190
53191 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53192 unsigned AddrSpace = Ld->getAddressSpace();
53193 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53194 AddrSpace == X86AS::PTR32_UPTR) {
53195 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53196 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53197 SDValue Cast =
53198 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53199 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53200 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53201 Ld->getMemOperand()->getFlags());
53202 }
53203 }
53204
53205 return SDValue();
53206}
53207
53208/// If V is a build vector of boolean constants and exactly one of those
53209/// constants is true, return the operand index of that true element.
53210/// Otherwise, return -1.
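/// For example, a mask of <i1 0, i1 0, i1 1, i1 0> returns 2, while both an
/// all-zeros mask and a mask with more than one true element return -1.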
53211static int getOneTrueElt(SDValue V) {
53212 // This needs to be a build vector of booleans.
53213 // TODO: Checking for the i1 type matches the IR definition for the mask,
53214 // but the mask check could be loosened to i8 or other types. That might
53215 // also require checking more than 'allOnesValue'; eg, the x86 HW
53216 // instructions only require that the MSB is set for each mask element.
53217 // The ISD::MSTORE comments/definition do not specify how the mask operand
53218 // is formatted.
53219 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53220 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53221 return -1;
53222
53223 int TrueIndex = -1;
53224 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53225 for (unsigned i = 0; i < NumElts; ++i) {
53226 const SDValue &Op = BV->getOperand(i);
53227 if (Op.isUndef())
53228 continue;
53229 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53230 if (!ConstNode)
53231 return -1;
53232 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53233 // If we already found a one, this is too many.
53234 if (TrueIndex >= 0)
53235 return -1;
53236 TrueIndex = i;
53237 }
53238 }
53239 return TrueIndex;
53240}
53241
53242/// Given a masked memory load/store operation, return true if it has one mask
53243/// bit set. If it has one mask bit set, then also return the memory address of
53244/// the scalar element to load/store, the vector index to insert/extract that
53245/// scalar element, and the alignment for the scalar memory access.
53246 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53247 SelectionDAG &DAG, SDValue &Addr,
53248 SDValue &Index, Align &Alignment,
53249 unsigned &Offset) {
53250 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53251 if (TrueMaskElt < 0)
53252 return false;
53253
53254 // Get the address of the one scalar element that is specified by the mask
53255 // using the appropriate offset from the base pointer.
53256 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53257 Offset = 0;
53258 Addr = MaskedOp->getBasePtr();
53259 if (TrueMaskElt != 0) {
53260 Offset = TrueMaskElt * EltVT.getStoreSize();
53261 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53262 SDLoc(MaskedOp));
53263 }
53264
53265 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53266 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53267 return true;
53268}
53269
53270/// If exactly one element of the mask is set for a non-extending masked load,
53271/// it is a scalar load and vector insert.
53272/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53273/// mask have already been optimized in IR, so we don't bother with those here.
53274static SDValue
53275 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53276 TargetLowering::DAGCombinerInfo &DCI,
53277 const X86Subtarget &Subtarget) {
53278 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53279 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53280 // However, some target hooks may need to be added to know when the transform
53281 // is profitable. Endianness would also have to be considered.
53282
53283 SDValue Addr, VecIndex;
53284 Align Alignment;
53285 unsigned Offset;
53286 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53287 return SDValue();
53288
53289 // Load the one scalar element that is specified by the mask using the
53290 // appropriate offset from the base pointer.
53291 SDLoc DL(ML);
53292 EVT VT = ML->getValueType(0);
53293 EVT EltVT = VT.getVectorElementType();
53294
53295 EVT CastVT = VT;
53296 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53297 EltVT = MVT::f64;
53298 CastVT = VT.changeVectorElementType(EltVT);
53299 }
53300
53301 SDValue Load =
53302 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53303 ML->getPointerInfo().getWithOffset(Offset),
53304 Alignment, ML->getMemOperand()->getFlags());
53305
53306 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53307
53308 // Insert the loaded element into the appropriate place in the vector.
53309 SDValue Insert =
53310 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53311 Insert = DAG.getBitcast(VT, Insert);
53312 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53313}
53314
53315static SDValue
53316 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53317 TargetLowering::DAGCombinerInfo &DCI) {
53318 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53319 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53320 return SDValue();
53321
53322 SDLoc DL(ML);
53323 EVT VT = ML->getValueType(0);
53324
53325 // If we are loading the first and last elements of a vector, it is safe and
53326 // always faster to load the whole vector. Replace the masked load with a
53327 // vector load and select.
53328 unsigned NumElts = VT.getVectorNumElements();
53329 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53330 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53331 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53332 if (LoadFirstElt && LoadLastElt) {
53333 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53334 ML->getMemOperand());
53335 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53336 ML->getPassThru());
53337 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53338 }
53339
53340 // Convert a masked load with a constant mask into a masked load and a select.
53341 // This allows the select operation to use a faster kind of select instruction
53342 // (for example, vblendvps -> vblendps).
53343
53344 // Don't try this if the pass-through operand is already undefined. That would
53345 // cause an infinite loop because that's what we're about to create.
53346 if (ML->getPassThru().isUndef())
53347 return SDValue();
53348
53349 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53350 return SDValue();
53351
53352 // The new masked load has an undef pass-through operand. The select uses the
53353 // original pass-through operand.
53354 SDValue NewML = DAG.getMaskedLoad(
53355 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53356 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53357 ML->getAddressingMode(), ML->getExtensionType());
53358 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53359 ML->getPassThru());
53360
53361 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53362}
53363
53364 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53365 TargetLowering::DAGCombinerInfo &DCI,
53366 const X86Subtarget &Subtarget) {
53367 auto *Mld = cast<MaskedLoadSDNode>(N);
53368
53369 // TODO: Expanding load with constant mask may be optimized as well.
53370 if (Mld->isExpandingLoad())
53371 return SDValue();
53372
53373 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53374 if (SDValue ScalarLoad =
53375 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53376 return ScalarLoad;
53377
53378 // TODO: Do some AVX512 subsets benefit from this transform?
53379 if (!Subtarget.hasAVX512())
53380 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53381 return Blend;
53382 }
53383
53384 // If the mask value has been legalized to a non-boolean vector, try to
53385 // simplify ops leading up to it. We only demand the MSB of each lane.
53386 SDValue Mask = Mld->getMask();
53387 if (Mask.getScalarValueSizeInBits() != 1) {
53388 EVT VT = Mld->getValueType(0);
53389 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53390 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53391 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53392 if (N->getOpcode() != ISD::DELETED_NODE)
53393 DCI.AddToWorklist(N);
53394 return SDValue(N, 0);
53395 }
53396 if (SDValue NewMask =
53397 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53398 return DAG.getMaskedLoad(
53399 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53400 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53401 Mld->getAddressingMode(), Mld->getExtensionType());
53402 }
53403
53404 return SDValue();
53405}
53406
53407/// If exactly one element of the mask is set for a non-truncating masked store,
53408/// it is a vector extract and scalar store.
53409/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53410/// mask have already been optimized in IR, so we don't bother with those here.
53411 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53412 SelectionDAG &DAG,
53413 const X86Subtarget &Subtarget) {
53414 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53415 // However, some target hooks may need to be added to know when the transform
53416 // is profitable. Endianness would also have to be considered.
53417
53418 SDValue Addr, VecIndex;
53419 Align Alignment;
53420 unsigned Offset;
53421 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53422 return SDValue();
53423
53424 // Extract the one scalar element that is actually being stored.
53425 SDLoc DL(MS);
53426 SDValue Value = MS->getValue();
53427 EVT VT = Value.getValueType();
53428 EVT EltVT = VT.getVectorElementType();
53429 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53430 EltVT = MVT::f64;
53431 EVT CastVT = VT.changeVectorElementType(EltVT);
53432 Value = DAG.getBitcast(CastVT, Value);
53433 }
53434 SDValue Extract =
53435 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53436
53437 // Store that element at the appropriate offset from the base pointer.
53438 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53439 MS->getPointerInfo().getWithOffset(Offset),
53440 Alignment, MS->getMemOperand()->getFlags());
53441}
53442
53443 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53444 TargetLowering::DAGCombinerInfo &DCI,
53445 const X86Subtarget &Subtarget) {
53446 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53447 if (Mst->isCompressingStore())
53448 return SDValue();
53449
53450 EVT VT = Mst->getValue().getValueType();
53451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53452
53453 if (Mst->isTruncatingStore())
53454 return SDValue();
53455
53456 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53457 return ScalarStore;
53458
53459 // If the mask value has been legalized to a non-boolean vector, try to
53460 // simplify ops leading up to it. We only demand the MSB of each lane.
53461 SDValue Mask = Mst->getMask();
53462 if (Mask.getScalarValueSizeInBits() != 1) {
53463 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53464 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53465 if (N->getOpcode() != ISD::DELETED_NODE)
53466 DCI.AddToWorklist(N);
53467 return SDValue(N, 0);
53468 }
53469 if (SDValue NewMask =
53470 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53471 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53472 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53473 Mst->getMemoryVT(), Mst->getMemOperand(),
53474 Mst->getAddressingMode());
53475 }
53476
53477 SDValue Value = Mst->getValue();
53478 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53479 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53480 Mst->getMemoryVT())) {
53481 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53482 Mst->getBasePtr(), Mst->getOffset(), Mask,
53483 Mst->getMemoryVT(), Mst->getMemOperand(),
53484 Mst->getAddressingMode(), true);
53485 }
53486
53487 return SDValue();
53488}
53489
53490 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53491 TargetLowering::DAGCombinerInfo &DCI,
53492 const X86Subtarget &Subtarget) {
53493 StoreSDNode *St = cast<StoreSDNode>(N);
53494 EVT StVT = St->getMemoryVT();
53495 SDLoc dl(St);
53496 SDValue StoredVal = St->getValue();
53497 EVT VT = StoredVal.getValueType();
53498 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53499
53500 // Convert a store of vXi1 into a store of iX and a bitcast.
53501 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53502 VT.getVectorElementType() == MVT::i1) {
53503
53504 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53505 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53506
53507 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53508 St->getPointerInfo(), St->getBaseAlign(),
53509 St->getMemOperand()->getFlags());
53510 }
53511
53512 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53513 // This will avoid a copy to k-register.
53514 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53515 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53516 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53517 SDValue Val = StoredVal.getOperand(0);
53518 // We must store zeros to the unused bits.
53519 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53520 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53521 St->getPointerInfo(), St->getBaseAlign(),
53522 St->getMemOperand()->getFlags());
53523 }
53524
53525 // Widen v2i1/v4i1 stores to v8i1.
53526 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53527 Subtarget.hasAVX512()) {
53528 unsigned NumConcats = 8 / VT.getVectorNumElements();
53529 // We must store zeros to the unused bits.
53530 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53531 Ops[0] = StoredVal;
53532 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53533 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53534 St->getPointerInfo(), St->getBaseAlign(),
53535 St->getMemOperand()->getFlags());
53536 }
53537
53538 // Turn vXi1 stores of constants into a scalar store.
53539 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53540 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53541 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53542 // If its a v64i1 store without 64-bit support, we need two stores.
53543 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53544 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53545 StoredVal->ops().slice(0, 32));
53546 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53547 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53548 StoredVal->ops().slice(32, 32));
53549 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53550
53551 SDValue Ptr0 = St->getBasePtr();
53552 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53553
53554 SDValue Ch0 =
53555 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53556 St->getBaseAlign(), St->getMemOperand()->getFlags());
53557 SDValue Ch1 = DAG.getStore(
53558 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53559 St->getBaseAlign(), St->getMemOperand()->getFlags());
53560 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53561 }
53562
53563 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53564 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53565 St->getPointerInfo(), St->getBaseAlign(),
53566 St->getMemOperand()->getFlags());
53567 }
53568
53569 // Convert scalar fabs/fneg load-store to integer equivalents.
53570 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53571 (StoredVal.getOpcode() == ISD::FABS ||
53572 StoredVal.getOpcode() == ISD::FNEG) &&
53573 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53574 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53575 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53576 if (TLI.isTypeLegal(IntVT)) {
53577 APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53578 unsigned SignOp = ISD::XOR;
53579 if (StoredVal.getOpcode() == ISD::FABS) {
53580 SignMask = ~SignMask;
53581 SignOp = ISD::AND;
53582 }
53583 SDValue LogicOp = DAG.getNode(
53584 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53585 DAG.getConstant(SignMask, dl, IntVT));
53586 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53587 St->getPointerInfo(), St->getBaseAlign(),
53588 St->getMemOperand()->getFlags());
53589 }
53590 }
53591
53592 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53593 // Sandy Bridge, perform two 16-byte stores.
53594 unsigned Fast;
53595 if (VT.is256BitVector() && StVT == VT &&
53596 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53597 *St->getMemOperand(), &Fast) &&
53598 !Fast) {
53599 unsigned NumElems = VT.getVectorNumElements();
53600 if (NumElems < 2)
53601 return SDValue();
53602
53603 return splitVectorStore(St, DAG);
53604 }
53605
53606 // Split under-aligned vector non-temporal stores.
53607 if (St->isNonTemporal() && StVT == VT &&
53608 St->getAlign().value() < VT.getStoreSize()) {
53609 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53610 // vectors or the legalizer can scalarize it to use MOVNTI.
53611 if (VT.is256BitVector() || VT.is512BitVector()) {
53612 unsigned NumElems = VT.getVectorNumElements();
53613 if (NumElems < 2)
53614 return SDValue();
53615 return splitVectorStore(St, DAG);
53616 }
53617
53618 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53619 // to use MOVNTI.
53620 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53621 MVT NTVT = Subtarget.hasSSE4A()
53622 ? MVT::v2f64
53623 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53624 return scalarizeVectorStore(St, NTVT, DAG);
53625 }
53626 }
53627
53628 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53629 // supported, but AVX512F is, by extending to v16i32 and truncating.
53630 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53631 St->getValue().getOpcode() == ISD::TRUNCATE &&
53632 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53633 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53634 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53635 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53636 St->getValue().getOperand(0));
53637 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53638 MVT::v16i8, St->getMemOperand());
53639 }
53640
53641 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53642 if (!St->isTruncatingStore() &&
53643 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53644 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53645 StoredVal.hasOneUse() &&
53646 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53647 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53648 return EmitTruncSStore(IsSigned, St->getChain(),
53649 dl, StoredVal.getOperand(0), St->getBasePtr(),
53650 VT, St->getMemOperand(), DAG);
53651 }
53652
53653 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53654 if (!St->isTruncatingStore()) {
53655 auto IsExtractedElement = [](SDValue V) {
53656 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53657 V = V.getOperand(0);
53658 unsigned Opc = V.getOpcode();
53659 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53660 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53661 V.getOperand(0).hasOneUse())
53662 return V.getOperand(0);
53663 return SDValue();
53664 };
53665 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53666 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53667 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53668 SDValue Src = Trunc.getOperand(0);
53669 MVT DstVT = Trunc.getSimpleValueType();
53670 MVT SrcVT = Src.getSimpleValueType();
53671 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53672 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53673 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53674 if (NumTruncBits == VT.getSizeInBits() &&
53675 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53676 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53677 TruncVT, St->getMemOperand());
53678 }
53679 }
53680 }
53681 }
53682
53683 // Optimize trunc store (of multiple scalars) to shuffle and store.
53684 // First, pack all of the elements in one place. Next, store to memory
53685 // in fewer chunks.
53686 if (St->isTruncatingStore() && VT.isVector()) {
53687 if (TLI.isTruncStoreLegal(VT, StVT)) {
53688 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53689 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53690 dl, Val, St->getBasePtr(),
53691 St->getMemoryVT(), St->getMemOperand(), DAG);
53692 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53693 DAG, dl))
53694 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53695 dl, Val, St->getBasePtr(),
53696 St->getMemoryVT(), St->getMemOperand(), DAG);
53697 }
53698
53699 return SDValue();
53700 }
53701
53702 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53703 unsigned AddrSpace = St->getAddressSpace();
53704 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53705 AddrSpace == X86AS::PTR32_UPTR) {
53706 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53707 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53708 SDValue Cast =
53709 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53710 return DAG.getTruncStore(
53711 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53712 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53713 }
53714 }
53715
53716 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53717 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53718 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53719 Subtarget.hasCF() && St->isSimple()) {
53720 SDValue Cmov;
53721 if (StoredVal.getOpcode() == X86ISD::CMOV)
53722 Cmov = StoredVal;
53723 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53724 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53725 Cmov = StoredVal.getOperand(0);
53726 else
53727 return SDValue();
53728
53729 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53730 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53731 return SDValue();
53732
53733 bool InvertCC = false;
53734 SDValue V = SDValue(Ld, 0);
53735 if (V == Cmov.getOperand(1))
53736 InvertCC = true;
53737 else if (V != Cmov.getOperand(0))
53738 return SDValue();
53739
53740 SDVTList Tys = DAG.getVTList(MVT::Other);
53741 SDValue CC = Cmov.getOperand(2);
53742 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53743 if (InvertCC)
53744 CC = DAG.getTargetConstant(
53745 GetOppositeBranchCondition(
53746 (X86::CondCode)CC->getAsZExtVal()),
53747 dl, MVT::i8);
53748 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53749 Cmov.getOperand(3)};
53750 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53751 St->getMemOperand());
53752 }
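// Editorial illustration (assumes an APX target with CF/CFCMOV): the classic
// "conditionally overwrite" idiom
//   void set_if(int *P, int X, bool C) { *P = C ? X : *P; }
// matches store(cmov(load(p), x, CC), p) above and collapses into a single
// conditional (faulting) store of X instead of a load, a CMOV and an
// unconditional store.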
53753
53754 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53755 // the FP state in cases where an emms may be missing.
53756 // A preferable solution to the general problem is to figure out the right
53757 // places to insert EMMS. This qualifies as a quick hack.
53758
53759 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53760 if (VT.getSizeInBits() != 64)
53761 return SDValue();
53762
53763 const Function &F = DAG.getMachineFunction().getFunction();
53764 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53765 bool F64IsLegal =
53766 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53767
53768 if (!F64IsLegal || Subtarget.is64Bit())
53769 return SDValue();
53770
53771 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53772 cast<LoadSDNode>(St->getValue())->isSimple() &&
53773 St->getChain().hasOneUse() && St->isSimple()) {
53774 auto *Ld = cast<LoadSDNode>(St->getValue());
53775
53776 if (!ISD::isNormalLoad(Ld))
53777 return SDValue();
53778
53779 // Avoid the transformation if there are multiple uses of the loaded value.
53780 if (!Ld->hasNUsesOfValue(1, 0))
53781 return SDValue();
53782
53783 SDLoc LdDL(Ld);
53784 SDLoc StDL(N);
53785
53786 // Remove any range metadata as we're converting to f64 load/store.
53787 Ld->getMemOperand()->clearRanges();
53788
53789 // Lower to a single movq load/store pair.
53790 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53791 Ld->getBasePtr(), Ld->getMemOperand());
53792
53793 // Make sure new load is placed in same chain order.
53794 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53795 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53796 St->getMemOperand());
53797 }
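// Editorial illustration (not part of the lowering code): on a 32-bit target
// with SSE2, a plain 64-bit integer copy through memory such as
//   void copy64(long long *Dst, const long long *Src) { *Dst = *Src; }
// is handled here as a single f64-typed load/store pair (MOVQ/MOVSD) rather
// than two 32-bit GPR loads and stores.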
53798
53799 // This is similar to the above case, but here we handle a scalar 64-bit
53800 // integer store that is extracted from a vector on a 32-bit target.
53801 // If we have SSE2, then we can treat it like a floating-point double
53802 // to get past legalization. The execution dependencies fixup pass will
53803 // choose the optimal machine instruction for the store if this really is
53804 // an integer or v2f32 rather than an f64.
53805 if (VT == MVT::i64 &&
53806 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53807 SDValue OldExtract = St->getOperand(1);
53808 SDValue ExtOp0 = OldExtract.getOperand(0);
53809 unsigned VecSize = ExtOp0.getValueSizeInBits();
53810 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53811 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53812 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53813 BitCast, OldExtract.getOperand(1));
53814 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53815 St->getPointerInfo(), St->getBaseAlign(),
53816 St->getMemOperand()->getFlags());
53817 }
53818
53819 return SDValue();
53820}
53821
53822 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53823 TargetLowering::DAGCombinerInfo &DCI,
53824 const X86Subtarget &Subtarget) {
53825 auto *St = cast<MemIntrinsicSDNode>(N);
53826
53827 SDValue StoredVal = N->getOperand(1);
53828 MVT VT = StoredVal.getSimpleValueType();
53829 EVT MemVT = St->getMemoryVT();
53830
53831 // Figure out which elements we demand.
53832 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53833 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53834
53835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53836 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53837 if (N->getOpcode() != ISD::DELETED_NODE)
53838 DCI.AddToWorklist(N);
53839 return SDValue(N, 0);
53840 }
53841
53842 return SDValue();
53843}
53844
53845/// Return 'true' if this vector operation is "horizontal"
53846/// and return the operands for the horizontal operation in LHS and RHS. A
53847/// horizontal operation performs the binary operation on successive elements
53848/// of its first operand, then on successive elements of its second operand,
53849/// returning the resulting values in a vector. For example, if
53850/// A = < float a0, float a1, float a2, float a3 >
53851/// and
53852/// B = < float b0, float b1, float b2, float b3 >
53853/// then the result of doing a horizontal operation on A and B is
53854/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53855/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53856/// A horizontal-op B, for some already available A and B, and if so then LHS is
53857/// set to A, RHS to B, and the routine returns 'true'.
53858static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53859 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53860 bool IsCommutative,
53861 SmallVectorImpl<int> &PostShuffleMask,
53862 bool ForceHorizOp) {
53863 // If either operand is undef, bail out. The binop should be simplified.
53864 if (LHS.isUndef() || RHS.isUndef())
53865 return false;
53866
53867 // Look for the following pattern:
53868 // A = < float a0, float a1, float a2, float a3 >
53869 // B = < float b0, float b1, float b2, float b3 >
53870 // and
53871 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53872 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53873 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53874 // which is A horizontal-op B.
53875
53876 MVT VT = LHS.getSimpleValueType();
53877 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53878 "Unsupported vector type for horizontal add/sub");
53879 unsigned NumElts = VT.getVectorNumElements();
53880
53881 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53882 SmallVectorImpl<int> &ShuffleMask) {
53883 bool UseSubVector = false;
53884 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53885 Op.getOperand(0).getValueType().is256BitVector() &&
53886 llvm::isNullConstant(Op.getOperand(1))) {
53887 Op = Op.getOperand(0);
53888 UseSubVector = true;
53889 }
53890 SmallVector<SDValue, 2> SrcOps;
53891 SmallVector<int, 16> SrcMask, ScaledMask;
53892 SDValue BC = peekThroughBitcasts(Op);
53893 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53894 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53895 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53896 })) {
53897 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53898 if (!UseSubVector && SrcOps.size() <= 2 &&
53899 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53900 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53901 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53902 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53903 }
53904 if (UseSubVector && SrcOps.size() == 1 &&
53905 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53906 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53907 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53908 ShuffleMask.assign(Mask.begin(), Mask.end());
53909 }
53910 }
53911 };
53912
53913 // View LHS in the form
53914 // LHS = VECTOR_SHUFFLE A, B, LMask
53915 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53916 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53917 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53918 SDValue A, B;
53919 SmallVector<int, 16> LMask;
53920 GetShuffle(LHS, A, B, LMask);
53921
53922 // Likewise, view RHS in the form
53923 // RHS = VECTOR_SHUFFLE C, D, RMask
53924 SDValue C, D;
53925 SmallVector<int, 16> RMask;
53926 GetShuffle(RHS, C, D, RMask);
53927
53928 // At least one of the operands should be a vector shuffle.
53929 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53930 if (NumShuffles == 0)
53931 return false;
53932
53933 if (LMask.empty()) {
53934 A = LHS;
53935 for (unsigned i = 0; i != NumElts; ++i)
53936 LMask.push_back(i);
53937 }
53938
53939 if (RMask.empty()) {
53940 C = RHS;
53941 for (unsigned i = 0; i != NumElts; ++i)
53942 RMask.push_back(i);
53943 }
53944
53945 // If we have a unary mask, ensure the other op is set to null.
53946 if (isUndefOrInRange(LMask, 0, NumElts))
53947 B = SDValue();
53948 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53949 A = SDValue();
53950
53951 if (isUndefOrInRange(RMask, 0, NumElts))
53952 D = SDValue();
53953 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53954 C = SDValue();
53955
53956 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53957 // RHS operands and shuffle mask.
53958 if (A != C) {
53959 std::swap(C, D);
53960 ShuffleVectorSDNode::commuteMask(RMask);
53961 }
53962 // Check that the shuffles are both shuffling the same vectors.
53963 if (!(A == C && B == D))
53964 return false;
53965
53966 PostShuffleMask.clear();
53967 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53968
53969 // LHS and RHS are now:
53970 // LHS = shuffle A, B, LMask
53971 // RHS = shuffle A, B, RMask
53972 // Check that the masks correspond to performing a horizontal operation.
53973 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53974 // so we just repeat the inner loop if this is a 256-bit op.
53975 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53976 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53977 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53978 assert((NumEltsPer128BitChunk % 2 == 0) &&
53979 "Vector type should have an even number of elements in each lane");
53980 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53981 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53982 // Ignore undefined components.
53983 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53984 if (LIdx < 0 || RIdx < 0 ||
53985 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53986 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53987 continue;
53988
53989 // Check that successive odd/even elements are being operated on. If not,
53990 // this is not a horizontal operation.
53991 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53992 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53993 return false;
53994
53995 // Compute the post-shuffle mask index based on where the element
53996 // is stored in the HOP result, and where it needs to be moved to.
53997 int Base = LIdx & ~1u;
53998 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53999 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
54000
54001 // The low half of the 128-bit result must choose from A.
54002 // The high half of the 128-bit result must choose from B,
54003 // unless B is undef. In that case, we are always choosing from A.
54004 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
54005 Index += NumEltsPer64BitChunk;
54006 PostShuffleMask[i + j] = Index;
54007 }
54008 }
54009
54010 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
54011 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
54012
54013 bool IsIdentityPostShuffle =
54014 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
54015 if (IsIdentityPostShuffle)
54016 PostShuffleMask.clear();
54017
54018 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
54019 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
54020 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
54021 return false;
54022
54023 // If the source nodes are already used in HorizOps then always accept this.
54024 // Shuffle folding should merge these back together.
54025 auto FoundHorizUser = [&](SDNode *User) {
54026 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
54027 };
54028 ForceHorizOp =
54029 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
54030 llvm::any_of(NewRHS->users(), FoundHorizUser));
54031
54032 // Assume a SingleSource HOP if we only shuffle one input and don't need to
54033 // shuffle the result.
54034 if (!ForceHorizOp &&
54035 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54036 (NumShuffles < 2 || !IsIdentityPostShuffle),
54037 DAG, Subtarget))
54038 return false;
54039
54040 LHS = DAG.getBitcast(VT, NewLHS);
54041 RHS = DAG.getBitcast(VT, NewRHS);
54042 return true;
54043}
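// Editorial illustration (not part of X86ISelLowering.cpp; the helper name is
// made up and would need <xmmintrin.h>): hand-written even/odd shuffles feeding
// an FADD are exactly the pattern isHorizontalBinOp recognises, so with SSE3
// the whole sequence can become a single HADDPS.
//
//   static __m128 hadd_by_hand(__m128 A, __m128 B) {
//     __m128 LHS = _mm_shuffle_ps(A, B, _MM_SHUFFLE(2, 0, 2, 0)); // <a0,a2,b0,b2>
//     __m128 RHS = _mm_shuffle_ps(A, B, _MM_SHUFFLE(3, 1, 3, 1)); // <a1,a3,b1,b3>
//     return _mm_add_ps(LHS, RHS); // <a0+a1,a2+a3,b0+b1,b2+b3> == HADDPS A, B
//   }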
54044
54045// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
54046 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54047 const X86Subtarget &Subtarget) {
54048 EVT VT = N->getValueType(0);
54049 unsigned Opcode = N->getOpcode();
54050 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54051 SmallVector<int, 8> PostShuffleMask;
54052
54053 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54054 return N->hasOneUse() &&
54055 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54056 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54057 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54058 };
54059
54060 switch (Opcode) {
54061 case ISD::FADD:
54062 case ISD::FSUB:
54063 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54064 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54065 SDValue LHS = N->getOperand(0);
54066 SDValue RHS = N->getOperand(1);
54067 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54068 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54069 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54070 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54071 if (!PostShuffleMask.empty())
54072 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54073 DAG.getUNDEF(VT), PostShuffleMask);
54074 return HorizBinOp;
54075 }
54076 }
54077 break;
54078 case ISD::ADD:
54079 case ISD::SUB:
54080 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54081 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54082 SDValue LHS = N->getOperand(0);
54083 SDValue RHS = N->getOperand(1);
54084 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54085 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54086 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54087 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54088 ArrayRef<SDValue> Ops) {
54089 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54090 };
54091 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54092 {LHS, RHS}, HOpBuilder);
54093 if (!PostShuffleMask.empty())
54094 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54095 DAG.getUNDEF(VT), PostShuffleMask);
54096 return HorizBinOp;
54097 }
54098 }
54099 break;
54100 }
54101
54102 return SDValue();
54103}
54104
54105// Try to combine the following nodes
54106// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54107// <i32 -2147483648[float -0.000000e+00]> 0
54108// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54109// <(load 4 from constant-pool)> t0, t29
54110// [t30: v16i32 = bitcast t27]
54111// t6: v16i32 = xor t7, t27[t30]
54112// t11: v16f32 = bitcast t6
54113// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54114// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54115// t22: v16f32 = bitcast t7
54116// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54117// t24: v32f16 = bitcast t23
54118 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54119 const X86Subtarget &Subtarget) {
54120 EVT VT = N->getValueType(0);
54121 SDValue LHS = N->getOperand(0);
54122 SDValue RHS = N->getOperand(1);
54123 int CombineOpcode =
54124 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54125 auto combineConjugation = [&](SDValue &r) {
54126 if (LHS->getOpcode() == ISD::BITCAST) {
54127 SDValue XOR = LHS.getOperand(0);
54128 if (XOR->getOpcode() == ISD::XOR) {
54129 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54130 if (XORRHS.isConstant()) {
54131 APInt ConjugationInt32 = APInt(32, 0x80000000);
54132 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54133 if ((XORRHS.getBitWidth() == 32 &&
54134 XORRHS.getConstant() == ConjugationInt32) ||
54135 (XORRHS.getBitWidth() == 64 &&
54136 XORRHS.getConstant() == ConjugationInt64)) {
54137 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54138 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54139 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54140 r = DAG.getBitcast(VT, FCMulC);
54141 return true;
54142 }
54143 }
54144 }
54145 }
54146 return false;
54147 };
54148 SDValue Res;
54149 if (combineConjugation(Res))
54150 return Res;
54151 std::swap(LHS, RHS);
54152 if (combineConjugation(Res))
54153 return Res;
54154 return Res;
54155}
54156
54157// Try to combine the following nodes:
54158// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54159 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54160 const X86Subtarget &Subtarget) {
54161 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54162 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54163 Flags.hasAllowContract();
54164 };
54165
54166 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54167 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54168 Flags.hasNoSignedZeros();
54169 };
54170 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54171 APInt AI = APInt(32, 0x80008000);
54172 KnownBits Bits = DAG.computeKnownBits(Op);
54173 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54174 Bits.getConstant() == AI;
54175 };
54176
54177 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54178 !AllowContract(N->getFlags()))
54179 return SDValue();
54180
54181 EVT VT = N->getValueType(0);
54182 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54183 return SDValue();
54184
54185 SDValue LHS = N->getOperand(0);
54186 SDValue RHS = N->getOperand(1);
54187 bool IsConj;
54188 SDValue FAddOp1, MulOp0, MulOp1;
54189 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54190 &IsVectorAllNegativeZero,
54191 &HasNoSignedZero](SDValue N) -> bool {
54192 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54193 return false;
54194 SDValue Op0 = N.getOperand(0);
54195 unsigned Opcode = Op0.getOpcode();
54196 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54197 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54198 MulOp0 = Op0.getOperand(0);
54199 MulOp1 = Op0.getOperand(1);
54200 IsConj = Opcode == X86ISD::VFCMULC;
54201 return true;
54202 }
54203 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54204 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54205 HasNoSignedZero(Op0->getFlags())) ||
54206 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54207 MulOp0 = Op0.getOperand(0);
54208 MulOp1 = Op0.getOperand(1);
54209 IsConj = Opcode == X86ISD::VFCMADDC;
54210 return true;
54211 }
54212 }
54213 return false;
54214 };
54215
54216 if (GetCFmulFrom(LHS))
54217 FAddOp1 = RHS;
54218 else if (GetCFmulFrom(RHS))
54219 FAddOp1 = LHS;
54220 else
54221 return SDValue();
54222
54223 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54224 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54225 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54226 // FIXME: How do we handle when fast math flags of FADD are different from
54227 // CFMUL's?
54228 SDValue CFmul =
54229 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54230 return DAG.getBitcast(VT, CFmul);
54231}
54232
54233/// Do target-specific dag combines on floating-point adds/subs.
54234 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54235 const X86Subtarget &Subtarget) {
54236 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54237 return HOp;
54238
54239 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54240 return COp;
54241
54242 return SDValue();
54243}
54244
54245 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54246 const X86Subtarget &Subtarget) {
54247 EVT VT = N->getValueType(0);
54248 SDValue Src = N->getOperand(0);
54249 EVT SrcVT = Src.getValueType();
54250 SDLoc DL(N);
54251
54252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54253
54254 // Let legalize expand this if it isn't a legal type yet.
54255 if (!TLI.isTypeLegal(VT))
54256 return SDValue();
54257
54258 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54259 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54260 return SDValue();
54261
54262 if (SrcVT == MVT::v2f16) {
54263 SrcVT = MVT::v4f16;
54264 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54265 DAG.getUNDEF(MVT::v2f16));
54266 }
54267
54268 if (SrcVT == MVT::v4f16) {
54269 SrcVT = MVT::v8f16;
54270 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54271 DAG.getUNDEF(MVT::v4f16));
54272 } else if (SrcVT == MVT::v2f32) {
54273 SrcVT = MVT::v4f32;
54274 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54275 DAG.getUNDEF(MVT::v2f32));
54276 } else {
54277 return SDValue();
54278 }
54279
54280 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54281}
54282
54283// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54284// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54285// are able to avoid generating code with MOVABS and large constants in certain
54286// cases.
54287 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54288 const SDLoc &DL) {
54289 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54290 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54291 if (!ValidSrlConst)
54292 return SDValue();
54293 unsigned SrlConstVal = *ValidSrlConst;
54294
54295 SDValue Op = N.getOperand(0);
54296 unsigned Opcode = Op.getOpcode();
54297 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54298 "Illegal truncation types");
54299
54300 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54301 !isa<ConstantSDNode>(Op.getOperand(1)))
54302 return SDValue();
54303 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54304
54305 if (SrlConstVal <= 32 ||
54306 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54307 return SDValue();
54308
54309 SDValue OpLhsSrl =
54310 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54311 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54312
54313 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54314 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54315 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54316
54317 if (Opcode == ISD::ADD) {
54318 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54319 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54320 }
54321 return NewOpNode;
54322}
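// Editorial sketch of the kind of source this helps (function name and
// constant below are made up):
//
//   unsigned add_then_shift(unsigned long long X) {
//     return (unsigned)((X + 0x1234000000000000ULL) >> 48);
//   }
//
// The 64-bit immediate would normally require a MOVABS; because it has no bits
// below the shift amount, the combine rewrites the expression as roughly
// (((unsigned)(X >> 48)) + 0x1234) & 0xFFFF, keeping the constant small.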
54323
54324/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54325/// the codegen.
54326/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54327/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54328/// anything that is guaranteed to be transformed by DAGCombiner.
54329 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54330 const X86Subtarget &Subtarget,
54331 const SDLoc &DL) {
54332 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54333 SDValue Src = N->getOperand(0);
54334 unsigned SrcOpcode = Src.getOpcode();
54335 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54336
54337 EVT VT = N->getValueType(0);
54338 EVT SrcVT = Src.getValueType();
54339
54340 auto IsFreeTruncation = [VT](SDValue Op) {
54341 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54342
54343 // See if this has been extended from a smaller/equal size to
54344 // the truncation size, allowing a truncation to combine with the extend.
54345 unsigned Opcode = Op.getOpcode();
54346 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54347 Opcode == ISD::ZERO_EXTEND) &&
54348 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54349 return true;
54350
54351 // See if this is a single use constant which can be constant folded.
54352 // NOTE: We don't peek through bitcasts here because there is currently
54353 // no support for constant folding truncate+bitcast+vector_of_constants. So
54354 // we'll just end up with a truncate on both operands which will
54355 // get turned back into (truncate (binop)) causing an infinite loop.
54356 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54357 };
54358
54359 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54360 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54361 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54362 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54363 };
54364
54365 // Don't combine if the operation has other uses.
54366 if (!Src.hasOneUse())
54367 return SDValue();
54368
54369 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54370 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54371
54372 if (!VT.isVector())
54373 return SDValue();
54374
54375 // In most cases its only worth pre-truncating if we're only facing the cost
54376 // of one truncation.
54377 // i.e. if one of the inputs will constant fold or the input is repeated.
54378 switch (SrcOpcode) {
54379 case ISD::MUL:
54380 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54381 // better to truncate if we have the chance.
54382 if (SrcVT.getScalarType() == MVT::i64 &&
54383 TLI.isOperationLegal(SrcOpcode, VT) &&
54384 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54385 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54386 [[fallthrough]];
54387 case ISD::AND:
54388 case ISD::XOR:
54389 case ISD::OR:
54390 case ISD::ADD:
54391 case ISD::SUB: {
54392 SDValue Op0 = Src.getOperand(0);
54393 SDValue Op1 = Src.getOperand(1);
54394 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54395 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54396 return TruncateArithmetic(Op0, Op1);
54397 break;
54398 }
54399 }
54400
54401 return SDValue();
54402}
54403
54404// Try to form a MULHU or MULHS node by looking for
54405// (trunc (srl (mul ext, ext), >= 16))
54406// TODO: This is X86 specific because we want to be able to handle wide types
54407// before type legalization. But we can only do it if the vector will be
54408// legalized via widening/splitting. Type legalization can't handle promotion
54409// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54410// combiner.
54411static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54412 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54413 using namespace llvm::SDPatternMatch;
54414
54415 if (!Subtarget.hasSSE2())
54416 return SDValue();
54417
54418 // Only handle vXi16 types that are at least 128-bits unless they will be
54419 // widened.
54420 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54421 return SDValue();
54422
54423 // Input type should be at least vXi32.
54424 EVT InVT = Src.getValueType();
54425 if (InVT.getVectorElementType().getSizeInBits() < 32)
54426 return SDValue();
54427
54428 // First instruction should be a right shift by 16 of a multiply.
54429 SDValue LHS, RHS;
54430 APInt ShiftAmt;
54431 if (!sd_match(Src,
54432 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54433 return SDValue();
54434
54435 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54436 return SDValue();
54437
54438 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54439
54440 // Count leading sign/zero bits on both inputs - if there are enough then
54441 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54442 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54443 // truncations may actually be free by peeking through to the ext source.
54444 auto IsSext = [&DAG](SDValue V) {
54445 return DAG.ComputeMaxSignificantBits(V) <= 16;
54446 };
54447 auto IsZext = [&DAG](SDValue V) {
54448 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54449 };
54450
54451 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54452 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54453 if (!IsSigned && !IsUnsigned)
54454 return SDValue();
54455
54456 // Check if both inputs are extensions, which will be removed by truncation.
54457 auto isOpTruncateFree = [](SDValue Op) {
54458 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54459 Op.getOpcode() == ISD::ZERO_EXTEND)
54460 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54461 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54462 };
54463 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54464
54465 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54466 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54467 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54468 // will have to split anyway.
54469 unsigned InSizeInBits = InVT.getSizeInBits();
54470 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54471 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54472 (InSizeInBits % 16) == 0) {
54473 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54474 InVT.getSizeInBits() / 16);
54475 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54476 DAG.getBitcast(BCVT, RHS));
54477 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54478 return DAG.getNode(ISD::SRL, DL, VT, Res,
54479 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54480 }
54481
54482 // Truncate back to source type.
54483 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54484 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54485
54486 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54487 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54488 return DAG.getNode(ISD::SRL, DL, VT, Res,
54489 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54490}
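// Editorial sketch (not part of X86ISelLowering.cpp) of a source-level loop
// that, once vectorized, produces the trunc(srl(mul(ext,ext),16)) pattern
// matched above and maps onto PMULHUW:
//
//   void mulhi_u16(unsigned short *R, const unsigned short *A,
//                  const unsigned short *B) {
//     for (int I = 0; I != 8; ++I)
//       R[I] = (unsigned short)(((unsigned)A[I] * B[I]) >> 16);
//   }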
54491
54492// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54493// from one vector with signed bytes from another vector, adds together
54494// adjacent pairs of 16-bit products, and saturates the result before
54495// truncating to 16-bits.
54496//
54497// Which looks something like this:
54498// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54499// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54500 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54501 const X86Subtarget &Subtarget,
54502 const SDLoc &DL) {
54503 if (!VT.isVector() || !Subtarget.hasSSSE3())
54504 return SDValue();
54505
54506 unsigned NumElems = VT.getVectorNumElements();
54507 EVT ScalarVT = VT.getVectorElementType();
54508 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54509 return SDValue();
54510
54511 SDValue SSatVal = detectSSatPattern(In, VT);
54512 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54513 return SDValue();
54514
54515 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54516 // of multiplies from even/odd elements.
54517 SDValue N0 = SSatVal.getOperand(0);
54518 SDValue N1 = SSatVal.getOperand(1);
54519
54520 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54521 return SDValue();
54522
54523 SDValue N00 = N0.getOperand(0);
54524 SDValue N01 = N0.getOperand(1);
54525 SDValue N10 = N1.getOperand(0);
54526 SDValue N11 = N1.getOperand(1);
54527
54528 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54529 // Canonicalize zero_extend to LHS.
54530 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54531 std::swap(N00, N01);
54532 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54533 std::swap(N10, N11);
54534
54535 // Ensure we have a zero_extend and a sign_extend.
54536 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54537 N01.getOpcode() != ISD::SIGN_EXTEND ||
54538 N10.getOpcode() != ISD::ZERO_EXTEND ||
54539 N11.getOpcode() != ISD::SIGN_EXTEND)
54540 return SDValue();
54541
54542 // Peek through the extends.
54543 N00 = N00.getOperand(0);
54544 N01 = N01.getOperand(0);
54545 N10 = N10.getOperand(0);
54546 N11 = N11.getOperand(0);
54547
54548 // Ensure the extend is from vXi8.
54549 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54550 N01.getValueType().getVectorElementType() != MVT::i8 ||
54551 N10.getValueType().getVectorElementType() != MVT::i8 ||
54552 N11.getValueType().getVectorElementType() != MVT::i8)
54553 return SDValue();
54554
54555 // All inputs should be build_vectors.
54556 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54557 N01.getOpcode() != ISD::BUILD_VECTOR ||
54558 N10.getOpcode() != ISD::BUILD_VECTOR ||
54559 N11.getOpcode() != ISD::BUILD_VECTOR)
54560 return SDValue();
54561
54562 // N00/N10 are zero extended. N01/N11 are sign extended.
54563
54564 // For each element, we need to ensure we have an odd element from one vector
54565 // multiplied by the odd element of another vector and the even element from
54566 // one of the same vectors being multiplied by the even element from the
54567 // other vector. So we need to make sure for each element i, this operator
54568 // is being performed:
54569 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54570 SDValue ZExtIn, SExtIn;
54571 for (unsigned i = 0; i != NumElems; ++i) {
54572 SDValue N00Elt = N00.getOperand(i);
54573 SDValue N01Elt = N01.getOperand(i);
54574 SDValue N10Elt = N10.getOperand(i);
54575 SDValue N11Elt = N11.getOperand(i);
54576 // TODO: Be more tolerant to undefs.
54577 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54578 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54579 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54580 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54581 return SDValue();
54582 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54583 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54584 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54585 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54586 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54587 return SDValue();
54588 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54589 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54590 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54591 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54592 // Add is commutative so indices can be reordered.
54593 if (IdxN00 > IdxN10) {
54594 std::swap(IdxN00, IdxN10);
54595 std::swap(IdxN01, IdxN11);
54596 }
54597 // N0 indices must be the even element. N1 indices must be the next odd element.
54598 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54599 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54600 return SDValue();
54601 SDValue N00In = N00Elt.getOperand(0);
54602 SDValue N01In = N01Elt.getOperand(0);
54603 SDValue N10In = N10Elt.getOperand(0);
54604 SDValue N11In = N11Elt.getOperand(0);
54605 // First time we find an input capture it.
54606 if (!ZExtIn) {
54607 ZExtIn = N00In;
54608 SExtIn = N01In;
54609 }
54610 if (ZExtIn != N00In || SExtIn != N01In ||
54611 ZExtIn != N10In || SExtIn != N11In)
54612 return SDValue();
54613 }
54614
54615 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54616 EVT ExtVT = Ext.getValueType();
54617 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54618 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54619 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54620 DAG.getVectorIdxConstant(0, DL));
54621 }
54622 };
54623 ExtractVec(ZExtIn);
54624 ExtractVec(SExtIn);
54625
54626 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54627 ArrayRef<SDValue> Ops) {
54628 // Shrink by adding truncate nodes and let DAGCombine fold with the
54629 // sources.
54630 EVT InVT = Ops[0].getValueType();
54631 assert(InVT.getScalarType() == MVT::i8 &&
54632 "Unexpected scalar element type");
54633 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54634 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54635 InVT.getVectorNumElements() / 2);
54636 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54637 };
54638 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54639 PMADDBuilder);
54640}
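// Editorial sketch (names made up): a saturating two-element dot product of
// unsigned-by-signed bytes is the kind of source that can vectorize into the
// pattern detected above and select to PMADDUBSW:
//
//   void dot2_sat(short *R, const unsigned char *A, const signed char *B) {
//     for (int I = 0; I != 8; ++I) {
//       int T = A[2 * I] * B[2 * I] + A[2 * I + 1] * B[2 * I + 1];
//       R[I] = T > 32767 ? 32767 : T < -32768 ? -32768 : (short)T;
//     }
//   }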
54641
54642 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54643 const X86Subtarget &Subtarget) {
54644 EVT VT = N->getValueType(0);
54645 SDValue Src = N->getOperand(0);
54646 SDLoc DL(N);
54647
54648 // Attempt to pre-truncate inputs to arithmetic ops instead.
54649 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54650 return V;
54651
54652 // Try to detect PMADD
54653 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54654 return PMAdd;
54655
54656 // Try to combine truncation with signed/unsigned saturation.
54657 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54658 return Val;
54659
54660 // Try to combine PMULHUW/PMULHW for vXi16.
54661 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54662 return V;
54663
54664 // The bitcast source is a direct mmx result.
54665 // Detect bitcasts between i32 and x86mmx.
54666 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54667 SDValue BCSrc = Src.getOperand(0);
54668 if (BCSrc.getValueType() == MVT::x86mmx)
54669 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54670 }
54671
54672 return SDValue();
54673}
54674
54675 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54676 TargetLowering::DAGCombinerInfo &DCI) {
54677 EVT VT = N->getValueType(0);
54678 SDValue In = N->getOperand(0);
54679 SDLoc DL(N);
54680
54681 if (SDValue SSatVal = detectSSatPattern(In, VT))
54682 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54683 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54684 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54685
54686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54687 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54688 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54689 return SDValue(N, 0);
54690
54691 return SDValue();
54692}
54693
54694/// Returns the negated value if the node \p N flips sign of FP value.
54695///
54696/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54697/// or FSUB(0, x)
54698/// AVX512F does not have FXOR, so FNEG is lowered as
54699/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54700 /// In this case we go through all bitcasts.
54701/// This also recognizes splat of a negated value and returns the splat of that
54702/// value.
54703static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54704 if (N->getOpcode() == ISD::FNEG)
54705 return N->getOperand(0);
54706
54707 // Don't recurse exponentially.
54708 if (Depth > SelectionDAG::MaxRecursionDepth)
54709 return SDValue();
54710
54711 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54712
54714 EVT VT = Op->getValueType(0);
54715
54716 // Make sure the element size doesn't change.
54717 if (VT.getScalarSizeInBits() != ScalarSize)
54718 return SDValue();
54719
54720 unsigned Opc = Op.getOpcode();
54721 switch (Opc) {
54722 case ISD::VECTOR_SHUFFLE: {
54723 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54724 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54725 if (!Op.getOperand(1).isUndef())
54726 return SDValue();
54727 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54728 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54729 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54730 cast<ShuffleVectorSDNode>(Op)->getMask());
54731 break;
54732 }
54733 case ISD::INSERT_VECTOR_ELT: {
54734 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54735 // -V, INDEX).
54736 SDValue InsVector = Op.getOperand(0);
54737 SDValue InsVal = Op.getOperand(1);
54738 if (!InsVector.isUndef())
54739 return SDValue();
54740 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54741 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54742 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54743 NegInsVal, Op.getOperand(2));
54744 break;
54745 }
54746 case ISD::FSUB:
54747 case ISD::XOR:
54748 case X86ISD::FXOR: {
54749 SDValue Op1 = Op.getOperand(1);
54750 SDValue Op0 = Op.getOperand(0);
54751
54752 // For XOR and FXOR, we want to check if constant
54753 // bits of Op1 are sign bit masks. For FSUB, we
54754 // have to check if constant bits of Op0 are sign
54755 // bit masks and hence we swap the operands.
54756 if (Opc == ISD::FSUB)
54757 std::swap(Op0, Op1);
54758
54759 APInt UndefElts;
54760 SmallVector<APInt, 16> EltBits;
54761 // Extract constant bits and see if they are all
54762 // sign bit masks. Ignore the undef elements.
54763 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54764 /* AllowWholeUndefs */ true,
54765 /* AllowPartialUndefs */ false)) {
54766 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54767 if (!UndefElts[I] && !EltBits[I].isSignMask())
54768 return SDValue();
54769
54770 // Only allow bitcast from correctly-sized constant.
54771 Op0 = peekThroughBitcasts(Op0);
54772 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54773 return Op0;
54774 }
54775 break;
54776 } // case
54777 } // switch
54778
54779 return SDValue();
54780}
54781
54782static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54783 bool NegRes) {
54784 if (NegMul) {
54785 switch (Opcode) {
54786 // clang-format off
54787 default: llvm_unreachable("Unexpected opcode");
54788 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54789 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54790 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54791 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54792 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54793 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54794 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54795 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54796 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54797 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54798 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54799 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54800 // clang-format on
54801 }
54802 }
54803
54804 if (NegAcc) {
54805 switch (Opcode) {
54806 // clang-format off
54807 default: llvm_unreachable("Unexpected opcode");
54808 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54809 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54810 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54811 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54812 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54813 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54814 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54815 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54816 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54817 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54818 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54819 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54820 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54821 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54822 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54823 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54824 // clang-format on
54825 }
54826 }
54827
54828 if (NegRes) {
54829 switch (Opcode) {
54830 // For accuracy reasons, we never combine fneg and fma under strict FP.
54831 // clang-format off
54832 default: llvm_unreachable("Unexpected opcode");
54833 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54834 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54835 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54836 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54837 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54838 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54839 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54840 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54841 // clang-format on
54842 }
54843 }
54844
54845 return Opcode;
54846}
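// Usage sketch (editorial): negating only the accumulator of a plain FMA gives
// FMSUB, i.e. negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true,
// /*NegRes=*/false) == X86ISD::FMSUB, while negating the whole result,
// negateFMAOpcode(ISD::FMA, false, false, /*NegRes=*/true), yields
// X86ISD::FNMSUB.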
54847
54848/// Do target-specific dag combines on floating point negations.
54849 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54850 TargetLowering::DAGCombinerInfo &DCI,
54851 const X86Subtarget &Subtarget) {
54852 EVT OrigVT = N->getValueType(0);
54853 SDValue Arg = isFNEG(DAG, N);
54854 if (!Arg)
54855 return SDValue();
54856
54857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54858 EVT VT = Arg.getValueType();
54859 EVT SVT = VT.getScalarType();
54860 SDLoc DL(N);
54861
54862 // Let legalize expand this if it isn't a legal type yet.
54863 if (!TLI.isTypeLegal(VT))
54864 return SDValue();
54865
54866 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54867 // use of a constant by performing (-0 - A*B) instead.
54868 // FIXME: Check rounding control flags as well once it becomes available.
54869 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54870 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54871 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54872 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54873 Arg.getOperand(1), Zero);
54874 return DAG.getBitcast(OrigVT, NewNode);
54875 }
54876
54877 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54878 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54879 if (SDValue NegArg =
54880 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54881 return DAG.getBitcast(OrigVT, NegArg);
54882
54883 return SDValue();
54884}
54885
54886 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54887 bool LegalOperations,
54888 bool ForCodeSize,
54889 NegatibleCost &Cost,
54890 unsigned Depth) const {
54891 // fneg patterns are removable even if they have multiple uses.
54892 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54893 Cost = NegatibleCost::Cheaper;
54894 return DAG.getBitcast(Op.getValueType(), Arg);
54895 }
54896
54897 EVT VT = Op.getValueType();
54898 EVT SVT = VT.getScalarType();
54899 unsigned Opc = Op.getOpcode();
54900 SDNodeFlags Flags = Op.getNode()->getFlags();
54901 switch (Opc) {
54902 case ISD::FMA:
54903 case X86ISD::FMSUB:
54904 case X86ISD::FNMADD:
54905 case X86ISD::FNMSUB:
54906 case X86ISD::FMADD_RND:
54907 case X86ISD::FMSUB_RND:
54908 case X86ISD::FNMADD_RND:
54909 case X86ISD::FNMSUB_RND: {
54910 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54911 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54912 !isOperationLegal(ISD::FMA, VT))
54913 break;
54914
54915 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54916 // if it may have signed zeros.
54917 if (!Flags.hasNoSignedZeros())
54918 break;
54919
54920 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54921 // keep temporary nodes alive.
54922 std::list<HandleSDNode> Handles;
54923
54924 // This is always negatible for free but we might be able to remove some
54925 // extra operand negations as well.
54926 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54927 for (int i = 0; i != 3; ++i) {
54928 NewOps[i] = getCheaperNegatedExpression(
54929 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54930 if (!!NewOps[i])
54931 Handles.emplace_back(NewOps[i]);
54932 }
54933
54934 bool NegA = !!NewOps[0];
54935 bool NegB = !!NewOps[1];
54936 bool NegC = !!NewOps[2];
54937 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54938
54939 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54940 : NegatibleCost::Neutral;
54941
54942 // Fill in the non-negated ops with the original values.
54943 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54944 if (!NewOps[i])
54945 NewOps[i] = Op.getOperand(i);
54946 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54947 }
54948 case X86ISD::FRCP:
54949 if (SDValue NegOp0 =
54950 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54951 ForCodeSize, Cost, Depth + 1))
54952 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54953 break;
54954 }
54955
54956 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54957 ForCodeSize, Cost, Depth);
54958}
54959
54960 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54961 const X86Subtarget &Subtarget) {
54962 MVT VT = N->getSimpleValueType(0);
54963 // If we have integer vector types available, use the integer opcodes.
54964 if (!VT.isVector() || !Subtarget.hasSSE2())
54965 return SDValue();
54966
54967 SDLoc dl(N);
54968 MVT IntVT = VT.changeVectorElementTypeToInteger();
54969 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54970 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54971 unsigned IntOpcode;
54972 switch (N->getOpcode()) {
54973 // clang-format off
54974 default: llvm_unreachable("Unexpected FP logic op");
54975 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54976 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54977 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54978 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54979 // clang-format on
54980 }
54981 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54982 return DAG.getBitcast(VT, IntOp);
54983}
54984
54985/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54986 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54987 if (N->getOpcode() != ISD::XOR)
54988 return SDValue();
54989
54990 SDValue LHS = N->getOperand(0);
54991 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54992 return SDValue();
54993
54994 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54995 X86::CondCode(LHS->getConstantOperandVal(0)));
54996 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54997}
54998
54999 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
55000 const X86Subtarget &Subtarget) {
55001 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
55002 "Invalid opcode for combing with CTLZ");
55003 if (Subtarget.hasFastLZCNT())
55004 return SDValue();
55005
55006 EVT VT = N->getValueType(0);
55007 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
55008 (VT != MVT::i64 || !Subtarget.is64Bit()))
55009 return SDValue();
55010
55011 SDValue N0 = N->getOperand(0);
55012 SDValue N1 = N->getOperand(1);
55013
55014 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
55015 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
55016 return SDValue();
55017
55018 SDValue OpCTLZ;
55019 SDValue OpSizeTM1;
55020
55021 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
55022 OpCTLZ = N1;
55023 OpSizeTM1 = N0;
55024 } else if (N->getOpcode() == ISD::SUB) {
55025 return SDValue();
55026 } else {
55027 OpCTLZ = N0;
55028 OpSizeTM1 = N1;
55029 }
55030
55031 if (!OpCTLZ.hasOneUse())
55032 return SDValue();
55033 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
55034 if (!C)
55035 return SDValue();
55036
55037 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55038 return SDValue();
55039 EVT OpVT = VT;
55040 SDValue Op = OpCTLZ.getOperand(0);
55041 if (VT == MVT::i8) {
55042 // Zero extend to i32 since there is no i8 BSR.
55043 OpVT = MVT::i32;
55044 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55045 }
55046
55047 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55048 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55049 if (VT == MVT::i8)
55050 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55051
55052 return Op;
55053}
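// Editorial sketch (assumes a GCC/Clang-style builtin): the common floor-log2
// idiom is exactly XOR(CTLZ_ZERO_UNDEF(X), bitwidth - 1):
//
//   static int floor_log2_u32(unsigned X) { // precondition: X != 0
//     return 31 ^ __builtin_clz(X);
//   }
//
// and this combine lets it select to a single BSR when fast LZCNT is absent.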
55054
55055 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55056 TargetLowering::DAGCombinerInfo &DCI,
55057 const X86Subtarget &Subtarget) {
55058 SDValue N0 = N->getOperand(0);
55059 SDValue N1 = N->getOperand(1);
55060 EVT VT = N->getValueType(0);
55061 SDLoc DL(N);
55062
55063 // If this is SSE1 only convert to FXOR to avoid scalarization.
55064 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55065 return DAG.getBitcast(MVT::v4i32,
55066 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55067 DAG.getBitcast(MVT::v4f32, N0),
55068 DAG.getBitcast(MVT::v4f32, N1)));
55069 }
55070
55071 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55072 return Cmp;
55073
55074 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55075 return R;
55076
55077 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55078 return R;
55079
55080 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55081 return R;
55082
55083 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55084 DAG, DCI, Subtarget))
55085 return FPLogic;
55086
55087 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55088 return R;
55089
55090 if (DCI.isBeforeLegalizeOps())
55091 return SDValue();
55092
55093 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55094 return SetCC;
55095
55096 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55097 return R;
55098
55099 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55100 return RV;
55101
55102 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55104 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55105 N0.getOperand(0).getValueType().isVector() &&
55106 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55107 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55108 return DAG.getBitcast(
55109 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55110 }
55111
55112 // Handle AVX512 mask widening.
55113 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55114 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55115 VT.getVectorElementType() == MVT::i1 &&
55116 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55117 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55118 return DAG.getNode(
55119 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55120 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55121 N0.getOperand(2));
55122 }
55123
55124 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55125 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55126 // TODO: Under what circumstances could this be performed in DAGCombine?
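// e.g. with i8 c1 = 0x0F zero-extended to i16 and c2 = 0x00F0:
// xor(zext(xor(x, 0x0F)), 0x00F0) becomes xor(zext(x), 0x00FF).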
55127 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55128 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55129 SDValue TruncExtSrc = N0.getOperand(0);
55130 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55131 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55132 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55133 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55134 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55135 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55136 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55137 }
55138 }
55139
55140 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55141 return R;
55142
55143 return combineFneg(N, DAG, DCI, Subtarget);
55144}
55145
55146 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55147 TargetLowering::DAGCombinerInfo &DCI,
55148 const X86Subtarget &Subtarget) {
55149 SDValue N0 = N->getOperand(0);
55150 EVT VT = N->getValueType(0);
55151
55152 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
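// Reversing the vXi1 elements with a shuffle reverses the bits of the iX
// value, e.g. bitreverse(i16 bitcast(v16i1 X)) == bitcast(shuffle(X, <15..0>)).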
55153 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55154 SDValue Src = N0.getOperand(0);
55155 EVT SrcVT = Src.getValueType();
55156 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55157 (DCI.isBeforeLegalize() ||
55158 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55159 Subtarget.hasSSSE3()) {
55160 unsigned NumElts = SrcVT.getVectorNumElements();
55161 SmallVector<int, 32> ReverseMask(NumElts);
55162 for (unsigned I = 0; I != NumElts; ++I)
55163 ReverseMask[I] = (NumElts - 1) - I;
55164 SDValue Rev =
55165 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55166 return DAG.getBitcast(VT, Rev);
55167 }
55168 }
55169
55170 return SDValue();
55171}
55172
55173// Various combines to try to convert to avgceilu.
55174 static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55175 TargetLowering::DAGCombinerInfo &DCI,
55176 const X86Subtarget &Subtarget) {
55177 unsigned Opcode = N->getOpcode();
55178 SDValue N0 = N->getOperand(0);
55179 SDValue N1 = N->getOperand(1);
55180 EVT VT = N->getValueType(0);
55181 EVT SVT = VT.getScalarType();
55182 SDLoc DL(N);
55183
55184 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55185 // Only useful on vXi8 which doesn't have good SRA handling.
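// XOR with 0x80 biases each signed i8 lane into unsigned order, so the
// unsigned ceiling-average of the biased inputs, un-biased with the same
// XOR, equals the signed ceiling-average of the originals.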
55186 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55187 APInt SignBit = APInt::getSignMask(8);
55188 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55189 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55190 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55191 return DAG.getNode(ISD::XOR, DL, VT,
55192 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55193 }
55194
55195 return SDValue();
55196}
55197
55198 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
55199 TargetLowering::DAGCombinerInfo &DCI,
55200 const X86Subtarget &Subtarget) {
55201 EVT VT = N->getValueType(0);
55202 unsigned NumBits = VT.getSizeInBits();
55203
55204 // TODO - Constant Folding.
55205
55206 // Simplify the inputs.
55207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55208 APInt DemandedMask(APInt::getAllOnes(NumBits));
55209 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55210 return SDValue(N, 0);
55211
55212 return SDValue();
55213}
55214
55215 static bool isNullFPScalarOrVectorConst(SDValue V) {
55216 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55217}
55218
55219/// If a value is a scalar FP zero or a vector FP zero (potentially including
55220/// undefined elements), return a zero constant that may be used to fold away
55221/// that value. In the case of a vector, the returned constant will not contain
55222/// undefined elements even if the input parameter does. This makes it suitable
55223/// to be used as a replacement operand with operations (eg, bitwise-and) where
55224/// an undef should not propagate.
55225 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55226 const X86Subtarget &Subtarget) {
55227 if (!isNullFPScalarOrVectorConst(V))
55228 return SDValue();
55229
55230 if (V.getValueType().isVector())
55231 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55232
55233 return V;
55234}
55235
55236 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55237 const X86Subtarget &Subtarget) {
55238 SDValue N0 = N->getOperand(0);
55239 SDValue N1 = N->getOperand(1);
55240 EVT VT = N->getValueType(0);
55241 SDLoc DL(N);
55242
55243 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55244 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55245 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55246 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55247 return SDValue();
55248
55249 auto isAllOnesConstantFP = [](SDValue V) {
55250 if (V.getSimpleValueType().isVector())
55251 return ISD::isBuildVectorAllOnes(V.getNode());
55252 auto *C = dyn_cast<ConstantFPSDNode>(V);
55253 return C && C->getConstantFPValue()->isAllOnesValue();
55254 };
55255
55256 // fand (fxor X, -1), Y --> fandn X, Y
55257 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55258 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55259
55260 // fand X, (fxor Y, -1) --> fandn Y, X
55261 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55262 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55263
55264 return SDValue();
55265}
55266
55267/// Do target-specific dag combines on X86ISD::FAND nodes.
55268 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55269 const X86Subtarget &Subtarget) {
55270 // FAND(0.0, x) -> 0.0
55271 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55272 return V;
55273
55274 // FAND(x, 0.0) -> 0.0
55275 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55276 return V;
55277
55278 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55279 return V;
55280
55281 return lowerX86FPLogicOp(N, DAG, Subtarget);
55282}
55283
55284/// Do target-specific dag combines on X86ISD::FANDN nodes.
55285 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55286 const X86Subtarget &Subtarget) {
55287 // FANDN(0.0, x) -> x
55288 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55289 return N->getOperand(1);
55290
55291 // FANDN(x, 0.0) -> 0.0
55292 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55293 return V;
55294
55295 return lowerX86FPLogicOp(N, DAG, Subtarget);
55296}
55297
55298/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55299 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55300 TargetLowering::DAGCombinerInfo &DCI,
55301 const X86Subtarget &Subtarget) {
55302 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55303
55304 // F[X]OR(0.0, x) -> x
55305 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55306 return N->getOperand(1);
55307
55308 // F[X]OR(x, 0.0) -> x
55309 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55310 return N->getOperand(0);
55311
55312 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55313 return NewVal;
55314
55315 return lowerX86FPLogicOp(N, DAG, Subtarget);
55316}
55317
55318/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55319 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55320 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55321
55322 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55323 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55324 !DAG.getTarget().Options.NoSignedZerosFPMath)
55325 return SDValue();
55326
55327 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55328 // into FMINC and FMAXC, which are commutative operations.
55329 unsigned NewOp = 0;
55330 switch (N->getOpcode()) {
55331 default: llvm_unreachable("unknown opcode");
55332 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55333 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55334 }
55335
55336 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55337 N->getOperand(0), N->getOperand(1));
55338}
55339
55340 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55341 const X86Subtarget &Subtarget) {
55342 EVT VT = N->getValueType(0);
55343 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55344 return SDValue();
55345
55346 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55347
55348 auto IsMinMaxLegal = [&](EVT VT) {
55349 if (!TLI.isTypeLegal(VT))
55350 return false;
55351 return VT.getScalarType() != MVT::f16 ||
55352 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55353 };
55354
55355 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55356 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55357 (Subtarget.hasFP16() && VT == MVT::f16) ||
55358 (VT.isVector() && IsMinMaxLegal(VT))))
55359 return SDValue();
55360
55361 SDValue Op0 = N->getOperand(0);
55362 SDValue Op1 = N->getOperand(1);
55363 SDLoc DL(N);
55364 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55365
55366 // If we don't have to respect NaN inputs, this is a direct translation to x86
55367 // min/max instructions.
55368 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55369 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55370
55371 // If one of the operands is known non-NaN use the native min/max instructions
55372 // with the non-NaN input as second operand.
55373 if (DAG.isKnownNeverNaN(Op1))
55374 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55375 if (DAG.isKnownNeverNaN(Op0))
55376 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55377
55378 // If we have to respect NaN inputs, this takes at least 3 instructions.
55379 // Favor a library call when operating on a scalar and minimizing code size.
55380 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55381 return SDValue();
55382
55383 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55384 VT);
55385
55386 // There are 4 possibilities involving NaN inputs, and these are the required
55387 // outputs:
55388 // Op1
55389 // Num NaN
55390 // ----------------
55391 // Num | Max | Op0 |
55392 // Op0 ----------------
55393 // NaN | Op1 | NaN |
55394 // ----------------
55395 //
55396 // The SSE FP max/min instructions were not designed for this case, but rather
55397 // to implement:
55398 // Min = Op1 < Op0 ? Op1 : Op0
55399 // Max = Op1 > Op0 ? Op1 : Op0
55400 //
55401 // So they always return Op0 if either input is a NaN. However, we can still
55402 // use those instructions for fmaxnum by selecting away a NaN input.
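// Worked example: fmaxnum(Op0 = NaN, Op1 = 2.0) computes MAX(Op1, Op0),
// which passes through Op0 (the NaN); the SETUO test on Op0 then selects
// Op1, giving 2.0 as fmaxnum requires. With Op0 = 2.0 and Op1 = NaN, MAX
// already returns Op0 and the select keeps it.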
55403
55404 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55405 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55406 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55407
55408 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55409 // are NaN, the NaN value of Op1 is the result.
55410 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55411}
55412
55413 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55414 TargetLowering::DAGCombinerInfo &DCI) {
55415 EVT VT = N->getValueType(0);
55416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55417
55418 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55419 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55420 return SDValue(N, 0);
55421
55422 // Convert a full vector load into vzload when not all bits are needed.
55423 SDValue In = N->getOperand(0);
55424 MVT InVT = In.getSimpleValueType();
55425 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55426 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55427 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55428 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55429 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55430 MVT MemVT = MVT::getIntegerVT(NumBits);
55431 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55432 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55433 SDLoc dl(N);
55434 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55435 DAG.getBitcast(InVT, VZLoad));
55436 DCI.CombineTo(N, Convert);
55437 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55438 DCI.recursivelyDeleteUnusedNodes(LN);
55439 return SDValue(N, 0);
55440 }
55441 }
55442
55443 return SDValue();
55444}
55445
55446 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55447 TargetLowering::DAGCombinerInfo &DCI) {
55448 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55449 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55450 EVT VT = N->getValueType(0);
55451
55452 // Convert a full vector load into vzload when not all bits are needed.
55453 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55454 MVT InVT = In.getSimpleValueType();
55455 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55456 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55457 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55458 LoadSDNode *LN = cast<LoadSDNode>(In);
55459 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55460 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55461 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55462 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55463 SDLoc dl(N);
55464 if (IsStrict) {
55465 SDValue Convert =
55466 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55467 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55468 DCI.CombineTo(N, Convert, Convert.getValue(1));
55469 } else {
55470 SDValue Convert =
55471 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55472 DCI.CombineTo(N, Convert);
55473 }
55474 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55475 DCI.recursivelyDeleteUnusedNodes(LN);
55476 return SDValue(N, 0);
55477 }
55478 }
55479
55480 return SDValue();
55481}
55482
55483/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55484 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55485 TargetLowering::DAGCombinerInfo &DCI,
55486 const X86Subtarget &Subtarget) {
55487 SDValue N0 = N->getOperand(0);
55488 SDValue N1 = N->getOperand(1);
55489 MVT VT = N->getSimpleValueType(0);
55490 int NumElts = VT.getVectorNumElements();
55491 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55492 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55493 SDLoc DL(N);
55494
55495 // ANDNP(undef, x) -> 0
55496 // ANDNP(x, undef) -> 0
55497 if (N0.isUndef() || N1.isUndef())
55498 return DAG.getConstant(0, DL, VT);
55499
55500 // ANDNP(0, x) -> x
55501 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55502 return N1;
55503
55504 // ANDNP(x, 0) -> 0
55505 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55506 return DAG.getConstant(0, DL, VT);
55507
55508 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55509 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55510 return DAG.getNOT(DL, N0, VT);
55511
55512 // Turn ANDNP back to AND if input is inverted.
55513 if (SDValue Not = IsNOT(N0, DAG))
55514 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55515
55516 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55517 // to make use of predicated selects.
55518 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55519 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55520 SDValue Src = N0.getOperand(0);
55521 EVT SrcVT = Src.getValueType();
55522 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55523 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55524 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55525 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55526 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55527 getZeroVector(VT, Subtarget, DAG, DL));
55528 }
55529
55530 // Constant Folding
55531 APInt Undefs0, Undefs1;
55532 SmallVector<APInt> EltBits0, EltBits1;
55533 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55534 /*AllowWholeUndefs*/ true,
55535 /*AllowPartialUndefs*/ true)) {
55536 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55537 /*AllowWholeUndefs*/ true,
55538 /*AllowPartialUndefs*/ true)) {
55539 SmallVector<APInt> ResultBits;
55540 for (int I = 0; I != NumElts; ++I)
55541 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55542 return getConstVector(ResultBits, VT, DAG, DL);
55543 }
55544
55545 // Constant fold NOT(N0) to allow us to use AND.
55546 // Ensure this is only performed if we can confirm that the bitcasted source
55547 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55548 if (N0->hasOneUse()) {
55549 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55550 if (BC0.getOpcode() != ISD::BITCAST) {
55551 for (APInt &Elt : EltBits0)
55552 Elt = ~Elt;
55553 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55554 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55555 }
55556 }
55557 }
55558
55559 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55560 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55561 SDValue Op(N, 0);
55562 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55563 return Res;
55564
55565 // If either operand is a constant mask, then only the elements that aren't
55566 // zero are actually demanded by the other operand.
55567 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55568 APInt UndefElts;
55569 SmallVector<APInt> EltBits;
55570 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55571 APInt DemandedElts = APInt::getAllOnes(NumElts);
55572 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55573 EltBits)) {
55574 DemandedBits.clearAllBits();
55575 DemandedElts.clearAllBits();
55576 for (int I = 0; I != NumElts; ++I) {
55577 if (UndefElts[I]) {
55578 // We can't assume an undef src element gives an undef dst - the
55579 // other src might be zero.
55580 DemandedBits.setAllBits();
55581 DemandedElts.setBit(I);
55582 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55583 (!Invert && !EltBits[I].isZero())) {
55584 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55585 DemandedElts.setBit(I);
55586 }
55587 }
55588 }
55589 return std::make_pair(DemandedBits, DemandedElts);
55590 };
55591 APInt Bits0, Elts0;
55592 APInt Bits1, Elts1;
55593 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55594 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55595
55596 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55597 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55598 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55599 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55600 if (N->getOpcode() != ISD::DELETED_NODE)
55601 DCI.AddToWorklist(N);
55602 return SDValue(N, 0);
55603 }
55604 }
55605
55606 // Folds for better commutativity:
55607 if (N1->hasOneUse()) {
55608 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55609 if (SDValue Not = IsNOT(N1, DAG))
55610 return DAG.getNOT(
55611 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55612
55613 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55614 // Zero out elements by setting the PSHUFB mask value to 0xFF.
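// N0 is known here to be all-ones or all-zero per element, so OR-ing it into
// the PSHUFB mask sets the mask bytes of the lanes ANDNP would clear to 0xFF
// (which PSHUFB treats as "write zero") and leaves the other lanes untouched.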
55615 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55616 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55617 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55618 EVT ShufVT = BC1.getValueType();
55619 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55620 DAG.getBitcast(ShufVT, N0));
55621 SDValue NewShuf =
55622 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55623 return DAG.getBitcast(VT, NewShuf);
55624 }
55625 }
55626 }
55627
55628 return SDValue();
55629}
55630
55631 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55632 TargetLowering::DAGCombinerInfo &DCI) {
55633 SDValue N1 = N->getOperand(1);
55634
55635 // BT ignores high bits in the bit index operand.
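// On x86, BT with a register operand uses the bit index modulo the operand
// width, so only Log2(BitWidth) low bits of the index matter; masks such as
// (and idx, 31) feeding a 32-bit BT can therefore be simplified away.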
55636 unsigned BitWidth = N1.getValueSizeInBits();
55637 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55638 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55639 if (N->getOpcode() != ISD::DELETED_NODE)
55640 DCI.AddToWorklist(N);
55641 return SDValue(N, 0);
55642 }
55643
55644 return SDValue();
55645}
55646
55647 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55648 TargetLowering::DAGCombinerInfo &DCI) {
55649 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55650 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55651
55652 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55653 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55654 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55655 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55656 if (N->getOpcode() != ISD::DELETED_NODE)
55657 DCI.AddToWorklist(N);
55658 return SDValue(N, 0);
55659 }
55660
55661 // Convert a full vector load into vzload when not all bits are needed.
55662 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55663 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55664 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55665 SDLoc dl(N);
55666 if (IsStrict) {
55667 SDValue Convert = DAG.getNode(
55668 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55669 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55670 DCI.CombineTo(N, Convert, Convert.getValue(1));
55671 } else {
55672 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55673 DAG.getBitcast(MVT::v8i16, VZLoad));
55674 DCI.CombineTo(N, Convert);
55675 }
55676
55677 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55678 DCI.recursivelyDeleteUnusedNodes(LN);
55679 return SDValue(N, 0);
55680 }
55681 }
55682 }
55683
55684 return SDValue();
55685}
55686
55687// Try to combine sext_in_reg of a cmov of constants by extending the constants.
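// e.g. sext_in_reg from i8 of (cmov 0x80, 0x7F) becomes (cmov 0xFFFFFF80, 0x7F):
// the in-register sign-extension is folded into the constant operands.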
55688 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55689 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55690
55691 EVT DstVT = N->getValueType(0);
55692
55693 SDValue N0 = N->getOperand(0);
55694 SDValue N1 = N->getOperand(1);
55695 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55696
55697 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55698 return SDValue();
55699
55700 // Look through single use any_extends / truncs.
55701 SDValue IntermediateBitwidthOp;
55702 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55703 N0.hasOneUse()) {
55704 IntermediateBitwidthOp = N0;
55705 N0 = N0.getOperand(0);
55706 }
55707
55708 // See if we have a single use cmov.
55709 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55710 return SDValue();
55711
55712 SDValue CMovOp0 = N0.getOperand(0);
55713 SDValue CMovOp1 = N0.getOperand(1);
55714
55715 // Make sure both operands are constants.
55716 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55717 !isa<ConstantSDNode>(CMovOp1.getNode()))
55718 return SDValue();
55719
55720 SDLoc DL(N);
55721
55722 // If we looked through an any_extend/trunc above, apply the same operation to the constants.
55723 if (IntermediateBitwidthOp) {
55724 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55725 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55726 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55727 }
55728
55729 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55730 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55731
55732 EVT CMovVT = DstVT;
55733 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55734 if (DstVT == MVT::i16) {
55735 CMovVT = MVT::i32;
55736 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55737 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55738 }
55739
55740 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55741 N0.getOperand(2), N0.getOperand(3));
55742
55743 if (CMovVT != DstVT)
55744 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55745
55746 return CMov;
55747}
55748
55749 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55750 const X86Subtarget &Subtarget) {
55751 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55752
55753 if (SDValue V = combineSextInRegCmov(N, DAG))
55754 return V;
55755
55756 EVT VT = N->getValueType(0);
55757 SDValue N0 = N->getOperand(0);
55758 SDValue N1 = N->getOperand(1);
55759 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55760 SDLoc dl(N);
55761
55762 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
55763 // SSE and AVX2 since there is no sign-extended shift right
55764 // operation on a vector with 64-bit elements.
55765 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55766 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55767 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55768 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55769 SDValue N00 = N0.getOperand(0);
55770
55771 // EXTLOAD has a better solution on AVX2,
55772 // it may be replaced with X86ISD::VSEXT node.
55773 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55774 if (!ISD::isNormalLoad(N00.getNode()))
55775 return SDValue();
55776
55777 // Attempt to promote any comparison mask ops before moving the
55778 // SIGN_EXTEND_INREG in the way.
55779 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55780 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55781
55782 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55783 SDValue Tmp =
55784 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55785 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55786 }
55787 }
55788 return SDValue();
55789}
55790
55791/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55792/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55793/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55794/// opportunities to combine math ops, use an LEA, or use a complex addressing
55795/// mode. This can eliminate extend, add, and shift instructions.
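/// For example, i64 sext(add nsw (i32 x, 40)) becomes add(sext(x), 40), and a
/// later add or shl user of that result can then be merged into a single LEA
/// addressing computation.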
55796 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55797 const X86Subtarget &Subtarget) {
55798 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55799 Ext->getOpcode() != ISD::ZERO_EXTEND)
55800 return SDValue();
55801
55802 // TODO: This should be valid for other integer types.
55803 EVT VT = Ext->getValueType(0);
55804 if (VT != MVT::i64)
55805 return SDValue();
55806
55807 SDValue Add = Ext->getOperand(0);
55808 if (Add.getOpcode() != ISD::ADD)
55809 return SDValue();
55810
55811 SDValue AddOp0 = Add.getOperand(0);
55812 SDValue AddOp1 = Add.getOperand(1);
55813 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55814 bool NSW = Add->getFlags().hasNoSignedWrap();
55815 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55816 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55817 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55818
55819 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55820 // into the 'zext'
55821 if ((Sext && !NSW) || (!Sext && !NUW))
55822 return SDValue();
55823
55824 // Having a constant operand to the 'add' ensures that we are not increasing
55825 // the instruction count because the constant is extended for free below.
55826 // A constant operand can also become the displacement field of an LEA.
55827 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55828 if (!AddOp1C)
55829 return SDValue();
55830
55831 // Don't make the 'add' bigger if there's no hope of combining it with some
55832 // other 'add' or 'shl' instruction.
55833 // TODO: It may be profitable to generate simpler LEA instructions in place
55834 // of single 'add' instructions, but the cost model for selecting an LEA
55835 // currently has a high threshold.
55836 bool HasLEAPotential = false;
55837 for (auto *User : Ext->users()) {
55838 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55839 HasLEAPotential = true;
55840 break;
55841 }
55842 }
55843 if (!HasLEAPotential)
55844 return SDValue();
55845
55846 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55847 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55848 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55849 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55850
55851 // The wider add is guaranteed to not wrap because both operands are
55852 // sign-extended.
55853 SDNodeFlags Flags;
55854 Flags.setNoSignedWrap(NSW);
55855 Flags.setNoUnsignedWrap(NUW);
55856 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55857}
55858
55859// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55860// operands and the result of CMOV is not used anywhere else - promote CMOV
55861// itself instead of promoting its result. This could be beneficial, because:
55862// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55863// (or more) pseudo-CMOVs only when they go one-after-another and
55864// getting rid of result extension code after CMOV will help that.
55865// 2) Promotion of constant CMOV arguments is free, hence the
55866// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55867// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55868// promotion is also good in terms of code-size.
55869// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55870// promotion).
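// For example, (zext i32 (cmov i16 7, 9, cc)) becomes (cmov i32 7, 9, cc):
// the constants are widened for free and the separate extension of the CMOV
// result disappears.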
55871 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55872 SDValue CMovN = Extend->getOperand(0);
55873 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55874 return SDValue();
55875
55876 EVT TargetVT = Extend->getValueType(0);
55877 unsigned ExtendOpcode = Extend->getOpcode();
55878 SDLoc DL(Extend);
55879
55880 EVT VT = CMovN.getValueType();
55881 SDValue CMovOp0 = CMovN.getOperand(0);
55882 SDValue CMovOp1 = CMovN.getOperand(1);
55883
55884 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55885 !isa<ConstantSDNode>(CMovOp1.getNode()))
55886 return SDValue();
55887
55888 // Only extend to i32 or i64.
55889 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55890 return SDValue();
55891
55892 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55893 // are free.
55894 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55895 return SDValue();
55896
55897 // If this a zero extend to i64, we should only extend to i32 and use a free
55898 // zero extend to finish.
55899 EVT ExtendVT = TargetVT;
55900 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55901 ExtendVT = MVT::i32;
55902
55903 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55904 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55905
55906 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55907 CMovN.getOperand(2), CMovN.getOperand(3));
55908
55909 // Finish extending if needed.
55910 if (ExtendVT != TargetVT)
55911 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55912
55913 return Res;
55914}
55915
55916// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55917// result type.
55918 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55919 const X86Subtarget &Subtarget) {
55920 SDValue N0 = N->getOperand(0);
55921 EVT VT = N->getValueType(0);
55922 SDLoc dl(N);
55923
55924 // Only do this combine with AVX512 for vector extends.
55925 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55926 return SDValue();
55927
55928 // Only combine legal element types.
55929 EVT SVT = VT.getVectorElementType();
55930 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55931 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55932 return SDValue();
55933
55934 // We don't have CMPP Instruction for vxf16
55935 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55936 return SDValue();
55937 // We can only do this if the vector size is 256 bits or less.
55938 unsigned Size = VT.getSizeInBits();
55939 if (Size > 256 && Subtarget.useAVX512Regs())
55940 return SDValue();
55941
55942 EVT N00VT = N0.getOperand(0).getValueType();
55943
55944 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55945 // those are the only integer compares we have.
55946 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55947 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55948 return SDValue();
55949
55950 // Only do this combine if the extension will be fully consumed by the setcc.
55951 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55952 if (Size != MatchingVecType.getSizeInBits())
55953 return SDValue();
55954
55955 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55956
55957 if (N->getOpcode() == ISD::ZERO_EXTEND)
55958 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55959
55960 return Res;
55961}
55962
55963 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55964 TargetLowering::DAGCombinerInfo &DCI,
55965 const X86Subtarget &Subtarget) {
55966 SDValue N0 = N->getOperand(0);
55967 EVT VT = N->getValueType(0);
55968 SDLoc DL(N);
55969
55970 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55971 if (!DCI.isBeforeLegalizeOps() &&
55972 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55973 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55974 N0->getOperand(1));
55975 bool ReplaceOtherUses = !N0.hasOneUse();
55976 DCI.CombineTo(N, Setcc);
55977 // Replace other uses with a truncate of the widened setcc_carry.
55978 if (ReplaceOtherUses) {
55979 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55980 N0.getValueType(), Setcc);
55981 DCI.CombineTo(N0.getNode(), Trunc);
55982 }
55983
55984 return SDValue(N, 0);
55985 }
55986
55987 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55988 return NewCMov;
55989
55990 if (!DCI.isBeforeLegalizeOps())
55991 return SDValue();
55992
55993 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55994 return V;
55995
55996 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55997 DAG, DCI, Subtarget))
55998 return V;
55999
56000 if (VT.isVector()) {
56001 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
56002 return R;
56003
56005 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
56006 }
56007
56008 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56009 return NewAdd;
56010
56011 return SDValue();
56012}
56013
56014// Inverting a constant vector is profitable if it can be eliminated and the
56015// inverted vector is already present in DAG. Otherwise, it will be loaded
56016// anyway.
56017//
56018// We determine which of the values can be completely eliminated and invert it.
56019// If both are eliminable, select a vector with the first negative element.
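// For example, if both <2.0, 3.0> and <-2.0, -3.0> are used only by FMAs, the
// variant whose first non-undef element is negative is kept, so repeated runs
// of this combine settle on one canonical constant instead of flip-flopping.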
56022 "ConstantFP build vector expected");
56023 // Check if we can eliminate V. We assume that if a value is only used in
56024 // FMAs, we can eliminate it, since this function is invoked for each FMA
56025 // that uses this vector.
56026 auto IsNotFMA = [](SDNode *User) {
56027 return User->getOpcode() != ISD::FMA &&
56028 User->getOpcode() != ISD::STRICT_FMA;
56029 };
56030 if (llvm::any_of(V->users(), IsNotFMA))
56031 return SDValue();
56032
56033 SmallVector<SDValue, 8> Ops;
56034 EVT VT = V.getValueType();
56035 EVT EltVT = VT.getVectorElementType();
56036 for (const SDValue &Op : V->op_values()) {
56037 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56038 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56039 } else {
56040 assert(Op.isUndef());
56041 Ops.push_back(DAG.getUNDEF(EltVT));
56042 }
56043 }
56044
56045 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56046 if (!NV)
56047 return SDValue();
56048
56049 // If an inverted version cannot be eliminated, choose it instead of the
56050 // original version.
56051 if (llvm::any_of(NV->users(), IsNotFMA))
56052 return SDValue(NV, 0);
56053
56054 // If the inverted version also can be eliminated, we have to consistently
56055 // prefer one of the values. We prefer a constant with a negative value on
56056 // the first place.
56057 // N.B. We need to skip undefs that may precede a value.
56058 for (const SDValue &Op : V->op_values()) {
56059 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56060 if (Cst->isNegative())
56061 return SDValue();
56062 break;
56063 }
56064 }
56065 return SDValue(NV, 0);
56066}
56067
56068 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56069 TargetLowering::DAGCombinerInfo &DCI,
56070 const X86Subtarget &Subtarget) {
56071 SDLoc dl(N);
56072 EVT VT = N->getValueType(0);
56073 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56074 bool IsStrict = N->isTargetOpcode()
56075 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56076 : N->isStrictFPOpcode();
56077
56078 // Let legalize expand this if it isn't a legal type yet.
56079 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56080 if (!TLI.isTypeLegal(VT))
56081 return SDValue();
56082
56083 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56084 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56085 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56086
56087 // If the operation allows fast-math and the target does not support FMA,
56088 // split this into mul+add to avoid libcall(s).
56089 SDNodeFlags Flags = N->getFlags();
56090 if (!IsStrict && Flags.hasAllowReassociation() &&
56091 TLI.isOperationExpand(ISD::FMA, VT)) {
56092 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56093 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56094 }
56095
56096 EVT ScalarVT = VT.getScalarType();
56097 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56098 !Subtarget.hasAnyFMA()) &&
56099 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56100 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56101 return SDValue();
56102
56103 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56104 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56105 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56106 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56107 CodeSize)) {
56108 V = NegV;
56109 return true;
56110 }
56111 // Look through extract_vector_elts. If it comes from an FNEG, create a
56112 // new extract from the FNEG input.
56113 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56114 isNullConstant(V.getOperand(1))) {
56115 SDValue Vec = V.getOperand(0);
56116 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56117 Vec, DAG, LegalOperations, CodeSize)) {
56118 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56119 NegV, V.getOperand(1));
56120 return true;
56121 }
56122 }
56123 // Lookup if there is an inverted version of constant vector V in DAG.
56124 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56125 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56126 V = NegV;
56127 return true;
56128 }
56129 }
56130 return false;
56131 };
56132
56133 // Do not convert the passthru input of scalar intrinsics.
56134 // FIXME: We could allow negations of the lower element only.
56135 bool NegA = invertIfNegative(A);
56136 // Create a dummy use for A so that in the process of negating B or C
56137 // recursively, it is not deleted.
56138 HandleSDNode NegAHandle(A);
56139 bool NegB = invertIfNegative(B);
56140 // Similar to A, get a handle on B.
56141 HandleSDNode NegBHandle(B);
56142 bool NegC = invertIfNegative(C);
56143
56144 if (!NegA && !NegB && !NegC)
56145 return SDValue();
56146
56147 unsigned NewOpcode =
56148 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56149
56150 // Propagate fast-math-flags to new FMA node.
56151 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56152 if (IsStrict) {
56153 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56154 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56155 {N->getOperand(0), A, B, C});
56156 } else {
56157 if (N->getNumOperands() == 4)
56158 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56159 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56160 }
56161}
56162
56163// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56164// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56165 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56166 TargetLowering::DAGCombinerInfo &DCI) {
56167 SDLoc dl(N);
56168 EVT VT = N->getValueType(0);
56169 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56170 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56171 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56172
56173 SDValue N2 = N->getOperand(2);
56174
56175 SDValue NegN2 =
56176 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56177 if (!NegN2)
56178 return SDValue();
56179 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56180
56181 if (N->getNumOperands() == 4)
56182 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56183 NegN2, N->getOperand(3));
56184 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56185 NegN2);
56186}
56187
56188 // Try to widen the build vector and bitcast it to the type of the zext.
56189 // This is a special case for the 128-bit vector types. The intention is to
56190 // remove the zext and replace it with a bitcast to the wider type. While
56191 // lowering, the bitcast is removed and the extra computation due to the zext
56192 // is avoided. For example:
56193 // zext v4i16 (v4i8 build_vector (x, y, z, w)) ->
56194 // bitcast v4i16 (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0))
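// On little-endian x86 each (element, 0) byte pair in the widened v8i8 vector
// is exactly the zero-extended 16-bit lane, which is why a plain bitcast of
// the widened build_vector reproduces the zext result.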
56195 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56196
56197 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56198 return SDValue();
56199
56200 EVT ExtendVT = Extend->getValueType(0);
56201
56202 SDValue BV = Extend->getOperand(0);
56203 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56204 return SDValue();
56205
56206 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56207 // If the build vector has undef elements, we cannot widen it.
56208 // The widening would create a vector with more undef elements, which
56209 // is not valid.
56210 return SDValue();
56211 }
56212
56213 if (!all_of(BV->op_values(),
56214 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56215 // If the build vector has any element other than ISD::LOAD, we cannot
56216 // widen it.
56217 return SDValue();
56218 }
56219
56220 SDLoc dl(BV);
56221 EVT VT = BV.getValueType();
56222 EVT EltVT = BV.getOperand(0).getValueType();
56223 unsigned NumElts = VT.getVectorNumElements();
56224
56225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56226
56227 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56228 TargetLowering::TypeWidenVector)
56229 return SDValue();
56230
56231 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56232 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56233
56234 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56235 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56236 // Fill the new elements with Zero.
56237 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56238 // Compute the step to place the elements in the right place and control the
56239 // iteration.
56240 unsigned step = WidenNumElts / NumElts;
56241 if (WidenVT.is128BitVector()) {
56242 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56243 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56244 i--, j -= step) {
56245 SDValue temp = NewOps[i];
56246 NewOps[i] = NewOps[j];
56247 NewOps[j] = temp;
56248 }
56249 // Create new build vector with WidenVT and NewOps
56250 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56251 // Replace the old build vector with the new one. Bitcast the
56252 // new build vector to the type of the zext.
56253 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56254 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56255 return NewBV;
56256 }
56257 }
56258 return SDValue();
56259}
56260
56261 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56262 TargetLowering::DAGCombinerInfo &DCI,
56263 const X86Subtarget &Subtarget) {
56264 SDLoc dl(N);
56265 SDValue N0 = N->getOperand(0);
56266 EVT VT = N->getValueType(0);
56267
56268 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56269 // FIXME: Is this needed? We don't seem to have any tests for it.
56270 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56271 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56272 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56273 N0->getOperand(1));
56274 bool ReplaceOtherUses = !N0.hasOneUse();
56275 DCI.CombineTo(N, Setcc);
56276 // Replace other uses with a truncate of the widened setcc_carry.
56277 if (ReplaceOtherUses) {
56278 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56279 N0.getValueType(), Setcc);
56280 DCI.CombineTo(N0.getNode(), Trunc);
56281 }
56282
56283 return SDValue(N, 0);
56284 }
56285
56286 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56287 return NewCMov;
56288
56289 if (DCI.isBeforeLegalizeOps())
56290 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56291 return V;
56292
56293 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56294 DAG, DCI, Subtarget))
56295 return V;
56296
56297 if (VT.isVector())
56298 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56299 return R;
56300
56301 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56302 return NewAdd;
56303
56304 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56305 return R;
56306
56307 // TODO: Combine with any target/faux shuffle.
56308 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56309 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
56310 SDValue N00 = N0.getOperand(0);
56311 SDValue N01 = N0.getOperand(1);
56312 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56313 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56314 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56315 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56316 return concatSubVectors(N00, N01, DAG, dl);
56317 }
56318 }
56319
56320 if (SDValue V = widenBuildVec(N, DAG))
56321 return V;
56322
56323 return SDValue();
56324}
56325
56326/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56327/// pre-promote its result type since vXi1 vectors don't get promoted
56328/// during type legalization.
56329 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56330 SDValue RHS, ISD::CondCode CC,
56331 const SDLoc &DL, SelectionDAG &DAG,
56332 const X86Subtarget &Subtarget) {
56333 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56334 VT.getVectorElementType() == MVT::i1 &&
56335 (OpVT.getVectorElementType() == MVT::i8 ||
56336 OpVT.getVectorElementType() == MVT::i16)) {
56337 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56338 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56339 }
56340 return SDValue();
56341}
56342
56343// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56344// eq/ne) is generated when using an integer as a mask. Instead of generating a
56345// broadcast + vptest, we can directly move the integer to a mask register.
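// e.g. (setcc (and (broadcast i16 x), <1, 2, 4, ...>), 0, ne): lane I is
// non-zero exactly when bit I of x is set, so the whole compare is just x
// (shifted right by N when the table starts at 2^N) moved into a k-register,
// inverted for the seteq case.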
56346 static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56347 const SDLoc &DL, SelectionDAG &DAG,
56348 const X86Subtarget &Subtarget) {
56349 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56350 return SDValue();
56351
56352 if (!Subtarget.hasAVX512())
56353 return SDValue();
56354
56355 if (Op0.getOpcode() != ISD::AND)
56356 return SDValue();
56357
56358 SDValue Broadcast = Op0.getOperand(0);
56359 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56360 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56361 return SDValue();
56362
56363 SDValue Load = Op0.getOperand(1);
56364 EVT LoadVT = Load.getSimpleValueType();
56365
56366 APInt UndefElts;
56367 SmallVector<APInt, 32> EltBits;
56368 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56369 UndefElts, EltBits,
56370 /*AllowWholeUndefs*/ true,
56371 /*AllowPartialUndefs*/ false) ||
56372 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56373 return SDValue();
56374
56375 // Check if the constant pool contains only powers of 2 starting from some
56376 // 2^N. The table may also contain undefs because of widening of vector
56377 // operands.
56378 unsigned N = EltBits[0].logBase2();
56379 unsigned Len = UndefElts.getBitWidth();
56380 for (unsigned I = 1; I != Len; ++I) {
56381 if (UndefElts[I]) {
56382 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56383 return SDValue();
56384 break;
56385 }
56386
56387 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56388 return SDValue();
56389 }
56390
56391 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56392 SDValue BroadcastOp;
56393 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56394 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56395 Broadcast, DAG.getVectorIdxConstant(0, DL));
56396 } else {
56397 BroadcastOp = Broadcast.getOperand(0);
56398 if (BroadcastOp.getValueType().isVector())
56399 return SDValue();
56400 }
56401
56402 SDValue Masked = BroadcastOp;
56403 if (N != 0) {
56404 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56405 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56406
56407 if (NumDefinedElts > BroadcastOpBitWidth)
56408 return SDValue();
56409
56410 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56411 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56412 DAG.getConstant(N, DL, BroadcastOpVT));
56413 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56414 DAG.getConstant(Mask, DL, BroadcastOpVT));
56415 }
56416 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56417 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56418 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56419 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56420
56421 if (CC == ISD::SETEQ)
56422 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56423
56424 if (VT != MVT::v16i1)
56425 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56426 DAG.getVectorIdxConstant(0, DL));
56427
56428 return Bitcast;
56429}
56430
56431 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56432 TargetLowering::DAGCombinerInfo &DCI,
56433 const X86Subtarget &Subtarget) {
56434 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56435 const SDValue LHS = N->getOperand(0);
56436 const SDValue RHS = N->getOperand(1);
56437 EVT VT = N->getValueType(0);
56438 EVT OpVT = LHS.getValueType();
56439 SDLoc DL(N);
56440
56441 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56442 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56443 Subtarget))
56444 return V;
56445 }
56446
56447 if (VT == MVT::i1) {
56448 X86::CondCode X86CC;
56449 if (SDValue V =
56450 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56451 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56452 }
56453
56454 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56455 if (OpVT.isScalarInteger()) {
56456 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56457 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56458 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56459 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56460 if (N0.getOperand(0) == N1)
56461 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56462 N0.getOperand(1));
56463 if (N0.getOperand(1) == N1)
56464 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56465 N0.getOperand(0));
56466 }
56467 return SDValue();
56468 };
56469 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56470 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56471 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56472 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56473
56474 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56475 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56476 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56477 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56478 if (N0.getOperand(0) == N1)
56479 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56480 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56481 if (N0.getOperand(1) == N1)
56482 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56483 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56484 }
56485 return SDValue();
56486 };
56487 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56488 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56489 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56490 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56491
56492 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56493 // cmpne(trunc(x),C) --> cmpne(x,C)
56494 // iff x upper bits are zero.
56495 if (LHS.getOpcode() == ISD::TRUNCATE &&
56496 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56497 isa<ConstantSDNode>(RHS) && !isNullConstant(RHS)) {
56498 EVT SrcVT = LHS.getOperand(0).getValueType();
56499 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56500 OpVT.getScalarSizeInBits());
56501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56502 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56503 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56504 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56505 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56506 }
56507
56508 // With C as a power of 2 and C != 0 and C != INT_MIN:
56509 // icmp eq Abs(X) C ->
56510 // (icmp eq X, C) | (icmp eq X, -C)
56511 // icmp ne Abs(X) C ->
56512 // (icmp ne X, C) & (icmp ne X, -C)
56513 // Both of these patterns can be better optimized in
56514 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56515 // integers which is checked above.
56516 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56517 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56518 const APInt &CInt = C->getAPIntValue();
56519 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56520 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56521 SDValue BaseOp = LHS.getOperand(0);
56522 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56523 SDValue SETCC1 = DAG.getSetCC(
56524 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56525 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56526 SETCC0, SETCC1);
56527 }
56528 }
56529 }
56530 }
56531 }
56532
56533 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56534 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56535 // Using temporaries to avoid messing up operand ordering for later
56536 // transformations if this doesn't work.
56537 SDValue Op0 = LHS;
56538 SDValue Op1 = RHS;
56539 ISD::CondCode TmpCC = CC;
56540 // Put build_vector on the right.
56541 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56542 std::swap(Op0, Op1);
56543 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56544 }
56545
56546 bool IsSEXT0 =
56547 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56548 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56549 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56550
56551 if (IsSEXT0 && IsVZero1) {
56552 assert(VT == Op0.getOperand(0).getValueType() &&
56553 "Unexpected operand type");
56554 if (TmpCC == ISD::SETGT)
56555 return DAG.getConstant(0, DL, VT);
56556 if (TmpCC == ISD::SETLE)
56557 return DAG.getConstant(1, DL, VT);
56558 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56559 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56560
56561 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56562 "Unexpected condition code!");
56563 return Op0.getOperand(0);
56564 }
56565
56566 if (IsVZero1)
56567 if (SDValue V =
56568 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56569 return V;
56570 }
56571
56572 // Try and make an unsigned vector comparison signed. On pre-AVX512 targets
56573 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56574 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56575 // it's going to a mask, there are signed AVX512 comparisons).
56576 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56577 bool CanMakeSigned = false;
56578 if (ISD::isUnsignedIntSetCC(CC)) {
56579 KnownBits CmpKnown =
56580 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56581 // If we know LHS/RHS share the same sign bit at each element we can
56582 // make this signed.
56583 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56584 // across all lanes. So a pattern where the sign varies from lane to
56585 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56586 // missed. We could get around this by demanding each lane
56587 // independently, but this isn't the most important optimization and
56588 // that may eat into compile time.
56589 CanMakeSigned =
56590 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56591 }
56592 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56593 SDValue LHSOut = LHS;
56594 SDValue RHSOut = RHS;
56595 ISD::CondCode NewCC = CC;
56596 switch (CC) {
56597 case ISD::SETGE:
56598 case ISD::SETUGE:
56599 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56600 /*NSW*/ true))
56601 LHSOut = NewLHS;
56602 else if (SDValue NewRHS = incDecVectorConstant(
56603 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56604 RHSOut = NewRHS;
56605 else
56606 break;
56607
56608 [[fallthrough]];
56609 case ISD::SETUGT:
56610 NewCC = ISD::SETGT;
56611 break;
56612
56613 case ISD::SETLE:
56614 case ISD::SETULE:
56615 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56616 /*NSW*/ true))
56617 LHSOut = NewLHS;
56618 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56619 /*NSW*/ true))
56620 RHSOut = NewRHS;
56621 else
56622 break;
56623
56624 [[fallthrough]];
56625 case ISD::SETULT:
56626 // Will be swapped to SETGT in LowerVSETCC*.
56627 NewCC = ISD::SETLT;
56628 break;
56629 default:
56630 break;
56631 }
56632 if (NewCC != CC) {
56633 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56634 NewCC, DL, DAG, Subtarget))
56635 return R;
56636 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56637 }
56638 }
56639 }
56640
56641 if (SDValue R =
56642 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56643 return R;
56644
56645 // In the middle end transforms:
56646 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56647 // -> `(icmp ult (add x, -C), 2)`
56648 // Likewise inverted cases with `ugt`.
56649 //
56650 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56651 // in worse codegen. So, undo the middle-end transform and go back to `(or
56652 // (icmp eq), (icmp eq))` form.
56653 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56654 // the xmm approach.
56655 //
56656 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56657 // ne))` as it doesn't end up saving any instructions.
56658 // TODO: We might want to do this for avx512 as well if we `sext` the result.
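// Concretely, (icmp ult (add x, -C), 2) is rebuilt here as
// (or (icmp eq x, C), (icmp eq x, C+1)), which maps onto two PCMPEQs and a
// POR instead of an unsigned vector compare that pre-AVX512 x86 lacks.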
56659 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56660 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56661 !Subtarget.hasAVX512() &&
56662 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56663 Subtarget.hasAVX2()) &&
56664 LHS.hasOneUse()) {
56665
56666 APInt CmpC;
56667 SDValue AddC = LHS.getOperand(1);
56668 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56670 // See which form we have depending on the constant/condition.
56671 SDValue C0 = SDValue();
56672 SDValue C1 = SDValue();
56673
56674 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56675 // we will end up generating an additional constant. Keeping it in the
56676 // current form has a slight latency cost, but it is probably worth saving
56677 // a constant.
56680 // Pass
56681 }
56682 // Normal Cases
56683 else if ((CC == ISD::SETULT && CmpC == 2) ||
56684 (CC == ISD::SETULE && CmpC == 1)) {
56685 // These will constant fold.
56686 C0 = DAG.getNegative(AddC, DL, OpVT);
56687 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56688 DAG.getAllOnesConstant(DL, OpVT));
56689 }
56690 // Inverted Cases
56691 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56692 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56693 // These will constant fold.
56694 C0 = DAG.getNOT(DL, AddC, OpVT);
56695 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56696 DAG.getAllOnesConstant(DL, OpVT));
56697 }
56698 if (C0 && C1) {
56699 SDValue NewLHS =
56700 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56701 SDValue NewRHS =
56702 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56703 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56704 }
56705 }
56706 }
56707
56708 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56709 // to avoid scalarization via legalization because v4i32 is not a legal type.
56710 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56711 LHS.getValueType() == MVT::v4f32)
56712 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56713
56714 // X pred 0.0 --> X pred -X
56715 // If the negation of X already exists, use it in the comparison. This removes
56716 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56717 // instructions in patterns with a 'select' node.
56719 SDVTList FNegVT = DAG.getVTList(OpVT);
56720 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56721 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56722 }
56723
56724 return SDValue();
56725}
56726
56729 const X86Subtarget &Subtarget) {
56730 SDValue Src = N->getOperand(0);
56731 MVT SrcVT = Src.getSimpleValueType();
56732 MVT VT = N->getSimpleValueType(0);
56733 unsigned NumBits = VT.getScalarSizeInBits();
56734 unsigned NumElts = SrcVT.getVectorNumElements();
56735 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56736 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56737
56738 // Perform constant folding.
56739 APInt UndefElts;
56740 SmallVector<APInt, 32> EltBits;
56741 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56742 /*AllowWholeUndefs*/ true,
56743 /*AllowPartialUndefs*/ true)) {
56744 APInt Imm(32, 0);
56745 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56746 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56747 Imm.setBit(Idx);
56748
56749 return DAG.getConstant(Imm, SDLoc(N), VT);
56750 }
56751
56752 // Look through int->fp bitcasts that don't change the element width.
56753 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56754 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56755 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56756 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56757
56758 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56759 // with scalar comparisons.
56760 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56761 SDLoc DL(N);
56762 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56763 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56764 return DAG.getNode(ISD::XOR, DL, VT,
56765 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56766 DAG.getConstant(NotMask, DL, VT));
56767 }
56768
56769 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56770 // results with scalar comparisons.
56771 if (Src.getOpcode() == X86ISD::PCMPGT &&
56772 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56773 SDLoc DL(N);
56774 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56775 return DAG.getNode(ISD::XOR, DL, VT,
56776 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56777 DAG.getConstant(NotMask, DL, VT));
56778 }
56779
56780 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56781 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56782 // iff pow2splat(c1).
56783 // Use KnownBits to determine if only a single bit is non-zero
56784 // in each element (pow2 or zero), and shift that bit to the msb.
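// e.g. for vXi8 with c1 = splat(0x10) only bit 4 can be set in each element,
// so shifting left by 3 moves that bit into the sign bit where MOVMSK can
// read it.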
56785 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56786 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56787 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56788 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56789 if (KnownLHS.countMaxPopulation() == 1 &&
56790 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56791 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56792 SDLoc DL(N);
56793 MVT ShiftVT = SrcVT;
56794 SDValue ShiftLHS = Src.getOperand(0);
56795 SDValue ShiftRHS = Src.getOperand(1);
56796 if (ShiftVT.getScalarType() == MVT::i8) {
56797 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56798 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56799 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56800 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56801 }
56802 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56803 ShiftLHS, ShiftAmt, DAG);
56804 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56805 ShiftRHS, ShiftAmt, DAG);
56806 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56807 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56808 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56809 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56810 }
56811 }
56812
56813 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
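// e.g. if C has its per-element sign bit set only in lanes 0 and 2 of a
// 4-element vector, movmsk(or(X, C)) can be folded to or(movmsk(X), 0b0101).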
56814 if (N->isOnlyUserOf(Src.getNode())) {
56816 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56817 APInt UndefElts;
56818 SmallVector<APInt, 32> EltBits;
56819 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56820 UndefElts, EltBits)) {
56821 APInt Mask = APInt::getZero(NumBits);
56822 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56823 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56824 Mask.setBit(Idx);
56825 }
56826 SDLoc DL(N);
56827 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56828 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56829 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56830 DAG.getConstant(Mask, DL, VT));
56831 }
56832 }
56833 }
56834
56835 // Simplify the inputs.
56836 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56837 APInt DemandedMask(APInt::getAllOnes(NumBits));
56838 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56839 return SDValue(N, 0);
56840
56841 return SDValue();
56842}
56843
56846 const X86Subtarget &Subtarget) {
56847 MVT VT = N->getSimpleValueType(0);
56848 unsigned NumBits = VT.getScalarSizeInBits();
56849
56850 // Simplify the inputs.
56851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56852 APInt DemandedMask(APInt::getAllOnes(NumBits));
56853 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56854 return SDValue(N, 0);
56855
56856 return SDValue();
56857}
56858
56862 SDValue Mask = MemOp->getMask();
56863
56864 // With vector masks we only demand the upper bit of the mask.
56865 if (Mask.getScalarValueSizeInBits() != 1) {
56866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56867 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56868 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56869 if (N->getOpcode() != ISD::DELETED_NODE)
56870 DCI.AddToWorklist(N);
56871 return SDValue(N, 0);
56872 }
56873 }
56874
56875 return SDValue();
56876}
56877
56879 SDValue Index, SDValue Base, SDValue Scale,
56880 SelectionDAG &DAG) {
56881 SDLoc DL(GorS);
56882
56883 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56884 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56885 Gather->getMask(), Base, Index, Scale } ;
56886 return DAG.getMaskedGather(Gather->getVTList(),
56887 Gather->getMemoryVT(), DL, Ops,
56888 Gather->getMemOperand(),
56889 Gather->getIndexType(),
56890 Gather->getExtensionType());
56891 }
56892 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56893 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56894 Scatter->getMask(), Base, Index, Scale };
56895 return DAG.getMaskedScatter(Scatter->getVTList(),
56896 Scatter->getMemoryVT(), DL,
56897 Ops, Scatter->getMemOperand(),
56898 Scatter->getIndexType(),
56899 Scatter->isTruncatingStore());
56900}
56901
56904 SDLoc DL(N);
56905 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56906 SDValue Index = GorS->getIndex();
56907 SDValue Base = GorS->getBasePtr();
56908 SDValue Scale = GorS->getScale();
56909 EVT IndexVT = Index.getValueType();
56910 EVT IndexSVT = IndexVT.getVectorElementType();
56911 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56912 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56913 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56914
56915 if (DCI.isBeforeLegalize()) {
56916 // Attempt to move shifted index into the address scale, allows further
56917 // index truncation below.
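// e.g. an index of (shl X, 1) with scale 4 can become an index of X with
// scale 8 (the shift amount drops by one as the scale doubles), exposing X
// to the index narrowing below.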
56918 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56919 isa<ConstantSDNode>(Scale)) {
56920 unsigned ScaleAmt = Scale->getAsZExtVal();
56921 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56922 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56923 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56924 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56925 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56926 if (N->getOpcode() != ISD::DELETED_NODE)
56927 DCI.AddToWorklist(N);
56928 return SDValue(N, 0);
56929 }
56930 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56931 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56932 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56933 SDValue ShAmt = Index.getOperand(1);
56934 SDValue NewShAmt =
56935 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56936 DAG.getConstant(1, DL, ShAmt.getValueType()));
56937 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56938 Index.getOperand(0), NewShAmt);
56939 SDValue NewScale =
56940 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56941 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56942 }
56943 }
56944 }
56945
56946 // Shrink indices if they are larger than 32-bits.
56947 // Only do this before legalize types since v2i64 could become v2i32.
56948 // FIXME: We could check that the type is legal if we're after legalize
56949 // types, but then we would need to construct test cases where that happens.
56950 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56951 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56952
56953 // FIXME: We could support more than just constant folding, but we need to
56954 // be careful with costing. A truncate that can be optimized out would be
56955 // fine. Otherwise we might only want to create a truncate if it avoids
56956 // a split.
56957 if (SDValue TruncIndex =
56958 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56959 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56960
56961 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
56962 // there are sufficient sign bits. Only do this before legalize types to
56963 // avoid creating illegal types in truncate.
56964 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56965 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56966 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56967 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56968 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56969 }
56970
56971 // Shrink if we remove an illegal type.
56972 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56973 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56974 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56975 }
56976 }
56977 }
56978
56979 // Try to move splat adders from the index operand to the base
56980 // pointer operand, taking care to multiply by the scale. We can only do
56981 // this when the index element type is the same as the pointer type;
56982 // otherwise we would need to be sure the math doesn't wrap before the scale.
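// e.g. a gather with base B, scale 4 and index (add X, splat(10)) can be
// rewritten as base B + 40, scale 4 and index X.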
56983 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56984 isa<ConstantSDNode>(Scale)) {
56985 uint64_t ScaleAmt = Scale->getAsZExtVal();
56986
56987 for (unsigned I = 0; I != 2; ++I)
56988 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56989 BitVector UndefElts;
56990 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56991 if (UndefElts.none()) {
56992 // If the splat value is constant we can add the scaled splat value
56993 // to the existing base.
56994 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56995 APInt Adder = C->getAPIntValue() * ScaleAmt;
56996 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56997 DAG.getConstant(Adder, DL, PtrVT));
56998 SDValue NewIndex = Index.getOperand(1 - I);
56999 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57000 }
57001 // For non-constant cases, limit this to non-scaled cases.
57002 if (ScaleAmt == 1) {
57003 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
57004 SDValue NewIndex = Index.getOperand(1 - I);
57005 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57006 }
57007 }
57008 }
57009 // It's also possible base is just a constant. In that case, just
57010 // replace it with 0 and move the displacement into the index.
57011 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
57012 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
57013 // Combine the constant build_vector and the constant base.
57014 Splat =
57015 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
57016 // Add to the other half of the original Index add.
57017 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
57018 Index.getOperand(1 - I), Splat);
57019 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
57020 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57021 }
57022 }
57023 }
57024
57025 if (DCI.isBeforeLegalizeOps()) {
57026 // Make sure the index is either i32 or i64
57027 if (IndexWidth != 32 && IndexWidth != 64) {
57028 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
57029 IndexVT = IndexVT.changeVectorElementType(EltVT);
57030 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
57031 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
57032 }
57033 }
57034
57035 // With vector masks we only demand the upper bit of the mask.
57036 SDValue Mask = GorS->getMask();
57037 if (Mask.getScalarValueSizeInBits() != 1) {
57038 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57039 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57040 if (N->getOpcode() != ISD::DELETED_NODE)
57041 DCI.AddToWorklist(N);
57042 return SDValue(N, 0);
57043 }
57044 }
57045
57046 return SDValue();
57047}
57048
57049// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57051 const X86Subtarget &Subtarget) {
57052 SDLoc DL(N);
57053 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57054 SDValue EFLAGS = N->getOperand(1);
57055
57056 // Try to simplify the EFLAGS and condition code operands.
57057 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57058 return getSETCC(CC, Flags, DL, DAG);
57059
57060 return SDValue();
57061}
57062
57063/// Optimize branch condition evaluation.
57065 const X86Subtarget &Subtarget) {
57066 SDLoc DL(N);
57067 SDValue EFLAGS = N->getOperand(3);
57068 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57069
57070 // Try to simplify the EFLAGS and condition code operands.
57071 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57072 // RAUW them under us.
57073 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57074 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57075 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57076 N->getOperand(1), Cond, Flags);
57077 }
57078
57079 return SDValue();
57080}
57081
57082// TODO: Could we move this to DAGCombine?
57084 SelectionDAG &DAG) {
57085 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57086 // to optimize away operation when it's from a constant.
57087 //
57088 // The general transformation is:
57089 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57090 // AND(VECTOR_CMP(x,y), constant2)
57091 // constant2 = UNARYOP(constant)
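// e.g. (v4f32 sint_to_fp (and (vector_cmp x, y), splat(1))) can become
// (bitcast (and (vector_cmp x, y), (bitcast (splat 1.0f)))): each compare
// lane is 0 or -1, so the AND selects either the bit pattern of 1.0f or
// 0.0f, matching sint_to_fp of 1 or 0.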
57092
57093 // Early exit if this isn't a vector operation, the operand of the
57094 // unary operation isn't a bitwise AND, or if the sizes of the operations
57095 // aren't the same.
57096 EVT VT = N->getValueType(0);
57097 bool IsStrict = N->isStrictFPOpcode();
57098 unsigned NumEltBits = VT.getScalarSizeInBits();
57099 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57100 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57101 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57102 VT.getSizeInBits() != Op0.getValueSizeInBits())
57103 return SDValue();
57104
57105 // Now check that the other operand of the AND is a constant. We could
57106 // make the transformation for non-constant splats as well, but it's unclear
57107 // that would be a benefit as it would not eliminate any operations, just
57108 // perform one more step in scalar code before moving to the vector unit.
57109 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57110 // Bail out if the vector isn't a constant.
57111 if (!BV->isConstant())
57112 return SDValue();
57113
57114 // Everything checks out. Build up the new and improved node.
57115 SDLoc DL(N);
57116 EVT IntVT = BV->getValueType(0);
57117 // Create a new constant of the appropriate type for the transformed
57118 // DAG.
57119 SDValue SourceConst;
57120 if (IsStrict)
57121 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57122 {N->getOperand(0), SDValue(BV, 0)});
57123 else
57124 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57125 // The AND node needs bitcasts to/from an integer vector type around it.
57126 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57127 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57128 MaskConst);
57129 SDValue Res = DAG.getBitcast(VT, NewAnd);
57130 if (IsStrict)
57131 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57132 return Res;
57133 }
57134
57135 return SDValue();
57136}
57137
57138/// If we are converting a value to floating-point, try to replace scalar
57139/// truncate of an extracted vector element with a bitcast. This tries to keep
57140/// the sequence on XMM registers rather than moving between vector and GPRs.
57142 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57143 // to allow being called by any similar cast opcode.
57144 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57145 SDValue Trunc = N->getOperand(0);
57146 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57147 return SDValue();
57148
57149 SDValue ExtElt = Trunc.getOperand(0);
57150 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57151 !isNullConstant(ExtElt.getOperand(1)))
57152 return SDValue();
57153
57154 EVT TruncVT = Trunc.getValueType();
57155 EVT SrcVT = ExtElt.getValueType();
57156 unsigned DestWidth = TruncVT.getSizeInBits();
57157 unsigned SrcWidth = SrcVT.getSizeInBits();
57158 if (SrcWidth % DestWidth != 0)
57159 return SDValue();
57160
57161 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
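// e.g. extracting an i64 from a v2i64 X and truncating it to i32 becomes
// extracting element 0 of (bitcast X to v4i32), keeping the value in an XMM
// register instead of bouncing through a GPR.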
57162 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57163 unsigned VecWidth = SrcVecVT.getSizeInBits();
57164 unsigned NumElts = VecWidth / DestWidth;
57165 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57166 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57167 SDLoc DL(N);
57168 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57169 BitcastVec, ExtElt.getOperand(1));
57170 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57171}
57172
57174 const X86Subtarget &Subtarget) {
57175 bool IsStrict = N->isStrictFPOpcode();
57176 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57177 EVT VT = N->getValueType(0);
57178 EVT InVT = Op0.getValueType();
57179
57180 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57181 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57182 // if hasFP16 support:
57183 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57184 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57185 // else
57186 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57187 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57188 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57189 unsigned ScalarSize = InVT.getScalarSizeInBits();
57190 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57191 ScalarSize >= 64)
57192 return SDValue();
57193 SDLoc dl(N);
57194 EVT DstVT =
57195 EVT::getVectorVT(*DAG.getContext(),
57196 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57197 : ScalarSize < 32 ? MVT::i32
57198 : MVT::i64,
57199 InVT.getVectorNumElements());
57200 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57201 if (IsStrict)
57202 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57203 {N->getOperand(0), P});
57204 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57205 }
57206
57207 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57208 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57209 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57210 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57211 VT.getScalarType() != MVT::f16) {
57212 SDLoc dl(N);
57213 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57214 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57215
57216 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57217 if (IsStrict)
57218 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57219 {N->getOperand(0), P});
57220 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57221 }
57222
57223 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57224 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57225 // the optimization here.
57226 SDNodeFlags Flags = N->getFlags();
57227 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57228 if (IsStrict)
57229 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57230 {N->getOperand(0), Op0});
57231 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57232 }
57233
57234 return SDValue();
57235}
57236
57239 const X86Subtarget &Subtarget) {
57240 // First try to optimize away the conversion entirely when it's
57241 // conditionally from a constant. Vectors only.
57242 bool IsStrict = N->isStrictFPOpcode();
57244 return Res;
57245
57246 // Now move on to more general possibilities.
57247 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57248 EVT VT = N->getValueType(0);
57249 EVT InVT = Op0.getValueType();
57250
57251 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57252 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57253 // if hasFP16 support:
57254 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57255 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57256 // else
57257 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
57258 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57259 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57260 unsigned ScalarSize = InVT.getScalarSizeInBits();
57261 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57262 ScalarSize >= 64)
57263 return SDValue();
57264 SDLoc dl(N);
57265 EVT DstVT =
57266 EVT::getVectorVT(*DAG.getContext(),
57267 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57268 : ScalarSize < 32 ? MVT::i32
57269 : MVT::i64,
57270 InVT.getVectorNumElements());
57271 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57272 if (IsStrict)
57273 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57274 {N->getOperand(0), P});
57275 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57276 }
57277
57278 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57279 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57280 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57281 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57282 VT.getScalarType() != MVT::f16) {
57283 SDLoc dl(N);
57284 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57285 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57286 if (IsStrict)
57287 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57288 {N->getOperand(0), P});
57289 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57290 }
57291
57292 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57293 // vectors and scalars, see if we know that the upper bits are all the sign
57294 // bit, in which case we can truncate the input to i32 and convert from that.
57295 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57296 unsigned BitWidth = InVT.getScalarSizeInBits();
57297 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57298 if (NumSignBits >= (BitWidth - 31)) {
57299 EVT TruncVT = MVT::i32;
57300 if (InVT.isVector())
57301 TruncVT = InVT.changeVectorElementType(TruncVT);
57302 SDLoc dl(N);
57303 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57304 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57305 if (IsStrict)
57306 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57307 {N->getOperand(0), Trunc});
57308 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57309 }
57310 // If we're after legalize and the type is v2i32 we need to shuffle and
57311 // use CVTSI2P.
57312 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57313 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57314 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57315 { 0, 2, -1, -1 });
57316 if (IsStrict)
57317 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57318 {N->getOperand(0), Shuf});
57319 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57320 }
57321 }
57322
57323 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57324 // a 32-bit target where SSE doesn't support i64->FP operations.
57325 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57326 Op0.getOpcode() == ISD::LOAD) {
57327 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57328
57329 // This transformation is not supported if the result type is f16 or f128.
57330 if (VT == MVT::f16 || VT == MVT::f128)
57331 return SDValue();
57332
57333 // If we have AVX512DQ we can use packed conversion instructions unless
57334 // the VT is f80.
57335 if (Subtarget.hasDQI() && VT != MVT::f80)
57336 return SDValue();
57337
57338 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57339 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57340 std::pair<SDValue, SDValue> Tmp =
57341 Subtarget.getTargetLowering()->BuildFILD(
57342 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57343 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57344 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57345 return Tmp.first;
57346 }
57347 }
57348
57349 if (IsStrict)
57350 return SDValue();
57351
57352 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57353 return V;
57354
57355 return SDValue();
57356}
57357
57359 const X86Subtarget &Subtarget) {
57360 EVT VT = N->getValueType(0);
57361 SDValue Src = N->getOperand(0);
57362 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57363 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57364 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57365
57366 return SDValue();
57367}
57368
57369// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57371 const X86Subtarget &Subtarget) {
57372 if (!Subtarget.hasAVX10_2())
57373 return SDValue();
57374
57375 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57376 EVT SrcVT = N->getOperand(0).getValueType();
57377 EVT DstVT = N->getValueType(0);
57378 SDLoc dl(N);
57379
57380 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57381 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57382
57383 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57384 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57385 N->getOperand(0), V2F32Value);
57386
57387 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57388 if (IsSigned)
57389 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57390
57391 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57392 }
57393 return SDValue();
57394}
57395
57397 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57398
57399 for (const SDNode *User : Flags->users()) {
57400 X86::CondCode CC;
57401 switch (User->getOpcode()) {
57402 default:
57403 // Be conservative.
57404 return true;
57405 case X86ISD::SETCC:
57407 CC = (X86::CondCode)User->getConstantOperandVal(0);
57408 break;
57409 case X86ISD::BRCOND:
57410 case X86ISD::CMOV:
57411 CC = (X86::CondCode)User->getConstantOperandVal(2);
57412 break;
57413 }
57414
57415 switch (CC) {
57416 // clang-format off
57417 default: break;
57418 case X86::COND_A: case X86::COND_AE:
57419 case X86::COND_B: case X86::COND_BE:
57420 case X86::COND_O: case X86::COND_NO:
57421 case X86::COND_G: case X86::COND_GE:
57422 case X86::COND_L: case X86::COND_LE:
57423 return true;
57424 // clang-format on
57425 }
57426 }
57427
57428 return false;
57429}
57430
57431static bool onlyZeroFlagUsed(SDValue Flags) {
57432 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57433
57434 for (const SDNode *User : Flags->users()) {
57435 unsigned CCOpNo;
57436 switch (User->getOpcode()) {
57437 default:
57438 // Be conservative.
57439 return false;
57440 case X86ISD::SETCC:
57442 CCOpNo = 0;
57443 break;
57444 case X86ISD::BRCOND:
57445 case X86ISD::CMOV:
57446 CCOpNo = 2;
57447 break;
57448 }
57449
57450 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57451 if (CC != X86::COND_E && CC != X86::COND_NE)
57452 return false;
57453 }
57454
57455 return true;
57456}
57457
57460 const X86Subtarget &Subtarget) {
57461 // Only handle test patterns.
57462 if (!isNullConstant(N->getOperand(1)))
57463 return SDValue();
57464
57465 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57466 // and use its flags directly.
57467 // TODO: Maybe we should try promoting compares that only use the zero flag
57468 // first if we can prove the upper bits with computeKnownBits?
57469 SDLoc dl(N);
57470 SDValue Op = N->getOperand(0);
57471 EVT VT = Op.getValueType();
57472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57473
57474 if (SDValue CMP =
57475 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57476 return CMP;
57477
57478 // If we have a constant logical shift that's only used in a comparison
57479 // against zero turn it into an equivalent AND. This allows turning it into
57480 // a TEST instruction later.
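// e.g. for a 32-bit X, (cmp (srl X, 8), 0) only depends on bits 31..8 of X,
// so it can be rewritten as (cmp (and X, 0xffffff00), 0), which isel can then
// fold into a single TEST.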
57481 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57482 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57483 onlyZeroFlagUsed(SDValue(N, 0))) {
57484 unsigned BitWidth = VT.getSizeInBits();
57485 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57486 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57487 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57488 APInt Mask = Op.getOpcode() == ISD::SRL
57489 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57490 : APInt::getLowBitsSet(BitWidth, MaskBits);
57491 if (Mask.isSignedIntN(32)) {
57492 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57493 DAG.getConstant(Mask, dl, VT));
57494 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57495 DAG.getConstant(0, dl, VT));
57496 }
57497 }
57498 }
57499
57500 // If we're extracting from an AVX512 bool vector and comparing against zero,
57501 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57502 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57503 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57504 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57505 SDValue Src = Op.getOperand(0);
57506 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57507 isNullConstant(Src.getOperand(1)) &&
57508 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57509 SDValue BoolVec = Src.getOperand(0);
57510 unsigned ShAmt = 0;
57511 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57512 ShAmt = BoolVec.getConstantOperandVal(1);
57513 BoolVec = BoolVec.getOperand(0);
57514 }
57515 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57516 EVT VecVT = BoolVec.getValueType();
57517 unsigned BitWidth = VecVT.getVectorNumElements();
57518 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57519 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57520 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57521 Op = DAG.getBitcast(BCVT, BoolVec);
57522 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57523 DAG.getConstant(Mask, dl, BCVT));
57524 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57525 DAG.getConstant(0, dl, BCVT));
57526 }
57527 }
57528 }
57529
57530 // Peek through any zero-extend if we're only testing for a zero result.
57531 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57532 SDValue Src = Op.getOperand(0);
57533 EVT SrcVT = Src.getValueType();
57534 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57535 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57536 DAG.getConstant(0, dl, SrcVT));
57537 }
57538
57539 // Look for a truncate.
57540 if (Op.getOpcode() != ISD::TRUNCATE)
57541 return SDValue();
57542
57543 SDValue Trunc = Op;
57544 Op = Op.getOperand(0);
57545
57546 // See if we can compare with zero against the truncation source,
57547 // which should help using the Z flag from many ops. Only do this for
57548 // i32 truncated op to prevent partial-reg compares of promoted ops.
57549 EVT OpVT = Op.getValueType();
57550 APInt UpperBits =
57552 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57553 onlyZeroFlagUsed(SDValue(N, 0))) {
57554 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57555 DAG.getConstant(0, dl, OpVT));
57556 }
57557
57558 // After this the truncate and arithmetic op must have a single use.
57559 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57560 return SDValue();
57561
57562 unsigned NewOpc;
57563 switch (Op.getOpcode()) {
57564 default: return SDValue();
57565 case ISD::AND:
57566 // Skip 'and' with a constant. We have special handling for 'and' with an
57567 // immediate during isel to generate test instructions.
57568 if (isa<ConstantSDNode>(Op.getOperand(1)))
57569 return SDValue();
57570 NewOpc = X86ISD::AND;
57571 break;
57572 case ISD::OR: NewOpc = X86ISD::OR; break;
57573 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57574 case ISD::ADD:
57575 // If the carry or overflow flag is used, we can't truncate.
57577 return SDValue();
57578 NewOpc = X86ISD::ADD;
57579 break;
57580 case ISD::SUB:
57581 // If the carry or overflow flag is used, we can't truncate.
57583 return SDValue();
57584 NewOpc = X86ISD::SUB;
57585 break;
57586 }
57587
57588 // We found an op we can narrow. Truncate its inputs.
57589 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57590 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57591
57592 // Use a X86 specific opcode to avoid DAG combine messing with it.
57593 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57594 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57595
57596 // For AND, keep a CMP so that we can match the test pattern.
57597 if (NewOpc == X86ISD::AND)
57598 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57599 DAG.getConstant(0, dl, VT));
57600
57601 // Return the flags.
57602 return Op.getValue(1);
57603}
57604
57607 const X86Subtarget &ST) {
57608 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57609 "Expected X86ISD::ADD or X86ISD::SUB");
57610
57611 SDLoc DL(N);
57612 SDValue LHS = N->getOperand(0);
57613 SDValue RHS = N->getOperand(1);
57614 MVT VT = LHS.getSimpleValueType();
57615 bool IsSub = X86ISD::SUB == N->getOpcode();
57616 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57617
57618 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57619 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57620 return CMP;
57621
57622 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57623 if (!N->hasAnyUseOfValue(1)) {
57624 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57625 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57626 }
57627
57628 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57629 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57630 SDValue Ops[] = {N0, N1};
57631 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57632 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57633 SDValue Op(N, 0);
57634 if (Negate) {
57635 // Bail if this is only used by a user of the x86 add/sub.
57636 if (GenericAddSub->hasOneUse() &&
57637 GenericAddSub->user_begin()->isOnlyUserOf(N))
57638 return;
57639 Op = DAG.getNegative(Op, DL, VT);
57640 }
57641 DCI.CombineTo(GenericAddSub, Op);
57642 }
57643 };
57644 MatchGeneric(LHS, RHS, false);
57645 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57646
57647 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57648 // EFLAGS result doesn't change.
57649 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57650 /*ZeroSecondOpOnly*/ true);
57651}
57652
57654 SDValue LHS = N->getOperand(0);
57655 SDValue RHS = N->getOperand(1);
57656 SDValue BorrowIn = N->getOperand(2);
57657
57658 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57659 MVT VT = N->getSimpleValueType(0);
57660 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57661 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57662 }
57663
57664 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57665 // iff the flag result is dead.
57666 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57667 !N->hasAnyUseOfValue(1))
57668 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57669 LHS.getOperand(1), BorrowIn);
57670
57671 return SDValue();
57672}
57673
57674// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57677 SDValue LHS = N->getOperand(0);
57678 SDValue RHS = N->getOperand(1);
57679 SDValue CarryIn = N->getOperand(2);
57680 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57681 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57682
57683 // Canonicalize constant to RHS.
57684 if (LHSC && !RHSC)
57685 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57686 CarryIn);
57687
57688 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57689 // the result is either zero or one (depending on the input carry bit).
57690 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57691 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57692 // We don't have a good way to replace an EFLAGS use, so only do this when
57693 // dead right now.
57694 SDValue(N, 1).use_empty()) {
57695 SDLoc DL(N);
57696 EVT VT = N->getValueType(0);
57697 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57698 SDValue Res1 = DAG.getNode(
57699 ISD::AND, DL, VT,
57701 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57702 DAG.getConstant(1, DL, VT));
57703 return DCI.CombineTo(N, Res1, CarryOut);
57704 }
57705
57706 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57707 // iff the flag result is dead.
57708 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57709 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57710 SDLoc DL(N);
57711 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57712 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57713 DAG.getConstant(0, DL, LHS.getValueType()),
57714 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57715 }
57716
57717 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57718 MVT VT = N->getSimpleValueType(0);
57719 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57720 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57721 }
57722
57723 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57724 // iff the flag result is dead.
57725 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57726 !N->hasAnyUseOfValue(1))
57727 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57728 LHS.getOperand(1), CarryIn);
57729
57730 return SDValue();
57731}
57732
57734 const SDLoc &DL, EVT VT,
57735 const X86Subtarget &Subtarget) {
57736 using namespace SDPatternMatch;
57737
57738 // Example of pattern we try to detect:
57739 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57740 //(add (build_vector (extract_elt t, 0),
57741 // (extract_elt t, 2),
57742 // (extract_elt t, 4),
57743 // (extract_elt t, 6)),
57744 // (build_vector (extract_elt t, 1),
57745 // (extract_elt t, 3),
57746 // (extract_elt t, 5),
57747 // (extract_elt t, 7)))
57748
57749 if (!Subtarget.hasSSE2())
57750 return SDValue();
57751
57752 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57753 VT.getVectorNumElements() < 4 ||
57755 return SDValue();
57756
57757 SDValue Op0, Op1, Accum;
57762 m_Value(Op1))))))
57763 return SDValue();
57764
57765 // Check if one of Op0,Op1 is of the form:
57766 // (build_vector (extract_elt Mul, 0),
57767 // (extract_elt Mul, 2),
57768 // (extract_elt Mul, 4),
57769 // ...
57770 // the other is of the form:
57771 // (build_vector (extract_elt Mul, 1),
57772 // (extract_elt Mul, 3),
57773 // (extract_elt Mul, 5),
57774 // ...
57775 // and identify Mul.
57776 SDValue Mul;
57777 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57778 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57779 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57780 // TODO: Be more tolerant to undefs.
57781 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57782 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57783 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57784 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57785 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57786 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57787 return SDValue();
57788 // Commutativity of mul allows factors of a product to reorder.
57789 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57790 std::swap(Idx0L, Idx1L);
57791 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57792 std::swap(Idx0H, Idx1H);
57793 // Commutativity of add allows pairs of factors to reorder.
57794 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57795 std::swap(Idx0L, Idx0H);
57796 std::swap(Idx1L, Idx1H);
57797 }
57798 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57799 Idx1H != 2 * i + 3)
57800 return SDValue();
57801 if (!Mul) {
57802 // First time an extract_elt's source vector is visited. Must be a MUL
57803 // with 2X the number of vector elements of the BUILD_VECTOR.
57804 // Both extracts must be from the same MUL.
57805 Mul = Vec0L;
57806 if (Mul.getOpcode() != ISD::MUL ||
57807 Mul.getValueType().getVectorNumElements() != 2 * e)
57808 return SDValue();
57809 }
57810 // Check that the extract is from the same MUL previously seen.
57811 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57812 return SDValue();
57813 }
57814
57815 // Check if the Mul source can be safely shrunk.
57817 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57819 return SDValue();
57820
57821 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57822 VT.getVectorNumElements() * 2);
57823 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57824 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57825
57826 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57828 EVT InVT = Ops[0].getValueType();
57829 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57830 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57831 InVT.getVectorNumElements() / 2);
57832 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57833 };
57834 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57835 if (Accum)
57836 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57837 return R;
57838}
57839
57840// Attempt to turn this pattern into PMADDWD.
57841// (add (mul (sext (build_vector)), (sext (build_vector))),
57842// (mul (sext (build_vector)), (sext (build_vector)))
57844 const SDLoc &DL, EVT VT,
57845 const X86Subtarget &Subtarget) {
57846 using namespace SDPatternMatch;
57847
57848 if (!Subtarget.hasSSE2())
57849 return SDValue();
57850
57851 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57852 VT.getVectorNumElements() < 4 ||
57854 return SDValue();
57855
57856 // All inputs need to be sign extends.
57857 // TODO: Support ZERO_EXTEND from known positive?
57858 SDValue N00, N01, N10, N11;
57859 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57860 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57861 return SDValue();
57862
57863 // Must be extending from vXi16.
57864 EVT InVT = N00.getValueType();
57865 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57866 N10.getValueType() != InVT || N11.getValueType() != InVT)
57867 return SDValue();
57868
57869 // All inputs should be build_vectors.
57870 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57871 N01.getOpcode() != ISD::BUILD_VECTOR ||
57872 N10.getOpcode() != ISD::BUILD_VECTOR ||
57874 return SDValue();
57875
57876 // For each element, we need to ensure we have an odd element from one vector
57877 // multiplied by the odd element of another vector and the even element from
57878 // one of the same vectors being multiplied by the even element from the
57879 // other vector. So we need to make sure for each element i, this operator
57880 // is being performed:
57881 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
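// which is exactly the per-pair multiply-accumulate that PMADDWD performs on
// vXi16 inputs, e.g. lanes {0,1} of A and B feed result lane 0, lanes {2,3}
// feed result lane 1, and so on.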
57882 SDValue In0, In1;
57883 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57884 SDValue N00Elt = N00.getOperand(i);
57885 SDValue N01Elt = N01.getOperand(i);
57886 SDValue N10Elt = N10.getOperand(i);
57887 SDValue N11Elt = N11.getOperand(i);
57888 // TODO: Be more tolerant to undefs.
57889 SDValue N00In, N01In, N10In, N11In;
57890 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57891 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57892 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57893 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57894 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57895 return SDValue();
57896 // Add is commutative so indices can be reordered.
57897 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57898 std::swap(IdxN00, IdxN10);
57899 std::swap(IdxN01, IdxN11);
57900 }
57901 // N0 indices must be the even element. N1 indices must be the next odd element.
57902 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57903 IdxN11 != 2 * i + 1)
57904 return SDValue();
57905
57906 // First time we find an input capture it.
57907 if (!In0) {
57908 In0 = N00In;
57909 In1 = N01In;
57910
57911 // The input vectors must be at least as wide as the output.
57912 // If they are larger than the output, we extract subvector below.
57913 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57914 In1.getValueSizeInBits() < VT.getSizeInBits())
57915 return SDValue();
57916 }
57917 // Mul is commutative so the input vectors can be in any order.
57918 // Canonicalize to make the compares easier.
57919 if (In0 != N00In)
57920 std::swap(N00In, N01In);
57921 if (In0 != N10In)
57922 std::swap(N10In, N11In);
57923 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57924 return SDValue();
57925 }
57926
57927 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57929 EVT OpVT = Ops[0].getValueType();
57930 assert(OpVT.getScalarType() == MVT::i16 &&
57931 "Unexpected scalar element type");
57932 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57933 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57934 OpVT.getVectorNumElements() / 2);
57935 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57936 };
57937
57938 // If the output is narrower than an input, extract the low part of the input
57939 // vector.
57940 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57941 VT.getVectorNumElements() * 2);
57942 if (OutVT16.bitsLT(In0.getValueType())) {
57943 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57944 DAG.getVectorIdxConstant(0, DL));
57945 }
57946 if (OutVT16.bitsLT(In1.getValueType())) {
57947 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57948 DAG.getVectorIdxConstant(0, DL));
57949 }
57950 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57951 PMADDBuilder);
57952}
57953
57954// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57955 // If the upper element in each pair of both VPMADDWD operands is zero then we
57956 // can merge the operand elements and use the implicit add of VPMADDWD.
57957// TODO: Add support for VPMADDUBSW (which isn't commutable).
57959 const SDLoc &DL, EVT VT) {
57960 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57961 return SDValue();
57962
57963 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57964 if (VT.getSizeInBits() > 128)
57965 return SDValue();
57966
57967 unsigned NumElts = VT.getVectorNumElements();
57968 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57970 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57971
57972 bool Op0HiZero =
57973 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57974 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57975 bool Op1HiZero =
57976 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57977 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57978
57979 // TODO: Check for zero lower elements once we have actual codegen that
57980 // creates them.
57981 if (!Op0HiZero || !Op1HiZero)
57982 return SDValue();
57983
57984 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57985 SmallVector<int> Mask;
57986 for (int i = 0; i != (int)NumElts; ++i) {
57987 Mask.push_back(2 * i);
57988 Mask.push_back(2 * (i + NumElts));
57989 }
57990
57991 SDValue LHS =
57992 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57993 SDValue RHS =
57994 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57995 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57996}
57997
57998/// CMOV of constants requires materializing constant operands in registers.
57999/// Try to fold those constants into an 'add' instruction to reduce instruction
58000 /// count. We do this with CMOV rather than the generic 'select' because there are
58001/// earlier folds that may be used to turn select-of-constants into logic hacks.
58003 SelectionDAG &DAG,
58004 const X86Subtarget &Subtarget) {
58005 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
58006 // better because we eliminate 1-2 instructions. This transform is still
58007 // an improvement without zero operands because we trade 2 move constants and
58008 // 1 add for 2 adds (LEA) as long as the constants can be represented as
58009 // immediate asm operands (fit in 32-bits).
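// e.g. add (cmov 5, 7), X can become cmov (add X, 5), (add X, 7), where both
// adds can be emitted as 2-operand LEAs and the two constant moves disappear.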
58010 auto isSuitableCmov = [](SDValue V) {
58011 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
58012 return false;
58013 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
58014 !isa<ConstantSDNode>(V.getOperand(1)))
58015 return false;
58016 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
58017 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
58018 V.getConstantOperandAPInt(1).isSignedIntN(32));
58019 };
58020
58021 // Match an appropriate CMOV as the first operand of the add.
58022 SDValue Cmov = N->getOperand(0);
58023 SDValue OtherOp = N->getOperand(1);
58024 if (!isSuitableCmov(Cmov))
58025 std::swap(Cmov, OtherOp);
58026 if (!isSuitableCmov(Cmov))
58027 return SDValue();
58028
58029 // Don't remove a load folding opportunity for the add. That would neutralize
58030 // any improvements from removing constant materializations.
58031 if (X86::mayFoldLoad(OtherOp, Subtarget))
58032 return SDValue();
58033
58034 EVT VT = N->getValueType(0);
58035 SDValue FalseOp = Cmov.getOperand(0);
58036 SDValue TrueOp = Cmov.getOperand(1);
58037
58038 // We will push the add through the select, but we can potentially do better
58039 // if we know there is another add in the sequence and this is pointer math.
58040 // In that case, we can absorb an add into the trailing memory op and avoid
58041 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58042 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58043 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58044 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58045 all_of(N->users(), [&](SDNode *Use) {
58046 auto *MemNode = dyn_cast<MemSDNode>(Use);
58047 return MemNode && MemNode->getBasePtr().getNode() == N;
58048 })) {
58049 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58050 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58051 // it is possible that choosing op1 might be better.
58052 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58053 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58054 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58055 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58056 Cmov.getOperand(2), Cmov.getOperand(3));
58057 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58058 }
58059
58060 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58061 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58062 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58063 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58064 Cmov.getOperand(3));
58065}
58066
58067 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58068// When upper 12 bits of x, y and MUL(x, y) are known to be 0
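// e.g. if X and Y are 64-bit lanes zero-extended from at most 26-bit values,
// the full product fits in 52 bits, so VPMADD52L's low-52-bit result equals
// the 64-bit multiply and the accumulator add is folded into it.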
58070 EVT VT, const X86Subtarget &Subtarget) {
58071 using namespace SDPatternMatch;
58072 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58073 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58074 return SDValue();
58075
58076 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58077 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58078 VT.getSizeInBits() < 512)
58079 return SDValue();
58080
58081 const auto TotalSize = VT.getSizeInBits();
58082 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58083 return SDValue();
58084
58085 SDValue X, Y, Acc;
58086 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58087 return SDValue();
58088
58089 KnownBits KnownX = DAG.computeKnownBits(X);
58090 if (KnownX.countMinLeadingZeros() < 12)
58091 return SDValue();
58092 KnownBits KnownY = DAG.computeKnownBits(Y);
58093 if (KnownY.countMinLeadingZeros() < 12)
58094 return SDValue();
58095 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58096 if (KnownMul.countMinLeadingZeros() < 12)
58097 return SDValue();
58098
58099 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58100 ArrayRef<SDValue> SubOps) {
58101 EVT SubVT = SubOps[0].getValueType();
58102 assert(SubVT.getScalarSizeInBits() == 64 &&
58103 "Unexpected element size, only supports 64bit size");
58104 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58105 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58106 };
58107
58108 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58109 /*CheckBWI*/ false,
58110 /*AllowAVX512*/ Subtarget.hasIFMA());
58111}
58112
58115 const X86Subtarget &Subtarget) {
58116 using namespace SDPatternMatch;
58117 EVT VT = N->getValueType(0);
58118 SDValue Op0 = N->getOperand(0);
58119 SDValue Op1 = N->getOperand(1);
58120 SDLoc DL(N);
58121
58122 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58123 return Select;
58124
58125 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58126 return MAdd;
58127 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58128 return MAdd;
58129 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58130 return MAdd;
58131
58132 // Try to synthesize horizontal adds from adds of shuffles.
58133 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58134 return V;
58135
58136 // Prefer VSHLI to reduce uses; X86FixupInstTunings may revert this depending
58137 // on the scheduler model. Limit multiple users to AVX+ targets to prevent
58138 // introducing extra register moves.
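// Illustrative: for add %v, %v this emits a shift-left-by-one (e.g. vpslld $1
// for i32 elements), which reads a single source register instead of two uses
// of %v.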
58139 if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL))
58140 if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()))
58142 Op0, 1, DAG);
58143
58144 // Canonicalize hidden LEA pattern:
58145 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58146 // iff c < 4
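// Illustrative (assumed operands): add (sub (shl %x, 2), %y), %z
//   --> sub (add (shl %x, 2), %z), %y
// The shl+add pair now matches a single LEA (base %z, index %x, scale 4),
// leaving one plain SUB instead of hiding the LEA behind the SUB.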
58147 if (VT == MVT::i32 || VT == MVT::i64) {
58148 SDValue Y, Z, Shift;
58149 APInt Amt;
58150 if (sd_match(
58152 m_Shl(m_Value(), m_ConstInt(Amt))),
58153 m_Value(Y))),
58154 m_Value(Z))) &&
58155 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58156 return DAG.getNode(ISD::SUB, DL, VT,
58157 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58158 }
58159 }
58160
58161 SDValue X, Y;
58162
58163 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58164 // iff X and Y won't overflow.
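// Illustrative: if every byte of X and Y is at most 127, the byte-wise add
// cannot wrap, so the per-group sums satisfy
//   psadbw(X,0) + psadbw(Y,0) == psadbw(add(X,Y),0).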
58165 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58167 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58168 MVT OpVT = X.getSimpleValueType();
58169 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58170 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58171 getZeroVector(OpVT, Subtarget, DAG, DL));
58172 }
58173
58174 if (VT.isVector()) {
58175 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58177
58178 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58179 // (sub Y, (sext (vXi1 X))).
58180 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58181 // in generic DAG combine without a legal type check, but adding this there
58182 // caused regressions.
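// Illustrative: per lane, zext(i1 X) is 0/+1 while sext(i1 X) is 0/-1, so
//   add (zext X), Y == sub Y, (sext X),
// which is the form used here once vXi1 is a legal type.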
58183 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58185 m_Value(Y)))) {
58186 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58187 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58188 }
58189
58190 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58191 // canonicalisation as we don't have good vXi8 shifts.
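// Illustrative: for i8 lanes, (srl Y, 7) is 1 exactly when Y is negative,
// which equals -(sext (setgt 0, Y)), so
//   add X, (srl Y, 7) == sub X, (sext (setgt 0, Y)).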
58192 if (VT.getScalarType() == MVT::i8 &&
58194 SDValue Cmp =
58195 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58196 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58197 }
58198 }
58199
58200 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58201 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58202 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58203 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58204 if (sd_match(N, m_Add(m_Value(Accum),
58207 m_Value(Lo1)),
58209 m_Value(Hi1)))))) {
58210 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58211 concatSubVectors(Lo0, Hi0, DAG, DL),
58212 concatSubVectors(Lo1, Hi1, DAG, DL));
58213 }
58214 }
58215
58216 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58217 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58218 X86::isZeroNode(Op0.getOperand(1))) {
58219 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58220 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58221 Op0.getOperand(0), Op0.getOperand(2));
58222 }
58223
58224 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58225 return IFMA52;
58226
58227 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58228}
58229
58230// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58231// condition comes from the subtract node that produced -X. This matches the
58232// cmov expansion for absolute value. By swapping the operands we convert abs
58233// to nabs.
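// Illustrative sketch: abs(X) is expanded as NegX = sub 0, X followed by a
// cmovns on the flags of that sub. Swapping the cmov arms yields -abs(X), so
// sub Y, abs(X) becomes add Y, -abs(X) without any extra instructions.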
58234static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58235 SelectionDAG &DAG) {
58236 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58237 return SDValue();
58238
58239 SDValue Cond = N1.getOperand(3);
58240 if (Cond.getOpcode() != X86ISD::SUB)
58241 return SDValue();
58242 assert(Cond.getResNo() == 1 && "Unexpected result number");
58243
58244 SDValue FalseOp = N1.getOperand(0);
58245 SDValue TrueOp = N1.getOperand(1);
58247
58248 // ABS condition should come from a negate operation.
58249 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58250 isNullConstant(Cond.getOperand(0))) {
58251 // Get the X and -X from the negate.
58252 SDValue NegX = Cond.getValue(0);
58253 SDValue X = Cond.getOperand(1);
58254
58255 // Cmov operands should be X and NegX. Order doesn't matter.
58256 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58257 return SDValue();
58258
58259 // Build a new CMOV with the operands swapped.
58260 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58261 N1.getOperand(2), Cond);
58262 // Convert sub to add.
58263 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58264 }
58265
58266 // Handle ABD special case:
58267 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58268 // ABD condition should come from a pair of matching subtracts.
58269 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58270 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58271 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58272 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58273 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58274 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58275 // Build a new CMOV with the operands swapped.
58276 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58277 Cond);
58278 }
58279
58280 return SDValue();
58281}
58282
58284 SDValue Op0 = N->getOperand(0);
58285 SDValue Op1 = N->getOperand(1);
58286
58287 // (sub C (zero_extend (setcc)))
58288 // =>
58289 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58290 // Don't disturb (sub 0 setcc), which is easily done with neg.
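// Illustrative: zext(setcc) is 0 or 1 and zext(setcc inverted) is its
// complement, so C - zext(setcc) == zext(setcc inverted) + (C - 1).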
58291 EVT VT = N->getValueType(0);
58292 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58293 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58294 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58295 Op1.getOperand(0).hasOneUse()) {
58296 SDValue SetCC = Op1.getOperand(0);
58299 APInt NewImm = Op0C->getAPIntValue() - 1;
58300 SDLoc DL(Op1);
58301 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58302 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58303 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58304 DAG.getConstant(NewImm, DL, VT));
58305 }
58306
58307 return SDValue();
58308}
58309
58311 if (N->getConstantOperandVal(3) != X86::COND_NE)
58312 return SDValue();
58313
58314 SDValue Sub = N->getOperand(4);
58315 if (Sub.getOpcode() != X86ISD::SUB)
58316 return SDValue();
58317
58318 SDValue Op1 = Sub.getOperand(1);
58319
58320 if (!X86::isZeroNode(Sub.getOperand(0)))
58321 return SDValue();
58322
58323 SDLoc DL(N);
58324 SmallVector<SDValue, 5> Ops(N->op_values());
58325 if (Op1.getOpcode() == X86ISD::SETCC) {
58326 // res, flags2 = sub 0, (setcc cc, flag)
58327 // cload/cstore ..., cond_ne, flag2
58328 // ->
58329 // cload/cstore cc, flag
58330 Ops[3] = Op1.getOperand(0);
58331 Ops[4] = Op1.getOperand(1);
58332 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58333 SDValue Src = Op1;
58334 SDValue Op10 = Op1.getOperand(0);
58335 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58336 // res, flags2 = sub 0, (and (xor X, -1), Y)
58337 // cload/cstore ..., cond_ne, flag2
58338 // ->
58339 // res, flags2 = sub 0, (and X, Y)
58340 // cload/cstore ..., cond_e, flag2
58341 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58342 Op1.getOperand(1));
58343 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58344 }
58345 // res, flags2 = sub 0, (and X, Y)
58346 // cload/cstore ..., cc, flag2
58347 // ->
58348 // res, flags2 = cmp (and X, Y), 0
58349 // cload/cstore ..., cc, flag2
58350 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58351 } else {
58352 return SDValue();
58353 }
58354
58355 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58356 cast<MemSDNode>(N)->getMemoryVT(),
58357 cast<MemSDNode>(N)->getMemOperand());
58358}
58359
58362 const X86Subtarget &Subtarget) {
58363 EVT VT = N->getValueType(0);
58364 SDValue Op0 = N->getOperand(0);
58365 SDValue Op1 = N->getOperand(1);
58366 SDLoc DL(N);
58367
58368 auto IsNonOpaqueConstant = [&](SDValue Op) {
58370 /*AllowOpaques*/ false);
58371 };
58372
58373 // X86 can't encode an immediate LHS of a sub. See if we can push the
58374 // negation into a preceding instruction. If the RHS of the sub is an XOR with
58375 // one use and a constant, invert the immediate, saving one register.
58376 // However, ignore cases where C1 is 0, as those will become a NEG.
58377 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
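// Identity check (illustrative): C1 - (X ^ C2) == C1 + ~(X ^ C2) + 1
//                                              == (X ^ ~C2) + (C1 + 1),
// so the xor constant is inverted and the sub immediate moves into an add.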
58378 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58379 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58380 Op1->hasOneUse()) {
58381 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58382 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58383 SDValue NewAdd =
58384 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58385 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58386 }
58387
58388 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58389 return V;
58390
58391 // Try to synthesize horizontal subs from subs of shuffles.
58392 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58393 return V;
58394
58395 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58396 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58397 X86::isZeroNode(Op1.getOperand(1))) {
58398 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58399 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58400 Op1.getOperand(0), Op1.getOperand(2));
58401 }
58402
58403 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58404 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
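// Identity (illustrative): X - (Y - Z - W) == (X + Z + W) - Y, i.e. an
// ADC(X, Z, W) followed by a plain SUB of Y.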
58405 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58406 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58407 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58408 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58409 Op1.getOperand(1), Op1.getOperand(2));
58410 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58411 }
58412
58413 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58414 return V;
58415
58416 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58417 return V;
58418
58419 return combineSubSetcc(N, DAG);
58420}
58421
58423 const X86Subtarget &Subtarget) {
58424 unsigned Opcode = N->getOpcode();
58425 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58426 "Unknown PCMP opcode");
58427
58428 SDValue LHS = N->getOperand(0);
58429 SDValue RHS = N->getOperand(1);
58430 MVT VT = N->getSimpleValueType(0);
58431 unsigned EltBits = VT.getScalarSizeInBits();
58432 unsigned NumElts = VT.getVectorNumElements();
58433 SDLoc DL(N);
58434
58435 if (LHS == RHS)
58436 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58437 : DAG.getConstant(0, DL, VT);
58438
58439 // Constant Folding.
58440 // PCMPEQ(X,UNDEF) -> UNDEF
58441 // PCMPGT(X,UNDEF) -> 0
58442 // PCMPGT(UNDEF,X) -> 0
58443 APInt LHSUndefs, RHSUndefs;
58444 SmallVector<APInt> LHSBits, RHSBits;
58445 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58446 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58447 APInt Ones = APInt::getAllOnes(EltBits);
58448 APInt Zero = APInt::getZero(EltBits);
58449 SmallVector<APInt> Results(NumElts);
58450 for (unsigned I = 0; I != NumElts; ++I) {
58451 if (Opcode == X86ISD::PCMPEQ) {
58452 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58453 } else {
58454 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58455 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58456 }
58457 }
58458 if (Opcode == X86ISD::PCMPEQ)
58459 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58460 return getConstVector(Results, VT, DAG, DL);
58461 }
58462
58463 return SDValue();
58464}
58465
58466// Helper to determine if we can convert an integer comparison to a float
58467 // comparison by casting the operands.
58468static std::optional<unsigned>
58469CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58470 unsigned NumSignificantBitsRHS) {
58471 MVT SVT = VT.getScalarType();
58472 assert(SVT == MVT::f32 && "Only tested for float so far");
58473 const fltSemantics &Sem = SVT.getFltSemantics();
58474 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58475 "Only PCMPEQ/PCMPGT currently supported");
58476
58477 // TODO: Handle bitcastable integers.
58478
58479 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58480 // a fp value.
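// Illustrative: f32 carries 24 bits of precision, so i32 operands known to
// have at most 24 significant bits round-trip exactly through SINT_TO_FP and
// the integer EQ/GT compare can be done as an ordered float compare.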
58481 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58482 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58483 return ISD::SINT_TO_FP;
58484
58485 return std::nullopt;
58486}
58487
58488/// Helper that combines an array of subvector ops as if they were the operands
58489 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58490/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58493 const X86Subtarget &Subtarget,
58494 unsigned Depth) {
58495 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58496 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58497
58498 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58499 return DAG.getUNDEF(VT);
58500
58501 if (llvm::all_of(Ops, [](SDValue Op) {
58502 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58503 }))
58504 return getZeroVector(VT, Subtarget, DAG, DL);
58505
58507 return SDValue(); // Limit search depth.
58508
58509 SDValue Op0 = Ops[0];
58510 bool IsSplat = llvm::all_equal(Ops);
58511 unsigned NumOps = Ops.size();
58512 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58513 LLVMContext &Ctx = *DAG.getContext();
58514
58515 // Repeated subvectors.
58516 if (IsSplat &&
58517 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58518 // If this broadcast is inserted into both halves, use a larger broadcast.
58519 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58520 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58521
58522 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58523 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58524 (Subtarget.hasAVX2() ||
58526 VT.getScalarType(), Subtarget)))
58527 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58528 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58529 Op0.getOperand(0),
58530 DAG.getVectorIdxConstant(0, DL)));
58531
58532 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58533 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58534 (Subtarget.hasAVX2() ||
58535 (EltSizeInBits >= 32 &&
58536 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58537 Op0.getOperand(0).getValueType() == VT.getScalarType())
58538 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58539
58540 // concat_vectors(extract_subvector(splat(x)),
58541 // extract_subvector(splat(x))) -> splat(x)
58542 // concat_vectors(extract_subvector(subv_broadcast(x)),
58543 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58544 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58545 Op0.getOperand(0).getValueType() == VT) {
58546 SDValue SrcVec = Op0.getOperand(0);
58547 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58548 return SrcVec;
58549 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58550 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58551 return SrcVec;
58552 }
58553
58554 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58555 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58556 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58557 return DAG.getNode(Op0.getOpcode(), DL, VT,
58559 Op0.getOperand(0), Op0.getOperand(0)),
58560 Op0.getOperand(1));
58561 }
58562
58563 // TODO: This should go in combineX86ShufflesRecursively eventually.
58564 if (NumOps == 2) {
58565 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58566 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58567 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58569 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58570 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58571 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58572 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58573 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58574 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58575 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58576 // Only concatenate subvector high halves, which vperm2x128 handles best,
58577 // or cases where it should fold into a subvector broadcast.
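// Illustrative: concatenating the high halves of two 256-bit sources selects
// lane 1 of each input, which encodes as the VPERM2X128 immediate 0x31.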
58578 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58579 SrcVT1.is256BitVector()) {
58580 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58581 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58582 "Bad subvector index");
58583 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58584 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58585 unsigned Index = 0;
58586 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58587 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58588 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58589 DAG.getBitcast(VT, Src0.getOperand(0)),
58590 DAG.getBitcast(VT, Src1.getOperand(0)),
58591 DAG.getTargetConstant(Index, DL, MVT::i8));
58592 }
58593 }
58594 // Widen extract_subvector
58595 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58596 // --> extract_subvector(x,lo)
58597 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58598 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58599 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58600 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58601 return DAG.getBitcast(VT,
58603 Src0.getConstantOperandVal(1),
58604 DAG, DL, VT.getSizeInBits()));
58605 }
58606 }
58607 }
58608
58609 // Repeated opcode.
58610 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58611 // but it currently struggles with different vector widths.
58612 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58613 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58614 })) {
58615 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58617 for (SDValue SubOp : SubOps)
58618 Subs.push_back(SubOp.getOperand(I));
58619 // Attempt to peek through bitcasts and concat the original subvectors.
58620 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58621 if (SubVT.isSimple() && SubVT.isVector()) {
58622 MVT ConcatVT =
58624 SubVT.getVectorElementCount() * Subs.size());
58625 for (SDValue &Sub : Subs)
58626 Sub = DAG.getBitcast(SubVT, Sub);
58627 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58628 Subtarget, Depth + 1))
58629 return DAG.getBitcast(VT, ConcatSrc);
58630 return DAG.getBitcast(
58631 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58632 }
58633 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58634 };
58635 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58636 bool AllConstants = true;
58637 bool AllSubs = true;
58638 unsigned VecSize = VT.getSizeInBits();
58639 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58640 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58641 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58642 }))
58643 return true;
58644 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58645 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58646 unsigned SubSize = BC.getValueSizeInBits();
58647 unsigned EltSize = BC.getScalarValueSizeInBits();
58648 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58650 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58651 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58652 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58653 }
58654 return AllConstants || AllSubs;
58655 };
58656 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58657 bool AllConstants = true;
58659 for (SDValue SubOp : SubOps) {
58660 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58661 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58663 Subs.push_back(SubOp.getOperand(I));
58664 }
58665 if (AllConstants)
58666 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58667 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58668 };
58669
58670 unsigned Opcode = Op0.getOpcode();
58671 switch (Opcode) {
58672 case ISD::BITCAST: {
58673 // TODO: Support AVX1/AVX2 bitcasts.
58675 for (SDValue SubOp : Ops)
58676 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58677 EVT InnerVT = SubOps[0].getValueType();
58678 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58679 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58680 (Subtarget.hasBWI() ||
58681 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58682 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58683 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58684 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58685 return Op.getValueType() == InnerVT;
58686 })) {
58687 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58688 MVT ConcatVT = MVT::getVectorVT(
58689 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58690 if (SDValue ConcatSrc = combineConcatVectorOps(
58691 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58692 return DAG.getBitcast(VT, ConcatSrc);
58693 }
58694 break;
58695 }
58696 case ISD::VECTOR_SHUFFLE: {
58697 // TODO: Generalize NumOps support.
58698 if (!IsSplat && NumOps == 2 &&
58699 ((VT.is256BitVector() &&
58700 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58701 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58702 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58703 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58704 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58705 if (Concat0 || Concat1 ||
58706 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58707 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58708 Subtarget.hasVBMI())) {
58709 int NumSubElts = Op0.getValueType().getVectorNumElements();
58710 SmallVector<int> NewMask;
58711 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58712 M = M >= NumSubElts ? M + NumSubElts : M;
58713 NewMask.push_back(M);
58714 }
58715 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58716 if (0 <= M)
58717 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58718 NewMask.push_back(M);
58719 }
58720 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58721 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58722 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58723 }
58724 }
58725 break;
58726 }
58727 case X86ISD::VBROADCAST: {
58728 // TODO: 512-bit VBROADCAST concatenation.
58729 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58730 return Op.getOperand(0).getValueType().is128BitVector();
58731 })) {
58732 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58733 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58734 ConcatSubOperand(VT, Ops, 0),
58735 ConcatSubOperand(VT, Ops, 0));
58736 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58737 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58738 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58740 DL, VT, ConcatSubOperand(VT, Ops, 0),
58741 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58742 }
58743 break;
58744 }
58745 case X86ISD::MOVDDUP:
58746 case X86ISD::MOVSHDUP:
58747 case X86ISD::MOVSLDUP: {
58748 if (!IsSplat && (VT.is256BitVector() ||
58749 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58750 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58751 break;
58752 }
58753 case X86ISD::SHUFP: {
58754 if (!IsSplat &&
58755 (VT == MVT::v8f32 ||
58756 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58757 llvm::all_of(Ops, [Op0](SDValue Op) {
58758 return Op.getOperand(2) == Op0.getOperand(2);
58759 })) {
58760 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58761 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58762 if (Concat0 || Concat1)
58763 return DAG.getNode(Opcode, DL, VT,
58764 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58765 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58766 Op0.getOperand(2));
58767 }
58768 break;
58769 }
58770 case X86ISD::UNPCKH:
58771 case X86ISD::UNPCKL: {
58772 // TODO: UNPCK should use CombineSubOperand
58773 // Don't concatenate build_vector patterns.
58774 if (!IsSplat &&
58775 ((VT.is256BitVector() &&
58776 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58777 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58778 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58779 none_of(Ops, [](SDValue Op) {
58780 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58782 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58784 })) {
58785 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58786 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58787 if (Concat0 || Concat1 ||
58788 (Subtarget.hasInt256() && EltSizeInBits == 64))
58789 return DAG.getNode(Opcode, DL, VT,
58790 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58791 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58792 }
58793 break;
58794 }
58795 case X86ISD::PSHUFHW:
58796 case X86ISD::PSHUFLW:
58797 case X86ISD::PSHUFD:
58798 if (!IsSplat &&
58799 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58800 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58801 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58802 llvm::all_of(Ops, [Op0](SDValue Op) {
58803 return Op.getOperand(1) == Op0.getOperand(1);
58804 })) {
58805 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58806 Op0.getOperand(1));
58807 }
58808 [[fallthrough]];
58809 case X86ISD::VPERMILPI:
58810 if (!IsSplat && EltSizeInBits == 32 &&
58811 (VT.is256BitVector() ||
58812 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58813 all_of(Ops, [&Op0](SDValue Op) {
58814 return Op0.getOperand(1) == Op.getOperand(1);
58815 })) {
58816 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58817 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58818 Res =
58819 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58820 return DAG.getBitcast(VT, Res);
58821 }
58822 break;
58823 case X86ISD::VPERMILPV:
58824 if (!IsSplat && (VT.is256BitVector() ||
58825 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58826 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58827 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58828 if (Concat0 || Concat1)
58829 return DAG.getNode(Opcode, DL, VT,
58830 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58831 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58832 }
58833 break;
58834 case X86ISD::PSHUFB:
58835 case X86ISD::PSADBW:
58836 case X86ISD::VPMADDUBSW:
58837 case X86ISD::VPMADDWD:
58838 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58839 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58840 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58841 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58842 NumOps * SrcVT.getVectorNumElements());
58843 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58844 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58845 if (Concat0 || Concat1)
58846 return DAG.getNode(
58847 Opcode, DL, VT,
58848 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58849 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58850 }
58851 break;
58852 case X86ISD::VPERMV:
58853 // TODO: Handle 256-bit and NumOps == 4 cases.
58854 if (!IsSplat && NumOps == 2 &&
58855 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58856 MVT OpVT = Op0.getSimpleValueType();
58857 int NumSrcElts = OpVT.getVectorNumElements();
58858 SmallVector<int, 64> ConcatMask;
58859 for (unsigned i = 0; i != NumOps; ++i) {
58860 SmallVector<int, 64> SubMask;
58862 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58863 break;
58864 for (int M : SubMask) {
58865 if (0 <= M)
58866 M += i * NumSrcElts;
58867 ConcatMask.push_back(M);
58868 }
58869 }
58870 if (ConcatMask.size() == (NumOps * NumSrcElts))
58871 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58872 ConcatSubOperand(VT, Ops, 1),
58873 DAG.getUNDEF(VT), Subtarget, DAG);
58874 }
58875 break;
58876 case X86ISD::VPERMV3:
58877 // TODO: Handle 256-bit and NumOps == 4 cases.
58878 if (!IsSplat && NumOps == 2 &&
58879 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58880 MVT OpVT = Op0.getSimpleValueType();
58881 int NumSrcElts = OpVT.getVectorNumElements();
58882 SmallVector<int, 64> ConcatMask;
58883 for (unsigned i = 0; i != NumOps; ++i) {
58884 SmallVector<int, 64> SubMask;
58886 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58887 break;
58888 for (int M : SubMask) {
58889 if (0 <= M) {
58890 int Src = M < NumSrcElts ? 0 : 2;
58891 M += M < NumSrcElts ? 0 : NumSrcElts;
58892
58893 // Reference the lowest sub if the upper sub is the same.
58894 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58895 M += i * NumSrcElts;
58896 }
58897 ConcatMask.push_back(M);
58898 }
58899 }
58900 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58901 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58902 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58903 if (Concat0 || Concat1)
58904 return lowerShuffleWithPERMV(
58905 DL, VT, ConcatMask,
58906 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58907 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58908 DAG);
58909 }
58910 }
58911 break;
58912 case X86ISD::VPERM2X128: {
58913 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58914 assert(NumOps == 2 && "Bad concat_vectors operands");
58915 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58916 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58917 // TODO: Handle zero'd subvectors.
58918 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58919 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58920 (int)((Imm1 >> 4) & 0x3)};
58921 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58922 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58923 Ops[0].getOperand(1), DAG, DL);
58924 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58925 Ops[1].getOperand(1), DAG, DL);
58926 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58927 DAG.getBitcast(ShuffleVT, LHS),
58928 DAG.getBitcast(ShuffleVT, RHS),
58929 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58930 return DAG.getBitcast(VT, Res);
58931 }
58932 }
58933 break;
58934 }
58935 case X86ISD::SHUF128: {
58936 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58937 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58938 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58939 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58940 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58941 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58942 Ops[0].getOperand(1), DAG, DL);
58943 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58944 Ops[1].getOperand(1), DAG, DL);
58945 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58946 DAG.getTargetConstant(Imm, DL, MVT::i8));
58947 }
58948 break;
58949 }
58950 case ISD::TRUNCATE:
58951 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58952 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58953 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58954 SrcVT == Ops[1].getOperand(0).getValueType() &&
58955 Subtarget.useAVX512Regs() &&
58956 Subtarget.getPreferVectorWidth() >= 512 &&
58957 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58958 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58959 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58960 ConcatSubOperand(NewSrcVT, Ops, 0));
58961 }
58962 }
58963 break;
58964 case ISD::ANY_EXTEND:
58965 case ISD::SIGN_EXTEND:
58966 case ISD::ZERO_EXTEND:
58967 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58968 if (!IsSplat && NumOps == 2 &&
58969 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58970 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58971 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58972 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58973 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58974 SrcVT == Ops[1].getOperand(0).getValueType()) {
58975 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58976 return DAG.getNode(Opcode, DL, VT,
58977 ConcatSubOperand(NewSrcVT, Ops, 0));
58978 }
58979 }
58980 break;
58984 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58985 if (!IsSplat && NumOps == 2 &&
58986 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58987 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58988 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58990 Op0.getOperand(0).getValueType() ==
58991 Ops[0].getOperand(0).getValueType()) {
58992 EVT SrcVT = Op0.getOperand(0).getValueType();
58993 unsigned NumElts = VT.getVectorNumElements();
58994 MVT UnpackSVT =
58995 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58996 MVT UnpackVT =
58997 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58998 SDValue Unpack =
58999 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
59000 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
59001 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
59002 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
59003 DAG.getBitcast(SrcVT, Unpack), DAG);
59004 }
59005 break;
59006 }
59007 case X86ISD::VSHLI:
59008 case X86ISD::VSRLI:
59009 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
59010 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
59011 llvm::all_of(Ops, [](SDValue Op) {
59012 return Op.getConstantOperandAPInt(1) == 32;
59013 })) {
59014 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
59015 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
59016 Res = DAG.getBitcast(MVT::v8i32, Res);
59017 if (Opcode == X86ISD::VSHLI) {
59018 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59019 {8, 0, 8, 2, 8, 4, 8, 6});
59020 } else {
59021 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59022 {1, 8, 3, 8, 5, 8, 7, 8});
59023 }
59024 return DAG.getBitcast(VT, Res);
59025 }
59026 }
59027 [[fallthrough]];
59028 case X86ISD::VSRAI:
59029 case X86ISD::VSHL:
59030 case X86ISD::VSRL:
59031 case X86ISD::VSRA:
59032 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
59033 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59034 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
59035 llvm::all_of(Ops, [Op0](SDValue Op) {
59036 return Op0.getOperand(1) == Op.getOperand(1);
59037 })) {
59038 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59039 Op0.getOperand(1));
59040 }
59041 break;
59042 case X86ISD::VPERMI:
59043 case X86ISD::VROTLI:
59044 case X86ISD::VROTRI:
59045 if (!IsSplat &&
59046 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59047 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59048 llvm::all_of(Ops, [Op0](SDValue Op) {
59049 return Op0.getOperand(1) == Op.getOperand(1);
59050 })) {
59051 assert(!(Opcode == X86ISD::VPERMI &&
59052 Op0.getValueType().is128BitVector()) &&
59053 "Illegal 128-bit X86ISD::VPERMI nodes");
59054 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59055 Op0.getOperand(1));
59056 }
59057 break;
59058 case ISD::AND:
59059 case ISD::OR:
59060 case ISD::XOR:
59061 case X86ISD::ANDNP:
59062 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59063 if (!IsSplat && (VT.is256BitVector() ||
59064 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59065 // Don't concatenate root AVX1 NOT patterns.
59066 // TODO: Allow NOT folding if Concat0 succeeds.
59067 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59068 llvm::all_of(Ops, [](SDValue X) {
59069 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59070 }))
59071 break;
59072 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59073 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59074 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59075 return DAG.getNode(Opcode, DL, VT,
59076 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59077 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59078 }
59079 break;
59080 case X86ISD::PCMPEQ:
59081 case X86ISD::PCMPGT:
59082 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59083 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59084 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59085 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59086 if (Concat0 || Concat1)
59087 return DAG.getNode(Opcode, DL, VT,
59088 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59089 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59090 break;
59091 }
59092
59093 if (!IsSplat && VT == MVT::v8i32) {
59094 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59095 // TODO: Handle v4f64 as well?
59096 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59097 for (unsigned I = 0; I != NumOps; ++I) {
59098 MaxSigBitsLHS =
59099 std::max(MaxSigBitsLHS,
59100 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59101 MaxSigBitsRHS =
59102 std::max(MaxSigBitsRHS,
59103 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59104 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59105 break;
59106 }
59107
59108 ISD::CondCode ICC =
59109 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59110 ISD::CondCode FCC =
59112
59113 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59114 MVT FpVT = VT.changeVectorElementType(FpSVT);
59115
59116 if (std::optional<unsigned> CastOpc =
59117 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59118 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59119 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59120 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59121 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59122 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59123 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59124
59125 bool IsAlwaysSignaling;
59126 unsigned FSETCC =
59127 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59128 return DAG.getBitcast(
59129 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59130 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59131 }
59132 }
59133 break;
59134 case ISD::CTPOP:
59135 case ISD::CTTZ:
59136 case ISD::CTLZ:
59139 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59140 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59141 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59142 }
59143 break;
59145 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59146 if (!IsSplat &&
59147 (VT.is256BitVector() ||
59148 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59149 llvm::all_of(Ops, [Op0](SDValue Op) {
59150 return Op0.getOperand(2) == Op.getOperand(2);
59151 })) {
59152 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59153 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59154 }
59155 break;
59156 case ISD::ADD:
59157 case ISD::SUB:
59158 case ISD::MUL:
59159 // TODO: Add more integer binops?
59160 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59161 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59162 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59163 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59164 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59165 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59166 return Op.getOperand(0) == Op.getOperand(1);
59167 }))
59168 return DAG.getNode(Opcode, DL, VT,
59169 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59170 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59171 }
59172 break;
59173 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
59174 // their latency is short, only concatenate them when doing so does not
59175 // introduce extra VINSERTs.
59176 case ISD::FADD:
59177 case ISD::FSUB:
59178 case ISD::FMUL:
59179 if (!IsSplat && (VT.is256BitVector() ||
59180 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59181 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59182 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59183 if (Concat0 || Concat1)
59184 return DAG.getNode(Opcode, DL, VT,
59185 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59186 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59187 }
59188 break;
59189 // Always prefer to concatenate high latency FDIV instructions.
59190 case ISD::FDIV:
59191 if (!IsSplat && (VT.is256BitVector() ||
59192 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59193 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59194 ConcatSubOperand(VT, Ops, 1));
59195 }
59196 break;
59197 case X86ISD::HADD:
59198 case X86ISD::HSUB:
59199 case X86ISD::FHADD:
59200 case X86ISD::FHSUB:
59201 if (!IsSplat && VT.is256BitVector() &&
59202 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59203 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59204 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59205 if (Concat0 || Concat1)
59206 return DAG.getNode(Opcode, DL, VT,
59207 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59208 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59209 }
59210 break;
59211 case X86ISD::PACKSS:
59212 case X86ISD::PACKUS:
59213 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59214 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59215 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59216 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59217 NumOps * SrcVT.getVectorNumElements());
59218 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59219 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59220 if (Concat0 || Concat1)
59221 return DAG.getNode(
59222 Opcode, DL, VT,
59223 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59224 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59225 }
59226 break;
59227 case X86ISD::VSHLD:
59228 case X86ISD::VSHRD:
59229 case X86ISD::PALIGNR:
59230 if (!IsSplat &&
59231 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59232 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59233 llvm::all_of(Ops, [Op0](SDValue Op) {
59234 return Op0.getOperand(2) == Op.getOperand(2);
59235 })) {
59236 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59237 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59238 if (Concat0 || Concat1)
59239 return DAG.getNode(Opcode, DL, VT,
59240 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59241 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59242 Op0.getOperand(2));
59243 }
59244 break;
59245 case X86ISD::BLENDI:
59246 if (VT.is256BitVector() && NumOps == 2 &&
59247 (EltSizeInBits >= 32 ||
59248 (Subtarget.hasInt256() &&
59249 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59250 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59251 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59252 if (Concat0 || Concat1) {
59253 unsigned NumElts = VT.getVectorNumElements();
59254 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59255 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59256 Mask = Mask.zextOrTrunc(8);
59257 return DAG.getNode(Opcode, DL, VT,
59258 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59259 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59260 DAG.getTargetConstant(Mask, DL, MVT::i8));
59261 }
59262 }
59263 // TODO: BWI targets should only use CombineSubOperand.
59264 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59265 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59266 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59267 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59268 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59269 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59270 unsigned NumElts = VT.getVectorNumElements();
59271 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59272 for (unsigned I = 1; I != NumOps; ++I)
59273 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59274 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59275 Mask = Mask.zextOrTrunc(NumMaskBits);
59276 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59277 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59278 SDValue Sel =
59279 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59280 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59281 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59282 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59283 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59284 }
59285 }
59286 break;
59287 case ISD::VSELECT:
59288 // TODO: VSELECT should use CombineSubOperand.
59289 if (!IsSplat && Subtarget.hasAVX512() &&
59290 (VT.is256BitVector() ||
59291 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59292 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59293 EVT SelVT = Ops[0].getOperand(0).getValueType();
59294 if (SelVT.getVectorElementType() == MVT::i1) {
59295 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59296 NumOps * SelVT.getVectorNumElements());
59297 if (TLI.isTypeLegal(SelVT))
59298 return DAG.getNode(
59299 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59300 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59301 }
59302 }
59303 [[fallthrough]];
59304 case X86ISD::BLENDV:
59305 // TODO: BLENDV should use CombineSubOperand.
59306 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59307 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59308 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59309 EVT SelVT = Ops[0].getOperand(0).getValueType();
59310 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59311 if (TLI.isTypeLegal(SelVT))
59312 return DAG.getNode(
59313 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59314 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59315 }
59316 break;
59317 }
59318 }
59319
59320 // Fold subvector loads into one.
59321 // If needed, look through bitcasts to get to the load.
59322 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59323 unsigned Fast;
59324 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59325 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59326 *FirstLd->getMemOperand(), &Fast) &&
59327 Fast) {
59328 if (SDValue Ld =
59329 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59330 return Ld;
59331 }
59332 }
59333
59334 // Attempt to fold target constant loads.
59335 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59336 SmallVector<APInt> EltBits;
59337 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59338 for (unsigned I = 0; I != NumOps; ++I) {
59339 APInt OpUndefElts;
59340 SmallVector<APInt> OpEltBits;
59341 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59342 OpEltBits, /*AllowWholeUndefs*/ true,
59343 /*AllowPartialUndefs*/ false))
59344 break;
59345 EltBits.append(OpEltBits);
59346 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59347 }
59348 if (EltBits.size() == VT.getVectorNumElements()) {
59349 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59350 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59351 SDValue CV = DAG.getConstantPool(C, PVT);
59354 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59355 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59357 return Ld;
59358 }
59359 }
59360
59361 // If this simple subvector or scalar/subvector broadcast_load is inserted
59362 // into both halves, use a larger broadcast_load. Update other uses to use
59363 // an extracted subvector.
59364 if (IsSplat &&
59365 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59366 if (ISD::isNormalLoad(Op0.getNode()) ||
59369 auto *Mem = cast<MemSDNode>(Op0);
59370 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59373 if (SDValue BcastLd =
59374 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59375 SDValue BcastSrc =
59376 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59377 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59378 return BcastLd;
59379 }
59380 }
59381 }
59382
59383 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59384 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59385 Subtarget.useAVX512Regs()) {
59386 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59387 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59388 Res = DAG.getBitcast(ShuffleVT, Res);
59389 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59390 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59391 return DAG.getBitcast(VT, Res);
59392 }
59393
59394 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
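// Illustrative: for v4f64 with per-lane v2f64 masks {1,0} and {0,1}, one mask
// bit is collected per element, giving SHUFPDMask == 0b1001.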
59395 if (!IsSplat &&
59396 ((NumOps == 2 && VT == MVT::v4f64) ||
59397 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59398 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59399 // Collect the individual per-lane v2f64/v4f64 shuffles.
59400 MVT OpVT = Ops[0].getSimpleValueType();
59401 unsigned NumOpElts = OpVT.getVectorNumElements();
59404 if (all_of(seq<int>(NumOps), [&](int I) {
59405 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59406 Depth + 1) &&
59407 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59408 none_of(SrcMasks[I], isUndefOrZero) &&
59409 SrcMasks[I].size() == NumOpElts &&
59410 all_of(SrcOps[I], [&OpVT](SDValue V) {
59411 return V.getValueType() == OpVT;
59412 });
59413 })) {
59414 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59415 bool Unary = true;
59416 unsigned SHUFPDMask = 0;
59418 for (unsigned I = 0; I != NumOps; ++I) {
59419 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59420 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59421 Unary &= LHS[I] == RHS[I];
59422 for (unsigned J = 0; J != NumOpElts; ++J)
59423 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59424 }
59425 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59426 // PERMILPD mask and we can always profitably concatenate them.
59427 SDValue Concat0 =
59428 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59429 SDValue Concat1 =
59430 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59431 if (Unary || Concat0 || Concat1) {
59432 Concat0 =
59433 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59434 Concat1 =
59435 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59436 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59437 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59438 }
59439 }
59440 }
59441
59442 return SDValue();
59443}
59444
59447 const X86Subtarget &Subtarget) {
59448 EVT VT = N->getValueType(0);
59449 EVT SrcVT = N->getOperand(0).getValueType();
59450 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59452
59453 if (VT.getVectorElementType() == MVT::i1) {
59454 // Attempt to constant fold.
59455 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59457 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59459 if (!C) break;
59460 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59461 if (I == (E - 1)) {
59462 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59463 if (TLI.isTypeLegal(IntVT))
59464 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59465 }
59466 }
59467
59468 // Don't do anything else for i1 vectors.
59469 return SDValue();
59470 }
59471
59472 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59473 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59474 Subtarget))
59475 return R;
59476 }
59477
59478 return SDValue();
59479}
59480
59483 const X86Subtarget &Subtarget) {
59484 if (DCI.isBeforeLegalizeOps())
59485 return SDValue();
59486
59487 MVT OpVT = N->getSimpleValueType(0);
59488
59489 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59490
59491 SDLoc dl(N);
59492 SDValue Vec = N->getOperand(0);
59493 SDValue SubVec = N->getOperand(1);
59494
59495 uint64_t IdxVal = N->getConstantOperandVal(2);
59496 MVT SubVecVT = SubVec.getSimpleValueType();
59497 int VecNumElts = OpVT.getVectorNumElements();
59498 int SubVecNumElts = SubVecVT.getVectorNumElements();
59499
59500 if (Vec.isUndef() && SubVec.isUndef())
59501 return DAG.getUNDEF(OpVT);
59502
59503 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59504 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59505 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59506 return getZeroVector(OpVT, Subtarget, DAG, dl);
59507
59509 // If we're inserting into a zero vector and then into a larger zero vector,
59510 // just insert into the larger zero vector directly.
59511 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59513 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59514 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59515 getZeroVector(OpVT, Subtarget, DAG, dl),
59516 SubVec.getOperand(1),
59517 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59518 }
59519
59520 // If we're inserting into a zero vector and our input was extracted from an
59521 // insert into a zero vector of the same type, and the extraction was at
59522 // least as large as the original insertion, just insert the original
59523 // subvector into a zero vector.
59524 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59525 isNullConstant(SubVec.getOperand(1)) &&
59527 SDValue Ins = SubVec.getOperand(0);
59528 if (isNullConstant(Ins.getOperand(2)) &&
59529 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59530 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59531 SubVecVT.getFixedSizeInBits())
59532 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59533 getZeroVector(OpVT, Subtarget, DAG, dl),
59534 Ins.getOperand(1), N->getOperand(2));
59535 }
59536 }
59537
59538 // Stop here if this is an i1 vector.
59539 if (IsI1Vector)
59540 return SDValue();
59541
59542 // Eliminate an intermediate vector widening:
59543 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59544 // insert_subvector X, Y, Idx
59545 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59546 // there?
59547 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59548 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59549 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59550 SubVec.getOperand(1), N->getOperand(2));
59551
59552 // If this is an insert of an extract, combine to a shuffle. Don't do this
59553 // if the insert or extract can be represented with a subregister operation.
59554 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59555 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59556 (IdxVal != 0 ||
59557 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59558 SDValue ExtSrc = SubVec.getOperand(0);
59559 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59560 // Create a shuffle mask matching the extraction and insertion.
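// Illustrative: inserting a v4 subvector extracted from index 4 of ExtSrc at
// index 0 of a v8 Vec builds the mask {12,13,14,15, 4,5,6,7}, i.e. ExtSrc's
// high half followed by Vec's high half.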
59561 SmallVector<int, 64> Mask(VecNumElts);
59562 std::iota(Mask.begin(), Mask.end(), 0);
59563 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59564 ExtIdxVal + VecNumElts);
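// For example, with VecNumElts = 8, SubVecNumElts = 4, IdxVal = 4 and
// ExtIdxVal = 0, this builds the mask <0,1,2,3,8,9,10,11>: keep the low half
// of Vec and take the low half of ExtSrc.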
59565 if (ExtIdxVal != 0)
59566 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59567 // See if we can use a blend instead of extract/insert pair.
59568 SmallVector<int, 64> BlendMask(VecNumElts);
59569 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59570 std::iota(BlendMask.begin() + IdxVal,
59571 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59572 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59573 VecNumElts == (2 * SubVecNumElts)) {
59574 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59575 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
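// The v8f32 blend immediate selects ExtSrc for the lanes being replaced:
// 0x0F takes lanes 0-3 (inserting at index 0), 0xF0 takes lanes 4-7
// (inserting into the upper half); the remaining lanes come from Vec.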
59576 SDValue Blend = DAG.getNode(
59577 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59578 DAG.getBitcast(MVT::v8f32, ExtSrc),
59579 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59580 return DAG.getBitcast(OpVT, Blend);
59581 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59582 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59583 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59584 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59585 SDValue Shuffle =
59586 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59587 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59588 return DAG.getBitcast(OpVT, Shuffle);
59589 }
59590 }
59591 }
59592
59593 // Match concat_vector style patterns.
59594 SmallVector<SDValue, 2> SubVectorOps;
59595 if (collectConcatOps(N, SubVectorOps, DAG)) {
59596 if (SDValue Fold =
59597 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59598 return Fold;
59599
59600 // If we're inserting all zeros into the upper half, change this to
59601 // a concat with zero. We will match this to a move
59602 // with implicit upper bit zeroing during isel.
59603 // We do this here because we don't want combineConcatVectorOps to
59604 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59605 if (SubVectorOps.size() == 2 &&
59606 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59607 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59608 getZeroVector(OpVT, Subtarget, DAG, dl),
59609 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59610
59611 // Attempt to recursively combine to a shuffle.
59612 if (all_of(SubVectorOps, [](SDValue SubOp) {
59614 })) {
59615 SDValue Op(N, 0);
59616 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59617 return Res;
59618 }
59619 }
59620
59621 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59622 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59623 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59624
59625 // If this is a broadcast load inserted into an upper undef, use a larger
59626 // broadcast load.
59627 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59628 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59629 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59630 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59631 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59632 }
59633
59634 // If we're splatting the lower half subvector of a full vector load into the
59635 // upper half, attempt to create a subvector broadcast.
59636 if ((int)IdxVal == (VecNumElts / 2) &&
59637 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59638 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59639 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59640 if (VecLd && SubLd &&
59641 DAG.areNonVolatileConsecutiveLoads(
59642 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59643 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59644 SubVecVT, SubLd, 0, DAG);
59645 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59646 BcastLd, DAG.getVectorIdxConstant(0, dl));
59647 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59648 return BcastLd;
59649 }
59650 }
59651
59652 // Attempt to constant fold (if we're not widening).
59653 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59654 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59655 APInt VecUndefElts, SubUndefElts;
59656 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59657 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59658 VecEltBits) &&
59659 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59660 SubEltBits)) {
59661 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59662 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59663 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59664 }
59665 }
59666
59667 // Attempt to recursively combine to a shuffle.
59670 SDValue Op(N, 0);
59671 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59672 return Res;
59673 }
59674
59675 // Match insertion of subvector load that perfectly aliases a base load.
59676 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59677 ISD::isNormalLoad(SubVec.getNode()) &&
59678 DAG.areNonVolatileConsecutiveLoads(
59679 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59680 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59681 return Vec;
59682
59683 return SDValue();
59684}
59685
59686/// If we are extracting a subvector of a vector select and the select condition
59687/// is composed of concatenated vectors, try to narrow the select width. This
59688/// is a common pattern for AVX1 integer code because 256-bit selects may be
59689/// legal, but there is almost no integer math/logic available for 256-bit.
59690/// This function should only be called with legal types (otherwise, the calls
59691/// to get simple value types will assert).
59692 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59693 SelectionDAG &DAG) {
59694 SDValue Sel = Ext->getOperand(0);
59695 if (Sel.getOpcode() != ISD::VSELECT ||
59696 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59697 return SDValue();
59698
59699 // Note: We assume simple value types because this should only be called with
59700 // legal operations/types.
59701 // TODO: This can be extended to handle extraction to 256-bits.
59702 MVT VT = Ext->getSimpleValueType(0);
59703 if (!VT.is128BitVector())
59704 return SDValue();
59705
59706 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59707 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59708 return SDValue();
59709
59710 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59711 MVT SelVT = Sel.getSimpleValueType();
59712 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59713 "Unexpected vector type with legal operations");
59714
59715 unsigned SelElts = SelVT.getVectorNumElements();
59716 unsigned CastedElts = WideVT.getVectorNumElements();
59717 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59718 if (SelElts % CastedElts == 0) {
59719 // The select has the same or more (narrower) elements than the extract
59720 // operand. The extraction index gets scaled by that factor.
59721 ExtIdx *= (SelElts / CastedElts);
59722 } else if (CastedElts % SelElts == 0) {
59723 // The select has fewer (wider) elements than the extract operand. Make sure
59724 // that the extraction index can be divided evenly.
59725 unsigned IndexDivisor = CastedElts / SelElts;
59726 if (ExtIdx % IndexDivisor != 0)
59727 return SDValue();
59728 ExtIdx /= IndexDivisor;
59729 } else {
59730 llvm_unreachable("Element count of simple vector types are not divisible?");
59731 }
59732
59733 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59734 unsigned NarrowElts = SelElts / NarrowingFactor;
59735 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59736 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59737 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59738 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59739 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59740 return DAG.getBitcast(VT, NarrowSel);
59741}
59742
59743 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59744 TargetLowering::DAGCombinerInfo &DCI,
59745 const X86Subtarget &Subtarget) {
59746 if (!N->getValueType(0).isSimple())
59747 return SDValue();
59748
59749 MVT VT = N->getSimpleValueType(0);
59750 SDValue InVec = N->getOperand(0);
59751 unsigned IdxVal = N->getConstantOperandVal(1);
59752 EVT InVecVT = InVec.getValueType();
59753 unsigned SizeInBits = VT.getSizeInBits();
59754 unsigned InSizeInBits = InVecVT.getSizeInBits();
59755 unsigned NumSubElts = VT.getVectorNumElements();
59756 unsigned NumInElts = InVecVT.getVectorNumElements();
59757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59758 SDLoc DL(N);
59759
59760 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59761 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59762 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59763 // We let generic combining take over from there to simplify the
59764 // insert/extract and 'not'.
59765 // This pattern emerges during AVX1 legalization. We handle it before lowering
59766 // to avoid complications like splitting constant vector loads.
59767 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59768 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59769 auto isConcatenatedNot = [](SDValue V) {
59770 V = peekThroughBitcasts(V);
59771 if (!isBitwiseNot(V))
59772 return false;
59773 SDValue NotOp = V->getOperand(0);
59774 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59775 };
59776 if (isConcatenatedNot(InVec.getOperand(0)) ||
59777 isConcatenatedNot(InVec.getOperand(1))) {
59778 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59779 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59780 splitVectorIntBinary(InVec, DAG, DL),
59781 N->getOperand(1));
59782 }
59783 }
59784
59785 if (DCI.isBeforeLegalizeOps())
59786 return SDValue();
59787
59788 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59789 return V;
59790
59791 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59792 return getZeroVector(VT, Subtarget, DAG, DL);
59793
59794 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59795 if (VT.getScalarType() == MVT::i1)
59796 return DAG.getConstant(1, DL, VT);
59797 return getOnesVector(VT, DAG, DL);
59798 }
59799
59800 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59801 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59802
59803 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
59804 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59805 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59806 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59807 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59808 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59809 }
59810
59811 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59812 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59813 // iff SUB is entirely contained in the extraction.
59814 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59815 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59816 SDValue Src = InVec.getOperand(0);
59817 SDValue Sub = InVec.getOperand(1);
59818 EVT SubVT = Sub.getValueType();
59819 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59820 if (IdxVal <= InsIdx &&
59821 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59822 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59823 DAG.getVectorIdxConstant(IdxVal, DL));
59824 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59825 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59826 }
59827 }
59828
59829 // If we're extracting an upper subvector, see if we'd get the same elements if
59830 // we extracted the lowest subvector instead, which should allow
59831 // SimplifyDemandedVectorElts to do more simplifications.
59832 if (IdxVal != 0) {
59833 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59834 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59835 });
59836 if (AllEquiv)
59837 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59838 }
59839
59840 // Check if we're extracting a whole broadcasted subvector.
59841 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59842 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59843 EVT MemVT = MemIntr->getMemoryVT();
59844 if (MemVT == VT) {
59845 // If this is the only use, we can replace with a regular load (this may
59846 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59847 // memory chain).
59848 if (InVec.hasOneUse()) {
59849 SDValue Ld =
59850 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59851 MemIntr->getMemOperand());
59852 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59853 return Ld;
59854 }
59855 }
59856 }
59857
59858 // Attempt to extract from the source of a shuffle vector.
59859 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59860 SmallVector<int, 32> ShuffleMask;
59861 SmallVector<int, 32> ScaledMask;
59862 SmallVector<SDValue, 2> ShuffleInputs;
59863 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59864 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59865 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59866 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59867 unsigned SubVecIdx = IdxVal / NumSubElts;
59868 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59869 return DAG.getUNDEF(VT);
59870 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59871 return getZeroVector(VT, Subtarget, DAG, DL);
59872 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59873 if (Src.getValueSizeInBits() == InSizeInBits) {
59874 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59875 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59876 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59877 DL, SizeInBits);
59878 }
59879 }
59880 }
59881
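// Helper: treat an extract as free when V is a one-use load, a constant build
// vector, or undef, so splitting the ops below won't introduce extra work.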
59882 auto IsExtractFree = [](SDValue V) {
59883 if (V.hasOneUse()) {
59884 V = peekThroughOneUseBitcasts(V);
59885 if (V.getOpcode() == ISD::LOAD)
59886 return true;
59887 }
59888 V = peekThroughBitcasts(V);
59889 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59890 return true;
59891 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59892 return true;
59893 return V.isUndef();
59894 };
59895
59896 // If we're extracting the lowest subvector and we're the only user,
59897 // we may be able to perform this with a smaller vector width.
59898 unsigned InOpcode = InVec.getOpcode();
59899 if (InVec.hasOneUse()) {
59900 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59901 // v2f64 CVTDQ2PD(v4i32).
59902 if (InOpcode == ISD::SINT_TO_FP &&
59903 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59904 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59905 }
59906 // v2f64 CVTUDQ2PD(v4i32).
59907 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59908 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59909 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59910 }
59911 // v2f64 CVTPS2PD(v4f32).
59912 if (InOpcode == ISD::FP_EXTEND &&
59913 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59914 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59915 }
59916 }
59917 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59918 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59919 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59920 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59921 Subtarget.hasVLX())) &&
59922 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59923 SDValue Src = InVec.getOperand(0);
59924 if (Src.getValueType().getScalarSizeInBits() == 32)
59925 return DAG.getNode(InOpcode, DL, VT,
59926 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59927 }
59928 if (IdxVal == 0 &&
59929 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59930 (SizeInBits == 128 || SizeInBits == 256) &&
59931 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59932 SDValue Ext = InVec.getOperand(0);
59933 if (Ext.getValueSizeInBits() > SizeInBits)
59934 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59935 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59936 return DAG.getNode(ExtOp, DL, VT, Ext);
59937 }
59938 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59939 InVec.getOperand(0).getValueType().is256BitVector() &&
59940 InVec.getOperand(1).getValueType().is256BitVector() &&
59941 InVec.getOperand(2).getValueType().is256BitVector()) {
59942 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59943 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59944 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59945 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59946 }
59947 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59948 (SizeInBits == 128 || SizeInBits == 256)) {
59949 SDValue InVecSrc = InVec.getOperand(0);
59950 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59951 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59952 return DAG.getNode(InOpcode, DL, VT, Ext);
59953 }
59954
59955 if (SizeInBits == 128 || SizeInBits == 256) {
59956 switch (InOpcode) {
59957 case X86ISD::MOVDDUP:
59958 return DAG.getNode(
59959 InOpcode, DL, VT,
59960 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59961 case X86ISD::PSHUFD:
59962 case X86ISD::VPERMILPI:
59963 if (InVec.getOperand(0).hasOneUse()) {
59964 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59965 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
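// 32-bit PSHUFD/VPERMILPS immediates repeat per 128-bit lane and can be reused
// directly; 64-bit VPERMILPD uses one bit per element, so drop the bits that
// belong to the elements below IdxVal.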
59966 return DAG.getNode(InOpcode, DL, VT,
59967 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59968 DL, SizeInBits),
59969 DAG.getTargetConstant(M, DL, MVT::i8));
59970 }
59971 break;
59972 case X86ISD::PCMPEQ:
59973 case X86ISD::PCMPGT:
59974 case X86ISD::UNPCKH:
59975 case X86ISD::UNPCKL:
59976 if (IsExtractFree(InVec.getOperand(0)) ||
59977 IsExtractFree(InVec.getOperand(1)))
59978 return DAG.getNode(InOpcode, DL, VT,
59979 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59980 DL, SizeInBits),
59981 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59982 DL, SizeInBits));
59983 break;
59984 case X86ISD::CMPP:
59985 if (IsExtractFree(InVec.getOperand(0)) ||
59986 IsExtractFree(InVec.getOperand(1)))
59987 return DAG.getNode(InOpcode, DL, VT,
59988 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59989 DL, SizeInBits),
59990 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59991 DL, SizeInBits),
59992 InVec.getOperand(2));
59993 break;
59994 case X86ISD::BLENDI:
59995 if (IsExtractFree(InVec.getOperand(0)) ||
59996 IsExtractFree(InVec.getOperand(1))) {
59997 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59998 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59999 return DAG.getNode(InOpcode, DL, VT,
60000 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
60001 DL, SizeInBits),
60002 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
60003 DL, SizeInBits),
60004 DAG.getTargetConstant(M, DL, MVT::i8));
60005 }
60006 break;
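// For variable permutes it is still worth extracting: narrow the mask to the
// wanted lanes, widen it back to the source width, redo the permute at full
// width, and take the low subvector of the result.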
60007 case X86ISD::VPERMV:
60008 if (IdxVal != 0) {
60009 SDValue Mask = InVec.getOperand(0);
60010 SDValue Src = InVec.getOperand(1);
60011 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60012 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60013 DL, InSizeInBits);
60014 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
60015 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60016 }
60017 break;
60018 case X86ISD::VPERMV3:
60019 if (IdxVal != 0) {
60020 SDValue Src0 = InVec.getOperand(0);
60021 SDValue Mask = InVec.getOperand(1);
60022 SDValue Src1 = InVec.getOperand(2);
60023 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60024 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60025 DL, InSizeInBits);
60026 SDValue Shuffle =
60027 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
60028 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60029 }
60030 break;
60031 }
60032 }
60033 }
60034
60035 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
60036 // as this is very likely to fold into a shuffle/truncation.
60037 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
60038 InVecVT.getScalarSizeInBits() == 64 &&
60039 InVec.getConstantOperandAPInt(1) == 32) {
60040 SDValue Ext =
60041 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
60042 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
60043 }
60044
60045 return SDValue();
60046}
60047
60048 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
60049 const X86Subtarget &Subtarget) {
60050 using namespace SDPatternMatch;
60051 EVT VT = N->getValueType(0);
60052 SDValue Src = N->getOperand(0);
60053 SDLoc DL(N);
60054
60055 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60056 // This occurs frequently in our masked scalar intrinsic code and our
60057 // floating point select lowering with AVX512.
60058 // TODO: SimplifyDemandedBits instead?
60059 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60060 isOneConstant(Src.getOperand(1)))
60061 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60062
60063 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60064 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60065 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60066 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60067 isNullConstant(Src.getOperand(1)))
60068 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60069 Src.getOperand(1));
60070
60071 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
60072 // TODO: Move to DAGCombine/SimplifyDemandedBits?
60073 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60074 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60075 if (Op.getValueType() != MVT::i64)
60076 return SDValue();
60077 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60078 if (Op.getOpcode() == Opc &&
60079 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60080 return Op.getOperand(0);
60081 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60082 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60083 if (Ld->getExtensionType() == Ext &&
60084 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60085 return Op;
60086 if (IsZeroExt) {
60087 KnownBits Known = DAG.computeKnownBits(Op);
60088 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60089 return Op;
60090 }
60091 return SDValue();
60092 };
60093
60094 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60095 return DAG.getBitcast(
60096 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60097 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60098
60099 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60100 return DAG.getBitcast(
60101 VT,
60102 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60103 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60104 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60105 }
60106
60107 if (Src.getOpcode() == ISD::BITCAST) {
60108 SDValue SrcOp = Src.getOperand(0);
60109 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60110 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60111 return DAG.getBitcast(
60112 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60113 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60114 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60115 return DAG.getBitcast(
60116 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60117 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60118 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60119 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60120 }
60121
60122 if (VT == MVT::v4i32) {
60123 SDValue HalfSrc;
60124 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60125 // to remove XMM->GPR->XMM moves.
60126 if (sd_match(Src, m_AnyExt(m_BitCast(
60127 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60128 return DAG.getBitcast(
60129 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60130 }
60131
60132 // See if we're broadcasting the scalar value, in which case just reuse that.
60133 // Ensure the same SDValue from the SDNode use is being used.
60134 if (VT.getScalarType() == Src.getValueType())
60135 for (SDNode *User : Src->users())
60136 if (User->getOpcode() == X86ISD::VBROADCAST &&
60137 Src == User->getOperand(0)) {
60138 unsigned SizeInBits = VT.getFixedSizeInBits();
60139 unsigned BroadcastSizeInBits =
60140 User->getValueSizeInBits(0).getFixedValue();
60141 if (BroadcastSizeInBits == SizeInBits)
60142 return SDValue(User, 0);
60143 if (BroadcastSizeInBits > SizeInBits)
60144 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60145 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60146 // coverage.
60147 }
60148
60149 // Check for cases where we've ended up with a scalarized shift, typically
60150 // during type legalization.
60151 switch (Src.getOpcode()) {
60152 case ISD::SHL:
60153 case ISD::SRL:
60154 case ISD::SRA:
60155 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60156 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60157 Src.hasOneUse()) {
60158 SDValue SrcVec =
60159 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60160 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60161 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60162 Amt->getZExtValue(), DAG);
60163 }
60164 }
60165 break;
60166 case ISD::FSHL:
60167 case ISD::FSHR:
60168 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60169 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60170 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60171 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60172 Src.hasOneUse()) {
60173 uint64_t AmtVal =
60174 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60175 SDValue SrcVec0 =
60176 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60177 SDValue SrcVec1 =
60178 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60179 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60180 DAG.getConstant(AmtVal, DL, VT));
60181 }
60182 }
60183 break;
60184 }
60185
60186 return SDValue();
60187}
60188
60189// Simplify PMULDQ and PMULUDQ operations.
60190 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60191 TargetLowering::DAGCombinerInfo &DCI,
60192 const X86Subtarget &Subtarget) {
60193 SDValue LHS = N->getOperand(0);
60194 SDValue RHS = N->getOperand(1);
60195
60196 // Canonicalize constant to RHS.
60197 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60198 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60199 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60200
60201 // Multiply by zero.
60202 // Don't return RHS as it may contain UNDEFs.
60203 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60204 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60205
60206 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60208 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60209 return SDValue(N, 0);
60210
60211 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60212 // convert it to any_extend_invec, due to the LegalOperations check, do the
60213 // conversion directly to a vector shuffle manually. This exposes combine
60214 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60215 // combineX86ShufflesRecursively on SSE4.1 targets.
60216 // FIXME: This is basically a hack around several other issues related to
60217 // ANY_EXTEND_VECTOR_INREG.
60218 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60219 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60220 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60221 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60222 SDLoc dl(N);
60223 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60224 LHS.getOperand(0), { 0, -1, 1, -1 });
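// The <0,-1,1,-1> shuffle places the two low i32 elements into the even lanes,
// which matches the any_extend_vector_inreg layout PMULDQ/PMULUDQ read after
// the bitcast to v2i64 (the odd lanes are don't-care).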
60225 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60226 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60227 }
60228 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60229 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60230 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60231 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60232 SDLoc dl(N);
60233 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60234 RHS.getOperand(0), { 0, -1, 1, -1 });
60235 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60236 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60237 }
60238
60239 return SDValue();
60240}
60241
60242// Simplify VPMADDUBSW/VPMADDWD operations.
60243 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60244 TargetLowering::DAGCombinerInfo &DCI) {
60245 MVT VT = N->getSimpleValueType(0);
60246 SDValue LHS = N->getOperand(0);
60247 SDValue RHS = N->getOperand(1);
60248 unsigned Opc = N->getOpcode();
60249 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60251 "Unexpected PMADD opcode");
60252
60253 // Multiply by zero.
60254 // Don't return LHS/RHS as it may contain UNDEFs.
60255 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60256 ISD::isBuildVectorAllZeros(RHS.getNode()))
60257 return DAG.getConstant(0, SDLoc(N), VT);
60258
60259 // Constant folding.
60260 APInt LHSUndefs, RHSUndefs;
60261 SmallVector<APInt> LHSBits, RHSBits;
60262 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60263 unsigned DstEltBits = VT.getScalarSizeInBits();
60264 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60265 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
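// Constant fold pairwise: VPMADDWD multiplies signed 16-bit pairs and adds
// with wrap, VPMADDUBSW multiplies unsigned*signed 8-bit pairs and adds with
// signed saturation.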
60266 SmallVector<APInt> Result;
60267 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60268 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60269 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60270 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60271 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60272 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60273 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60274 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60275 Result.push_back(Res);
60276 }
60277 return getConstVector(Result, VT, DAG, SDLoc(N));
60278 }
60279
60280 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60281 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60282 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60283 return SDValue(N, 0);
60284
60285 return SDValue();
60286}
60287
60288// Simplify VPMADD52L/VPMADD52H operations.
60289 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60290 TargetLowering::DAGCombinerInfo &DCI) {
60291 MVT VT = N->getSimpleValueType(0);
60292
60293 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60294 SDValue Op0 = N->getOperand(0);
60295 SDValue Op1 = N->getOperand(1);
60296 SDValue Op2 = N->getOperand(2);
60297 SDLoc DL(N);
60298
60299 APInt C0, C1;
60300 bool HasC0 = X86::isConstantSplat(Op0, C0),
60301 HasC1 = X86::isConstantSplat(Op1, C1);
60302
60303 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60304 if (HasC0 && !HasC1)
60305 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60306
60307 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60308 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
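// Twelve known-zero leading bits mean Op0 already fits in 52 bits, so the low
// 52-bit product of Op0 * 1 is just Op0 and the multiply-add reduces to a
// plain add.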
60309 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60310 if (KnownOp0.countMinLeadingZeros() >= 12)
60311 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60312 }
60313
60314 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60315 unsigned NumEltBits = VT.getScalarSizeInBits();
60316 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60317 DCI))
60318 return SDValue(N, 0);
60319
60320 return SDValue();
60321}
60322
60323 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60324 TargetLowering::DAGCombinerInfo &DCI,
60325 const X86Subtarget &Subtarget) {
60326 EVT VT = N->getValueType(0);
60327 SDValue In = N->getOperand(0);
60328 unsigned Opcode = N->getOpcode();
60329 unsigned InOpcode = In.getOpcode();
60330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60331 SDLoc DL(N);
60332
60333 // Try to merge vector loads and extend_inreg to an extload.
60334 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60335 In.hasOneUse()) {
60336 auto *Ld = cast<LoadSDNode>(In);
60337 if (Ld->isSimple()) {
60338 MVT SVT = In.getSimpleValueType().getVectorElementType();
60339 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60340 ? ISD::SEXTLOAD
60341 : ISD::ZEXTLOAD;
60342 EVT MemVT = VT.changeVectorElementType(SVT);
60343 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60344 SDValue Load = DAG.getExtLoad(
60345 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60346 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60347 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60348 return Load;
60349 }
60350 }
60351 }
60352
60353 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60354 if (Opcode == InOpcode)
60355 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60356
60357 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60358 // -> EXTEND_VECTOR_INREG(X).
60359 // TODO: Handle non-zero subvector indices.
60360 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60361 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60362 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60363 In.getValueSizeInBits())
60364 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60365
60366 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60367 // TODO: Move to DAGCombine?
60368 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60369 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60370 In.getValueSizeInBits() == VT.getSizeInBits()) {
60371 unsigned NumElts = VT.getVectorNumElements();
60372 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60373 EVT EltVT = In.getOperand(0).getValueType();
60374 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60375 for (unsigned I = 0; I != NumElts; ++I)
60376 Elts[I * Scale] = In.getOperand(I);
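// e.g. a zero_extend_vector_inreg to v4i32 of a v8i16 build_vector spreads
// each used source element into every other i16 slot and leaves zeros in
// between, so the bitcast back gives the zero-extended elements.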
60377 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60378 }
60379
60380 // Attempt to combine as a shuffle on SSE41+ targets.
60381 if (Subtarget.hasSSE41()) {
60382 SDValue Op(N, 0);
60383 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60384 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60385 return Res;
60386 }
60387
60388 return SDValue();
60389}
60390
60391 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60392 TargetLowering::DAGCombinerInfo &DCI) {
60393 EVT VT = N->getValueType(0);
60394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60395 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60396 return DAG.getConstant(0, SDLoc(N), VT);
60397
60398 // Fold kshiftr(extract_subvector(X,C1),C2)
60399 // --> extract_subvector(kshiftr(X,C1+C2),0)
60400 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60401 if (N->getOpcode() == X86ISD::KSHIFTR) {
60402 SDLoc DL(N);
60403 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60404 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60405 SDValue Src = N->getOperand(0).getOperand(0);
60406 uint64_t Amt = N->getConstantOperandVal(1) +
60407 N->getOperand(0).getConstantOperandVal(1);
60408 EVT SrcVT = Src.getValueType();
60409 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60410 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60411 DAG.getTargetConstant(Amt, DL, MVT::i8));
60412 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60413 DAG.getVectorIdxConstant(0, DL));
60414 }
60415 }
60416 }
60417
60418 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60419 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60420 return SDValue(N, 0);
60421
60422 return SDValue();
60423}
60424
60425// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60426 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
60427 // extra instructions between the conversions due to going to scalar and back.
60428 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60429 const X86Subtarget &Subtarget) {
60430 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60431 return SDValue();
60432
60433 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60434 return SDValue();
60435
60436 if (N->getValueType(0) != MVT::f32 ||
60437 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60438 return SDValue();
60439
60440 SDLoc dl(N);
60441 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60442 N->getOperand(0).getOperand(0));
60443 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60444 DAG.getTargetConstant(4, dl, MVT::i32));
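// The immediate 4 sets bit 2 of the CVTPS2PH control byte, i.e. round using
// the current MXCSR rounding mode rather than a fixed rounding mode.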
60445 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60446 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60447 DAG.getVectorIdxConstant(0, dl));
60448}
60449
60450 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60451 TargetLowering::DAGCombinerInfo &DCI,
60452 const X86Subtarget &Subtarget) {
60453 EVT VT = N->getValueType(0);
60454 bool IsStrict = N->isStrictFPOpcode();
60455 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60456 EVT SrcVT = Src.getValueType();
60457
60458 SDLoc dl(N);
60459 if (SrcVT.getScalarType() == MVT::bf16) {
60460 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60461 !IsStrict && Src.getOperand(0).getValueType() == VT)
60462 return Src.getOperand(0);
60463
60464 if (!SrcVT.isVector())
60465 return SDValue();
60466
60467 assert(!IsStrict && "Strict FP doesn't support BF16");
60468 if (VT.getVectorElementType() == MVT::f64) {
60469 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60470 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60471 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60472 }
60473 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
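// bf16 is the upper half of an f32 bit pattern, so extending to f32 is just a
// zero extension to i32 followed by a 16-bit left shift of the bits.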
60474 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60475 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60476 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60477 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60478 return DAG.getBitcast(VT, Src);
60479 }
60480
60481 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60482 return SDValue();
60483
60484 if (Subtarget.hasFP16())
60485 return SDValue();
60486
60487 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60488 return SDValue();
60489
60490 if (VT.getVectorElementType() != MVT::f32 &&
60491 VT.getVectorElementType() != MVT::f64)
60492 return SDValue();
60493
60494 unsigned NumElts = VT.getVectorNumElements();
60495 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60496 return SDValue();
60497
60498 // Convert the input to vXi16.
60499 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60500 Src = DAG.getBitcast(IntVT, Src);
60501
60502 // Widen to at least 8 input elements.
60503 if (NumElts < 8) {
60504 unsigned NumConcats = 8 / NumElts;
60505 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60506 : DAG.getConstant(0, dl, IntVT);
60507 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60508 Ops[0] = Src;
60509 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60510 }
60511
60512 // Destination is vXf32 with at least 4 elements.
60513 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60514 std::max(4U, NumElts));
60515 SDValue Cvt, Chain;
60516 if (IsStrict) {
60517 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60518 {N->getOperand(0), Src});
60519 Chain = Cvt.getValue(1);
60520 } else {
60521 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60522 }
60523
60524 if (NumElts < 4) {
60525 assert(NumElts == 2 && "Unexpected size");
60526 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60527 DAG.getVectorIdxConstant(0, dl));
60528 }
60529
60530 if (IsStrict) {
60531 // Extend to the original VT if necessary.
60532 if (Cvt.getValueType() != VT) {
60533 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60534 {Chain, Cvt});
60535 Chain = Cvt.getValue(1);
60536 }
60537 return DAG.getMergeValues({Cvt, Chain}, dl);
60538 }
60539
60540 // Extend to the original VT if necessary.
60541 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60542}
60543
60544// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60545 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60546 TargetLowering::DAGCombinerInfo &DCI) {
60547 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60548 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60549 "Unknown broadcast load type");
60550
60551 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60552 SDValue Ptr = MemIntrin->getBasePtr();
60553 SDValue Chain = MemIntrin->getChain();
60554 EVT VT = N->getSimpleValueType(0);
60555 EVT MemVT = MemIntrin->getMemoryVT();
60556
60557 // Look at other users of our base pointer and try to find a wider broadcast.
60558 // The input chain and the size of the memory VT must match.
60559 for (SDNode *User : Ptr->users())
60560 if (User != N && User->getOpcode() == N->getOpcode() &&
60561 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60562 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60563 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60564 MemVT.getSizeInBits() &&
60565 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60566 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60567 MemIntrin->isSimple() && "Illegal broadcast load type");
60569 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60570 VT.getSizeInBits());
60571 Extract = DAG.getBitcast(VT, Extract);
60572 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60573 return Extract;
60574 }
60575
60576 return SDValue();
60577}
60578
60579 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60580 const X86Subtarget &Subtarget) {
60581 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60582 return SDValue();
60583
60584 bool IsStrict = N->isStrictFPOpcode();
60585 EVT VT = N->getValueType(0);
60586 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60587 EVT SrcVT = Src.getValueType();
60588
60589 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60590 SrcVT.getVectorElementType() != MVT::f32)
60591 return SDValue();
60592
60593 SDLoc dl(N);
60594
60595 SDValue Cvt, Chain;
60596 unsigned NumElts = VT.getVectorNumElements();
60597 if (Subtarget.hasFP16()) {
60598 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60599 // v4f32 (xint_to_fp v4i64))))
60600 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60601 // v8f16 (CVTXI2P v4i64)))
60602 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60603 Src.getNumOperands() == 2) {
60604 SDValue Cvt0, Cvt1;
60605 SDValue Op0 = Src.getOperand(0);
60606 SDValue Op1 = Src.getOperand(1);
60607 bool IsOp0Strict = Op0->isStrictFPOpcode();
60608 if (Op0.getOpcode() != Op1.getOpcode() ||
60609 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60610 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60611 return SDValue();
60612 }
60613 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
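// Each CVTXI2P of a v4i64 produces its four meaningful f16 results in the low
// lanes, so the shuffle below stitches the two low halves back together.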
60614 if (IsStrict) {
60615 assert(IsOp0Strict && "Op0 must be strict node");
60616 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60617 ? X86ISD::STRICT_CVTSI2P
60618 : X86ISD::STRICT_CVTUI2P;
60619 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60620 {Op0.getOperand(0), Op0.getOperand(1)});
60621 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60622 {Op1.getOperand(0), Op1.getOperand(1)});
60623 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60624 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60625 }
60626 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60627 : X86ISD::CVTUI2P;
60628 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60629 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60630 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60631 }
60632 return SDValue();
60633 }
60634
60635 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60636 return SDValue();
60637
60638 // Widen to at least 4 input elements.
60639 if (NumElts < 4)
60640 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60641 DAG.getConstantFP(0.0, dl, SrcVT));
60642
60643 // Destination is v8i16 with at least 8 elements.
60644 EVT CvtVT =
60645 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60646 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60647 if (IsStrict) {
60648 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60649 {N->getOperand(0), Src, Rnd});
60650 Chain = Cvt.getValue(1);
60651 } else {
60652 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60653 }
60654
60655 // Extract down to real number of elements.
60656 if (NumElts < 8) {
60657 EVT IntVT = VT.changeVectorElementTypeToInteger();
60658 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60659 DAG.getVectorIdxConstant(0, dl));
60660 }
60661
60662 Cvt = DAG.getBitcast(VT, Cvt);
60663
60664 if (IsStrict)
60665 return DAG.getMergeValues({Cvt, Chain}, dl);
60666
60667 return Cvt;
60668}
60669
60670 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60671 SDValue Src = N->getOperand(0);
60672
60673 // Turn MOVDQ2Q+simple_load into an mmx load.
60674 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60675 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60676
60677 if (LN->isSimple()) {
60678 SDValue NewLd =
60679 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60680 LN->getPointerInfo(), LN->getBaseAlign(),
60681 LN->getMemOperand()->getFlags());
60682 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60683 return NewLd;
60684 }
60685 }
60686
60687 return SDValue();
60688}
60689
60690 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60691 TargetLowering::DAGCombinerInfo &DCI) {
60692 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60694 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60695 return SDValue(N, 0);
60696
60697 return SDValue();
60698}
60699
60700// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60701// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60702// use x86mmx instead.
60703 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60704 SDLoc dl(N);
60705
60706 bool MadeChange = false, CastReturnVal = false;
60707 SmallVector<SDValue> Args;
60708 for (const SDValue &Arg : N->op_values()) {
60709 if (Arg.getValueType() == MVT::v1i64) {
60710 MadeChange = true;
60711 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60712 } else
60713 Args.push_back(Arg);
60714 }
60715 SDVTList VTs = N->getVTList();
60716 SDVTList NewVTs = VTs;
60717 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60718 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60719 NewVTArr[0] = MVT::x86mmx;
60720 NewVTs = DAG.getVTList(NewVTArr);
60721 MadeChange = true;
60722 CastReturnVal = true;
60723 }
60724
60725 if (MadeChange) {
60726 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60727 if (CastReturnVal) {
60728 SmallVector<SDValue> Returns;
60729 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60730 Returns.push_back(Result.getValue(i));
60731 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60732 return DAG.getMergeValues(Returns, dl);
60733 }
60734 return Result;
60735 }
60736 return SDValue();
60737}
60738 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60739 TargetLowering::DAGCombinerInfo &DCI) {
60740 if (!DCI.isBeforeLegalize())
60741 return SDValue();
60742
60743 unsigned IntNo = N->getConstantOperandVal(0);
60744 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60745
60746 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60747 return FixupMMXIntrinsicTypes(N, DAG);
60748
60749 return SDValue();
60750}
60751
60752 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60753 TargetLowering::DAGCombinerInfo &DCI) {
60754 if (!DCI.isBeforeLegalize())
60755 return SDValue();
60756
60757 unsigned IntNo = N->getConstantOperandVal(1);
60758 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60759
60760 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60761 return FixupMMXIntrinsicTypes(N, DAG);
60762
60763 return SDValue();
60764}
60765
60766 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60767 TargetLowering::DAGCombinerInfo &DCI) {
60768 if (!DCI.isBeforeLegalize())
60769 return SDValue();
60770
60771 unsigned IntNo = N->getConstantOperandVal(1);
60772 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60773
60774 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60775 return FixupMMXIntrinsicTypes(N, DAG);
60776
60777 return SDValue();
60778}
60779
60780 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60781 DAGCombinerInfo &DCI) const {
60782 SelectionDAG &DAG = DCI.DAG;
60783 switch (N->getOpcode()) {
60784 // clang-format off
60785 default: break;
60786 case ISD::SCALAR_TO_VECTOR:
60787 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60788 case ISD::EXTRACT_VECTOR_ELT:
60789 case X86ISD::PEXTRW:
60790 case X86ISD::PEXTRB:
60791 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60792 case ISD::CONCAT_VECTORS:
60793 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60794 case ISD::INSERT_SUBVECTOR:
60795 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60796 case ISD::EXTRACT_SUBVECTOR:
60797 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60798 case ISD::VSELECT:
60799 case ISD::SELECT:
60800 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60801 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60802 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60803 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60804 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60805 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60806 case X86ISD::ADD:
60807 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60808 case X86ISD::CLOAD:
60809 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60810 case X86ISD::SBB: return combineSBB(N, DAG);
60811 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60812 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60813 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60814 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60815 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60816 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60817 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60818 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60819 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60820 case ISD::AVGCEILS:
60821 case ISD::AVGCEILU:
60822 case ISD::AVGFLOORS:
60823 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60824 case X86ISD::BEXTR:
60825 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60826 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60827 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60828 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60829 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60830 case X86ISD::VEXTRACT_STORE:
60831 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60832 case ISD::SINT_TO_FP:
60833 case ISD::STRICT_SINT_TO_FP:
60834 return combineSIntToFP(N, DAG, DCI, Subtarget);
60835 case ISD::UINT_TO_FP:
60836 case ISD::STRICT_UINT_TO_FP:
60837 return combineUIntToFP(N, DAG, Subtarget);
60838 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60839 case ISD::LRINT:
60840 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60841 case ISD::FADD:
60842 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60843 case X86ISD::VFCMULC:
60844 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60845 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60846 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60847 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60848 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60849 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60850 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60851 case X86ISD::FXOR:
60852 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60853 case X86ISD::FMIN:
60854 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60855 case ISD::FMINNUM:
60856 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60857 case X86ISD::CVTSI2P:
60858 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60859 case X86ISD::CVTP2SI:
60860 case X86ISD::CVTP2UI:
60861 case X86ISD::STRICT_CVTTP2SI:
60862 case X86ISD::CVTTP2SI:
60863 case X86ISD::STRICT_CVTTP2UI:
60864 case X86ISD::CVTTP2UI:
60865 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60866 case X86ISD::STRICT_CVTPH2PS:
60867 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60868 case X86ISD::BT: return combineBT(N, DAG, DCI);
60869 case ISD::ANY_EXTEND:
60870 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60871 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60872 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60873 case ISD::ANY_EXTEND_VECTOR_INREG:
60874 case ISD::SIGN_EXTEND_VECTOR_INREG:
60875 case ISD::ZERO_EXTEND_VECTOR_INREG:
60876 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60877 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60878 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60879 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60880 case X86ISD::PACKSS:
60881 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60882 case X86ISD::HADD:
60883 case X86ISD::HSUB:
60884 case X86ISD::FHADD:
60885 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60886 case X86ISD::VSHL:
60887 case X86ISD::VSRA:
60888 case X86ISD::VSRL:
60889 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60890 case X86ISD::VSHLI:
60891 case X86ISD::VSRAI:
60892 case X86ISD::VSRLI:
60893 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60894 case ISD::INSERT_VECTOR_ELT:
60895 case X86ISD::PINSRB:
60896 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60897 case X86ISD::SHUFP: // Handle all target specific shuffles
60898 case X86ISD::INSERTPS:
60899 case X86ISD::EXTRQI:
60900 case X86ISD::INSERTQI:
60901 case X86ISD::VALIGN:
60902 case X86ISD::PALIGNR:
60903 case X86ISD::VSHLDQ:
60904 case X86ISD::VSRLDQ:
60905 case X86ISD::BLENDI:
60906 case X86ISD::UNPCKH:
60907 case X86ISD::UNPCKL:
60908 case X86ISD::MOVHLPS:
60909 case X86ISD::MOVLHPS:
60910 case X86ISD::PSHUFB:
60911 case X86ISD::PSHUFD:
60912 case X86ISD::PSHUFHW:
60913 case X86ISD::PSHUFLW:
60914 case X86ISD::MOVSHDUP:
60915 case X86ISD::MOVSLDUP:
60916 case X86ISD::MOVDDUP:
60917 case X86ISD::MOVSS:
60918 case X86ISD::MOVSD:
60919 case X86ISD::MOVSH:
60920 case X86ISD::VBROADCAST:
60921 case X86ISD::VPPERM:
60922 case X86ISD::VPERMI:
60923 case X86ISD::VPERMV:
60924 case X86ISD::VPERMV3:
60925 case X86ISD::VPERMIL2:
60926 case X86ISD::VPERMILPI:
60927 case X86ISD::VPERMILPV:
60928 case X86ISD::VPERM2X128:
60929 case X86ISD::SHUF128:
60930 case X86ISD::VZEXT_MOVL:
60931 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60932 case X86ISD::FMADD_RND:
60933 case X86ISD::FMSUB:
60934 case X86ISD::STRICT_FMSUB:
60935 case X86ISD::FMSUB_RND:
60936 case X86ISD::FNMADD:
60937 case X86ISD::STRICT_FNMADD:
60938 case X86ISD::FNMADD_RND:
60939 case X86ISD::FNMSUB:
60940 case X86ISD::STRICT_FNMSUB:
60941 case X86ISD::FNMSUB_RND:
60942 case ISD::FMA:
60943 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60944 case X86ISD::FMADDSUB_RND:
60945 case X86ISD::FMSUBADD_RND:
60946 case X86ISD::FMADDSUB:
60947 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60948 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60949 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60950 case X86ISD::MGATHER:
60951 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60952 case ISD::MGATHER:
60953 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60954 case X86ISD::PCMPEQ:
60955 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60956 case X86ISD::PMULDQ:
60957 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60958 case X86ISD::VPMADDUBSW:
60959 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60960 case X86ISD::VPMADD52L:
60961 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60962 case X86ISD::KSHIFTL:
60963 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60964 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60965 case ISD::STRICT_FP_EXTEND:
60966 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60967 case ISD::STRICT_FP_ROUND:
60968 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60969 case X86ISD::VBROADCAST_LOAD:
60970 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60971 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60972 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60973 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60974 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60975 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60976 case ISD::FP_TO_SINT_SAT:
60977 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60978 // clang-format on
60979 }
60980
60981 return SDValue();
60982}
60983
60985 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60986}
60987
60988// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60989 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60990 EVT ExtVT) const {
60991 return Subtarget.hasAVX512() || !VT.isVector();
60992}
60993
60994 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60995 if (!isTypeLegal(VT))
60996 return false;
60997
60998 // There are no vXi8 shifts.
60999 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
61000 return false;
61001
61002 // TODO: Almost no 8-bit ops are desirable because they have no actual
61003 // size/speed advantages vs. 32-bit ops, but they do have a major
61004 // potential disadvantage by causing partial register stalls.
61005 //
61006 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
61007 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
61008 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
61009 // check for a constant operand to the multiply.
61010 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
61011 return false;
61012
61013 // i16 instruction encodings are longer and some i16 instructions are slow,
61014 // so those are not desirable.
61015 if (VT == MVT::i16) {
61016 switch (Opc) {
61017 default:
61018 break;
61019 case ISD::LOAD:
61020 case ISD::SIGN_EXTEND:
61021 case ISD::ZERO_EXTEND:
61022 case ISD::ANY_EXTEND:
61023 case ISD::MUL:
61024 return false;
61025 case ISD::SHL:
61026 case ISD::SRA:
61027 case ISD::SRL:
61028 case ISD::SUB:
61029 case ISD::ADD:
61030 case ISD::AND:
61031 case ISD::OR:
61032 case ISD::XOR:
61033 // NDD instructions never have the "partial register write" issue because
61034 // the destination register's upper bits [63:OSIZE] are zeroed even when
61035 // OSIZE=8/16.
61036 return Subtarget.hasNDD();
61037 }
61038 }
61039
61040 // Any legal type not explicitly accounted for above here is desirable.
61041 return true;
61042}
61043
61045 SDValue Value, SDValue Addr,
61046 int JTI,
61047 SelectionDAG &DAG) const {
61048 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61049 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61050 if (IsCFProtectionSupported) {
61051 // When control-flow branch protection is enabled, we need to add the
61052 // notrack prefix to the indirect branch.
61053 // To do that we create an NT_BRIND SDNode.
61054 // During ISel, the pattern converts it into a jmp with the NoTrack prefix.
61055 SDValue Chain = Value;
61056 // Jump table debug info is only needed if CodeView is enabled.
61058 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61059 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61060 }
61061
61062 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61063}
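// Illustrative sketch (hypothetical helper, not part of this file): a dense
// switch that typically lowers to a jump table. Assuming the TU is compiled
// with something like -fcf-protection=branch, the "cf-protection-branch"
// module flag checked above is set and the indirect jump through the table
// is emitted via NT_BRIND, i.e. as a "notrack jmp", since the jump-table
// bounds check already guards the target.
static int jt_dispatch(int x) {
  switch (x) {
  case 0: return 10;
  case 1: return 11;
  case 2: return 12;
  case 3: return 13;
  case 4: return 14;
  case 5: return 15;
  default: return -1;
  }
}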
61064
61067 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61069 EVT VT = LogicOp->getValueType(0);
61070 EVT OpVT = SETCC0->getOperand(0).getValueType();
61071 if (!VT.isInteger())
61073
61074 if (VT.isVector())
61079
61080 // Don't use `NotAnd`: even though `not` is generally shorter in code size than
61081 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
61082 // where `NotAnd` applies, `AddAnd` does as well.
61083 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61084 // if we change that to `andn Y, X`, it may be worth preferring `NotAnd` here.
61086}
61087
61089 EVT VT = Op.getValueType();
61090 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61091 isa<ConstantSDNode>(Op.getOperand(1));
61092
61093 // i16 is legal, but undesirable since i16 instruction encodings are longer
61094 // and some i16 instructions are slow.
61095 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61096 // using LEA and/or other ALU ops.
61097 if (VT != MVT::i16 && !Is8BitMulByConstant)
61098 return false;
61099
61100 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61101 if (!Op.hasOneUse())
61102 return false;
61103 SDNode *User = *Op->user_begin();
61105 return false;
61106 auto *Ld = cast<LoadSDNode>(Load);
61107 auto *St = cast<StoreSDNode>(User);
61108 return Ld->getBasePtr() == St->getBasePtr();
61109 };
61110
61111 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61112 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61113 return false;
61114 if (!Op.hasOneUse())
61115 return false;
61116 SDNode *User = *Op->user_begin();
61117 if (User->getOpcode() != ISD::ATOMIC_STORE)
61118 return false;
61119 auto *Ld = cast<AtomicSDNode>(Load);
61120 auto *St = cast<AtomicSDNode>(User);
61121 return Ld->getBasePtr() == St->getBasePtr();
61122 };
61123
61124 auto IsFoldableZext = [](SDValue Op) {
61125 if (!Op.hasOneUse())
61126 return false;
61127 SDNode *User = *Op->user_begin();
61128 EVT VT = User->getValueType(0);
61129 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61130 (VT == MVT::i32 || VT == MVT::i64));
61131 };
61132
61133 bool Commute = false;
61134 switch (Op.getOpcode()) {
61135 default: return false;
61136 case ISD::SIGN_EXTEND:
61137 case ISD::ZERO_EXTEND:
61138 case ISD::ANY_EXTEND:
61139 break;
61140 case ISD::SHL:
61141 case ISD::SRA:
61142 case ISD::SRL: {
61143 SDValue N0 = Op.getOperand(0);
61144 // Look out for (store (shl (load), x)).
61145 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61146 return false;
61147 break;
61148 }
61149 case ISD::MUL:
61150 // When ZU is enabled, we prefer to not promote for MUL by a constant
61151 // when there is an opportunity to fold a zext with imulzu.
61152 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61153 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61154 isa<ConstantSDNode>(Op.getOperand(1))))
61155 return false;
61156 [[fallthrough]];
61157 case ISD::ADD:
61158 case ISD::AND:
61159 case ISD::OR:
61160 case ISD::XOR:
61161 Commute = true;
61162 [[fallthrough]];
61163 case ISD::SUB: {
61164 SDValue N0 = Op.getOperand(0);
61165 SDValue N1 = Op.getOperand(1);
61166 // Avoid disabling potential load folding opportunities.
61167 if (X86::mayFoldLoad(N1, Subtarget) &&
61168 (!Commute || !isa<ConstantSDNode>(N0) ||
61169 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61170 return false;
61171 if (X86::mayFoldLoad(N0, Subtarget) &&
61172 ((Commute && !isa<ConstantSDNode>(N1)) ||
61173 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61174 return false;
61175 if (IsFoldableAtomicRMW(N0, Op) ||
61176 (Commute && IsFoldableAtomicRMW(N1, Op)))
61177 return false;
61178 }
61179 }
61180
61181 PVT = MVT::i32;
61182 return true;
61183}
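// Illustrative sketch (hypothetical helper, not part of this file): a 16-bit
// read-modify-write shift of the shape IsFoldableRMW looks for. Declining to
// promote keeps it eligible for a single memory-operand shift (roughly
// "shlw (%rdi)"), whereas promotion to i32 would force a separate load,
// 32-bit shift and truncating store.
static void shl16_in_place(unsigned short *p) {
  *p = static_cast<unsigned short>(*p << 1);
}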
61184
61185//===----------------------------------------------------------------------===//
61186// X86 Inline Assembly Support
61187//===----------------------------------------------------------------------===//
61188
61191 .Case("{@cca}", X86::COND_A)
61192 .Case("{@ccae}", X86::COND_AE)
61193 .Case("{@ccb}", X86::COND_B)
61194 .Case("{@ccbe}", X86::COND_BE)
61195 .Case("{@ccc}", X86::COND_B)
61196 .Case("{@cce}", X86::COND_E)
61197 .Case("{@ccz}", X86::COND_E)
61198 .Case("{@ccg}", X86::COND_G)
61199 .Case("{@ccge}", X86::COND_GE)
61200 .Case("{@ccl}", X86::COND_L)
61201 .Case("{@ccle}", X86::COND_LE)
61202 .Case("{@ccna}", X86::COND_BE)
61203 .Case("{@ccnae}", X86::COND_B)
61204 .Case("{@ccnb}", X86::COND_AE)
61205 .Case("{@ccnbe}", X86::COND_A)
61206 .Case("{@ccnc}", X86::COND_AE)
61207 .Case("{@ccne}", X86::COND_NE)
61208 .Case("{@ccnz}", X86::COND_NE)
61209 .Case("{@ccng}", X86::COND_LE)
61210 .Case("{@ccnge}", X86::COND_L)
61211 .Case("{@ccnl}", X86::COND_GE)
61212 .Case("{@ccnle}", X86::COND_G)
61213 .Case("{@ccno}", X86::COND_NO)
61214 .Case("{@ccnp}", X86::COND_NP)
61215 .Case("{@ccns}", X86::COND_NS)
61216 .Case("{@cco}", X86::COND_O)
61217 .Case("{@ccp}", X86::COND_P)
61218 .Case("{@ccs}", X86::COND_S)
61220 return Cond;
61221}
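// Illustrative sketch (hypothetical helper, GNU inline asm assumed): the
// "{@cc<cond>}" strings above back GCC/Clang flag-output constraints such as
// "=@ccc", which returns the carry flag (X86::COND_B in the table) as a
// boolean output of the asm statement.
static bool add_carries(unsigned a, unsigned b, unsigned &sum) {
  bool carry;
  asm("addl %[b], %[a]"
      : [a] "+r"(a), "=@ccc"(carry)
      : [b] "r"(b));
  sum = a;
  return carry;
}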
61222
61223/// Given a constraint letter, return the type of constraint for this target.
61226 if (Constraint.size() == 1) {
61227 switch (Constraint[0]) {
61228 case 'R':
61229 case 'q':
61230 case 'Q':
61231 case 'f':
61232 case 't':
61233 case 'u':
61234 case 'y':
61235 case 'x':
61236 case 'v':
61237 case 'l':
61238 case 'k': // AVX512 masking registers.
61239 return C_RegisterClass;
61240 case 'a':
61241 case 'b':
61242 case 'c':
61243 case 'd':
61244 case 'S':
61245 case 'D':
61246 case 'A':
61247 return C_Register;
61248 case 'I':
61249 case 'J':
61250 case 'K':
61251 case 'N':
61252 case 'G':
61253 case 'L':
61254 case 'M':
61255 return C_Immediate;
61256 case 'C':
61257 case 'e':
61258 case 'Z':
61259 return C_Other;
61260 default:
61261 break;
61262 }
61263 }
61264 else if (Constraint.size() == 2) {
61265 switch (Constraint[0]) {
61266 default:
61267 break;
61268 case 'W':
61269 if (Constraint[1] != 's')
61270 break;
61271 return C_Other;
61272 case 'Y':
61273 switch (Constraint[1]) {
61274 default:
61275 break;
61276 case 'z':
61277 return C_Register;
61278 case 'i':
61279 case 'm':
61280 case 'k':
61281 case 't':
61282 case '2':
61283 return C_RegisterClass;
61284 }
61285 break;
61286 case 'j':
61287 switch (Constraint[1]) {
61288 default:
61289 break;
61290 case 'r':
61291 case 'R':
61292 return C_RegisterClass;
61293 }
61294 }
61295 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61296 return C_Other;
61297 return TargetLowering::getConstraintType(Constraint);
61298}
61299
61300/// Examine constraint type and operand type and determine a weight value.
61301/// This object must already have been set up with the operand type
61302/// and the current alternative constraint selected.
61305 AsmOperandInfo &Info, const char *Constraint) const {
61307 Value *CallOperandVal = Info.CallOperandVal;
61308 // If we don't have a value, we can't do a match,
61309 // but allow it at the lowest weight.
61310 if (!CallOperandVal)
61311 return CW_Default;
61312 Type *Ty = CallOperandVal->getType();
61313 // Look at the constraint type.
61314 switch (*Constraint) {
61315 default:
61317 [[fallthrough]];
61318 case 'R':
61319 case 'q':
61320 case 'Q':
61321 case 'a':
61322 case 'b':
61323 case 'c':
61324 case 'd':
61325 case 'S':
61326 case 'D':
61327 case 'A':
61328 if (CallOperandVal->getType()->isIntegerTy())
61329 Wt = CW_SpecificReg;
61330 break;
61331 case 'f':
61332 case 't':
61333 case 'u':
61334 if (Ty->isFloatingPointTy())
61335 Wt = CW_SpecificReg;
61336 break;
61337 case 'y':
61338 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61339 Wt = CW_SpecificReg;
61340 break;
61341 case 'Y':
61342 if (StringRef(Constraint).size() != 2)
61343 break;
61344 switch (Constraint[1]) {
61345 default:
61346 return CW_Invalid;
61347 // XMM0
61348 case 'z':
61349 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61350 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61351 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61352 return CW_SpecificReg;
61353 return CW_Invalid;
61354 // Conditional OpMask regs (AVX512)
61355 case 'k':
61356 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61357 return CW_Register;
61358 return CW_Invalid;
61359 // Any MMX reg
61360 case 'm':
61361 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61362 return CW_SpecificReg;
61363 return CW_Invalid;
61364 // Any SSE reg when ISA >= SSE2, same as 'x'
61365 case 'i':
61366 case 't':
61367 case '2':
61368 if (!Subtarget.hasSSE2())
61369 return CW_Invalid;
61370 break;
61371 }
61372 break;
61373 case 'j':
61374 if (StringRef(Constraint).size() != 2)
61375 break;
61376 switch (Constraint[1]) {
61377 default:
61378 return CW_Invalid;
61379 case 'r':
61380 case 'R':
61381 if (CallOperandVal->getType()->isIntegerTy())
61382 Wt = CW_SpecificReg;
61383 break;
61384 }
61385 break;
61386 case 'v':
61387 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61388 Wt = CW_Register;
61389 [[fallthrough]];
61390 case 'x':
61391 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61392 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61393 Wt = CW_Register;
61394 break;
61395 case 'k':
61396 // Enable conditional vector operations using %k<#> registers.
61397 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61398 Wt = CW_Register;
61399 break;
61400 case 'I':
61401 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61402 if (C->getZExtValue() <= 31)
61403 Wt = CW_Constant;
61404 break;
61405 case 'J':
61406 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61407 if (C->getZExtValue() <= 63)
61408 Wt = CW_Constant;
61409 break;
61410 case 'K':
61411 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61412 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61413 Wt = CW_Constant;
61414 break;
61415 case 'L':
61416 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61417 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61418 Wt = CW_Constant;
61419 break;
61420 case 'M':
61421 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61422 if (C->getZExtValue() <= 3)
61423 Wt = CW_Constant;
61424 break;
61425 case 'N':
61426 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61427 if (C->getZExtValue() <= 0xff)
61428 Wt = CW_Constant;
61429 break;
61430 case 'G':
61431 case 'C':
61432 if (isa<ConstantFP>(CallOperandVal))
61433 Wt = CW_Constant;
61434 break;
61435 case 'e':
61436 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61437 if ((C->getSExtValue() >= -0x80000000LL) &&
61438 (C->getSExtValue() <= 0x7fffffffLL))
61439 Wt = CW_Constant;
61440 break;
61441 case 'Z':
61442 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61443 if (C->getZExtValue() <= 0xffffffff)
61444 Wt = CW_Constant;
61445 break;
61446 }
61447 return Wt;
61448}
61449
61450/// Try to replace an X constraint, which matches anything, with another that
61451/// has more specific requirements based on the type of the corresponding
61452/// operand.
61454LowerXConstraint(EVT ConstraintVT) const {
61455 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61456 // 'f' like normal targets.
61457 if (ConstraintVT.isFloatingPoint()) {
61458 if (Subtarget.hasSSE1())
61459 return "x";
61460 }
61461
61462 return TargetLowering::LowerXConstraint(ConstraintVT);
61463}
61464
61465// Lower @cc targets via setcc.
61467 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61468 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61469 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61470 if (Cond == X86::COND_INVALID)
61471 return SDValue();
61472 // Check that return type is valid.
61473 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61474 OpInfo.ConstraintVT.getSizeInBits() < 8)
61475 report_fatal_error("Glue output operand is of invalid type");
61476
61477 // Get EFLAGS register. Only update chain when copyfrom is glued.
61478 if (Glue.getNode()) {
61479 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61480 Chain = Glue.getValue(1);
61481 } else
61482 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61483 // Extract CC code.
61484 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61485 // Extend to 32-bits
61486 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61487
61488 return Result;
61489}
61490
61491/// Lower the specified operand into the Ops vector.
61492/// If it is invalid, don't add anything to Ops.
61494 StringRef Constraint,
61495 std::vector<SDValue> &Ops,
61496 SelectionDAG &DAG) const {
61497 SDValue Result;
61498 char ConstraintLetter = Constraint[0];
61499 switch (ConstraintLetter) {
61500 default: break;
61501 case 'I':
61502 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61503 if (C->getZExtValue() <= 31) {
61504 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61505 Op.getValueType());
61506 break;
61507 }
61508 }
61509 return;
61510 case 'J':
61511 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61512 if (C->getZExtValue() <= 63) {
61513 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61514 Op.getValueType());
61515 break;
61516 }
61517 }
61518 return;
61519 case 'K':
61520 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61521 if (isInt<8>(C->getSExtValue())) {
61522 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61523 Op.getValueType());
61524 break;
61525 }
61526 }
61527 return;
61528 case 'L':
61529 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61530 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61531 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61532 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61533 Op.getValueType());
61534 break;
61535 }
61536 }
61537 return;
61538 case 'M':
61539 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61540 if (C->getZExtValue() <= 3) {
61541 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61542 Op.getValueType());
61543 break;
61544 }
61545 }
61546 return;
61547 case 'N':
61548 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61549 if (C->getZExtValue() <= 255) {
61550 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61551 Op.getValueType());
61552 break;
61553 }
61554 }
61555 return;
61556 case 'O':
61557 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61558 if (C->getZExtValue() <= 127) {
61559 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61560 Op.getValueType());
61561 break;
61562 }
61563 }
61564 return;
61565 case 'e': {
61566 // 32-bit signed value
61567 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61569 C->getSExtValue())) {
61570 // Widen to 64 bits here to get it sign extended.
61571 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61572 break;
61573 }
61574 // FIXME gcc accepts some relocatable values here too, but only in certain
61575 // memory models; it's complicated.
61576 }
61577 return;
61578 }
61579 case 'W': {
61580 assert(Constraint[1] == 's');
61581 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61582 // offset.
61583 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61584 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61585 BA->getValueType(0)));
61586 } else {
61587 int64_t Offset = 0;
61588 if (Op->getOpcode() == ISD::ADD &&
61589 isa<ConstantSDNode>(Op->getOperand(1))) {
61590 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61591 Op = Op->getOperand(0);
61592 }
61593 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61594 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61595 GA->getValueType(0), Offset));
61596 }
61597 return;
61598 }
61599 case 'Z': {
61600 // 32-bit unsigned value
61601 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61603 C->getZExtValue())) {
61604 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61605 Op.getValueType());
61606 break;
61607 }
61608 }
61609 // FIXME gcc accepts some relocatable values here too, but only in certain
61610 // memory models; it's complicated.
61611 return;
61612 }
61613 case 'i': {
61614 // Literal immediates are always ok.
61615 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61616 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61617 BooleanContent BCont = getBooleanContents(MVT::i64);
61618 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61620 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61621 : CST->getSExtValue();
61622 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61623 break;
61624 }
61625
61626 // In any sort of PIC mode addresses need to be computed at runtime by
61627 // adding in a register or some sort of table lookup. These can't
61628 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61629 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61631 return;
61632
61633 // If we are in non-pic codegen mode, we allow the address of a global (with
61634 // an optional displacement) to be used with 'i'.
61635 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61636 // If we require an extra load to get this address, as in PIC mode, we
61637 // can't accept it.
61639 Subtarget.classifyGlobalReference(GA->getGlobal())))
61640 return;
61641 break;
61642 }
61643 }
61644
61645 if (Result.getNode()) {
61646 Ops.push_back(Result);
61647 return;
61648 }
61649 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61650}
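// Illustrative sketch (hypothetical helpers, GNU inline asm assumed): two of
// the immediate constraints handled above. 'I' accepts a constant in [0,31]
// (e.g. a rotate count) and 'N' accepts an unsigned 8-bit constant (e.g. an
// I/O port number).
static unsigned rol5(unsigned x) {
  asm("roll %1, %0" : "+r"(x) : "I"(5));
  return x;
}
static void post_code(unsigned char v) {
  asm volatile("outb %0, %1" : : "a"(v), "N"(0x80));
}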
61651
61652/// Check if \p RC is a general purpose register class.
61653/// I.e., GR* or one of their variant.
61654static bool isGRClass(const TargetRegisterClass &RC) {
61655 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61656 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61657 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61658 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61659 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61660}
61661
61662/// Check if \p RC is a vector register class.
61663/// I.e., FR* / VR* or one of their variant.
61664static bool isFRClass(const TargetRegisterClass &RC) {
61665 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61666 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61667 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61668 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61669 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61670 RC.hasSuperClassEq(&X86::VR512RegClass);
61671}
61672
61673/// Check if \p RC is a mask register class.
61674/// I.e., VK* or one of their variant.
61675static bool isVKClass(const TargetRegisterClass &RC) {
61676 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61677 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61678 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61679 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61680 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61681 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61682 RC.hasSuperClassEq(&X86::VK64RegClass);
61683}
61684
61685static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61686 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61687}
61688
61689std::pair<unsigned, const TargetRegisterClass *>
61691 StringRef Constraint,
61692 MVT VT) const {
61693 // First, see if this is a constraint that directly corresponds to an LLVM
61694 // register class.
61695 if (Constraint.size() == 1) {
61696 // GCC Constraint Letters
61697 switch (Constraint[0]) {
61698 default: break;
61699 // 'A' means [ER]AX + [ER]DX.
61700 case 'A':
61701 if (Subtarget.is64Bit())
61702 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61703 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61704 "Expecting 64, 32 or 16 bit subtarget");
61705 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61706
61707 // TODO: Slight differences here in allocation order and leaving
61708 // RIP in the class. Do they matter any more here than they do
61709 // in the normal allocation?
61710 case 'k':
61711 if (Subtarget.hasAVX512()) {
61712 if (VT == MVT::v1i1 || VT == MVT::i1)
61713 return std::make_pair(0U, &X86::VK1RegClass);
61714 if (VT == MVT::v8i1 || VT == MVT::i8)
61715 return std::make_pair(0U, &X86::VK8RegClass);
61716 if (VT == MVT::v16i1 || VT == MVT::i16)
61717 return std::make_pair(0U, &X86::VK16RegClass);
61718 }
61719 if (Subtarget.hasBWI()) {
61720 if (VT == MVT::v32i1 || VT == MVT::i32)
61721 return std::make_pair(0U, &X86::VK32RegClass);
61722 if (VT == MVT::v64i1 || VT == MVT::i64)
61723 return std::make_pair(0U, &X86::VK64RegClass);
61724 }
61725 break;
61726 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61727 if (Subtarget.is64Bit()) {
61728 if (VT == MVT::i8 || VT == MVT::i1)
61729 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61730 ? &X86::GR8RegClass
61731 : &X86::GR8_NOREX2RegClass);
61732 if (VT == MVT::i16)
61733 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61734 ? &X86::GR16RegClass
61735 : &X86::GR16_NOREX2RegClass);
61736 if (VT == MVT::i32 || VT == MVT::f32)
61737 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61738 ? &X86::GR32RegClass
61739 : &X86::GR32_NOREX2RegClass);
61740 if (VT != MVT::f80 && !VT.isVector())
61741 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61742 ? &X86::GR64RegClass
61743 : &X86::GR64_NOREX2RegClass);
61744 break;
61745 }
61746 [[fallthrough]];
61747 // 32-bit fallthrough
61748 case 'Q': // Q_REGS
61749 if (VT == MVT::i8 || VT == MVT::i1)
61750 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61751 if (VT == MVT::i16)
61752 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61753 if (VT == MVT::i32 || VT == MVT::f32 ||
61754 (!VT.isVector() && !Subtarget.is64Bit()))
61755 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61756 if (VT != MVT::f80 && !VT.isVector())
61757 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61758 break;
61759 case 'r': // GENERAL_REGS
61760 case 'l': // INDEX_REGS
61761 if (VT == MVT::i8 || VT == MVT::i1)
61762 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61763 ? &X86::GR8RegClass
61764 : &X86::GR8_NOREX2RegClass);
61765 if (VT == MVT::i16)
61766 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61767 ? &X86::GR16RegClass
61768 : &X86::GR16_NOREX2RegClass);
61769 if (VT == MVT::i32 || VT == MVT::f32 ||
61770 (!VT.isVector() && !Subtarget.is64Bit()))
61771 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61772 ? &X86::GR32RegClass
61773 : &X86::GR32_NOREX2RegClass);
61774 if (VT != MVT::f80 && !VT.isVector())
61775 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61776 ? &X86::GR64RegClass
61777 : &X86::GR64_NOREX2RegClass);
61778 break;
61779 case 'R': // LEGACY_REGS
61780 if (VT == MVT::i8 || VT == MVT::i1)
61781 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61782 if (VT == MVT::i16)
61783 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61784 if (VT == MVT::i32 || VT == MVT::f32 ||
61785 (!VT.isVector() && !Subtarget.is64Bit()))
61786 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61787 if (VT != MVT::f80 && !VT.isVector())
61788 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61789 break;
61790 case 'f': // FP Stack registers.
61791 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61792 // value to the correct fpstack register class.
61793 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61794 return std::make_pair(0U, &X86::RFP32RegClass);
61795 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61796 return std::make_pair(0U, &X86::RFP64RegClass);
61797 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61798 return std::make_pair(0U, &X86::RFP80RegClass);
61799 break;
61800 case 'y': // MMX_REGS if MMX allowed.
61801 if (!Subtarget.hasMMX()) break;
61802 return std::make_pair(0U, &X86::VR64RegClass);
61803 case 'v':
61804 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61805 if (!Subtarget.hasSSE1()) break;
61806 bool VConstraint = (Constraint[0] == 'v');
61807
61808 switch (VT.SimpleTy) {
61809 default: break;
61810 // Scalar SSE types.
61811 case MVT::f16:
61812 if (VConstraint && Subtarget.hasFP16())
61813 return std::make_pair(0U, &X86::FR16XRegClass);
61814 break;
61815 case MVT::f32:
61816 case MVT::i32:
61817 if (VConstraint && Subtarget.hasVLX())
61818 return std::make_pair(0U, &X86::FR32XRegClass);
61819 return std::make_pair(0U, &X86::FR32RegClass);
61820 case MVT::f64:
61821 case MVT::i64:
61822 if (VConstraint && Subtarget.hasVLX())
61823 return std::make_pair(0U, &X86::FR64XRegClass);
61824 return std::make_pair(0U, &X86::FR64RegClass);
61825 case MVT::i128:
61826 if (Subtarget.is64Bit()) {
61827 if (VConstraint && Subtarget.hasVLX())
61828 return std::make_pair(0U, &X86::VR128XRegClass);
61829 return std::make_pair(0U, &X86::VR128RegClass);
61830 }
61831 break;
61832 // Vector types and fp128.
61833 case MVT::v8f16:
61834 if (!Subtarget.hasFP16())
61835 break;
61836 if (VConstraint)
61837 return std::make_pair(0U, &X86::VR128XRegClass);
61838 return std::make_pair(0U, &X86::VR128RegClass);
61839 case MVT::v8bf16:
61840 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61841 break;
61842 if (VConstraint)
61843 return std::make_pair(0U, &X86::VR128XRegClass);
61844 return std::make_pair(0U, &X86::VR128RegClass);
61845 case MVT::f128:
61846 if (!Subtarget.is64Bit())
61847 break;
61848 [[fallthrough]];
61849 case MVT::v16i8:
61850 case MVT::v8i16:
61851 case MVT::v4i32:
61852 case MVT::v2i64:
61853 case MVT::v4f32:
61854 case MVT::v2f64:
61855 if (VConstraint && Subtarget.hasVLX())
61856 return std::make_pair(0U, &X86::VR128XRegClass);
61857 return std::make_pair(0U, &X86::VR128RegClass);
61858 // AVX types.
61859 case MVT::v16f16:
61860 if (!Subtarget.hasFP16())
61861 break;
61862 if (VConstraint)
61863 return std::make_pair(0U, &X86::VR256XRegClass);
61864 return std::make_pair(0U, &X86::VR256RegClass);
61865 case MVT::v16bf16:
61866 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61867 break;
61868 if (VConstraint)
61869 return std::make_pair(0U, &X86::VR256XRegClass);
61870 return std::make_pair(0U, &X86::VR256RegClass);
61871 case MVT::v32i8:
61872 case MVT::v16i16:
61873 case MVT::v8i32:
61874 case MVT::v4i64:
61875 case MVT::v8f32:
61876 case MVT::v4f64:
61877 if (VConstraint && Subtarget.hasVLX())
61878 return std::make_pair(0U, &X86::VR256XRegClass);
61879 if (Subtarget.hasAVX())
61880 return std::make_pair(0U, &X86::VR256RegClass);
61881 break;
61882 case MVT::v32f16:
61883 if (!Subtarget.hasFP16())
61884 break;
61885 if (VConstraint)
61886 return std::make_pair(0U, &X86::VR512RegClass);
61887 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61888 case MVT::v32bf16:
61889 if (!Subtarget.hasBF16())
61890 break;
61891 if (VConstraint)
61892 return std::make_pair(0U, &X86::VR512RegClass);
61893 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61894 case MVT::v64i8:
61895 case MVT::v32i16:
61896 case MVT::v8f64:
61897 case MVT::v16f32:
61898 case MVT::v16i32:
61899 case MVT::v8i64:
61900 if (!Subtarget.hasAVX512()) break;
61901 if (VConstraint)
61902 return std::make_pair(0U, &X86::VR512RegClass);
61903 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61904 }
61905 break;
61906 }
61907 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61908 switch (Constraint[1]) {
61909 default:
61910 break;
61911 case 'i':
61912 case 't':
61913 case '2':
61914 return getRegForInlineAsmConstraint(TRI, "x", VT);
61915 case 'm':
61916 if (!Subtarget.hasMMX()) break;
61917 return std::make_pair(0U, &X86::VR64RegClass);
61918 case 'z':
61919 if (!Subtarget.hasSSE1()) break;
61920 switch (VT.SimpleTy) {
61921 default: break;
61922 // Scalar SSE types.
61923 case MVT::f16:
61924 if (!Subtarget.hasFP16())
61925 break;
61926 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61927 case MVT::f32:
61928 case MVT::i32:
61929 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61930 case MVT::f64:
61931 case MVT::i64:
61932 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61933 case MVT::v8f16:
61934 if (!Subtarget.hasFP16())
61935 break;
61936 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61937 case MVT::v8bf16:
61938 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61939 break;
61940 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61941 case MVT::f128:
61942 case MVT::v16i8:
61943 case MVT::v8i16:
61944 case MVT::v4i32:
61945 case MVT::v2i64:
61946 case MVT::v4f32:
61947 case MVT::v2f64:
61948 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61949 // AVX types.
61950 case MVT::v16f16:
61951 if (!Subtarget.hasFP16())
61952 break;
61953 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61954 case MVT::v16bf16:
61955 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61956 break;
61957 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61958 case MVT::v32i8:
61959 case MVT::v16i16:
61960 case MVT::v8i32:
61961 case MVT::v4i64:
61962 case MVT::v8f32:
61963 case MVT::v4f64:
61964 if (Subtarget.hasAVX())
61965 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61966 break;
61967 case MVT::v32f16:
61968 if (!Subtarget.hasFP16())
61969 break;
61970 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61971 case MVT::v32bf16:
61972 if (!Subtarget.hasBF16())
61973 break;
61974 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61975 case MVT::v64i8:
61976 case MVT::v32i16:
61977 case MVT::v8f64:
61978 case MVT::v16f32:
61979 case MVT::v16i32:
61980 case MVT::v8i64:
61981 if (Subtarget.hasAVX512())
61982 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61983 break;
61984 }
61985 break;
61986 case 'k':
61987 // This register class doesn't allocate k0 for masked vector operations.
61988 if (Subtarget.hasAVX512()) {
61989 if (VT == MVT::v1i1 || VT == MVT::i1)
61990 return std::make_pair(0U, &X86::VK1WMRegClass);
61991 if (VT == MVT::v8i1 || VT == MVT::i8)
61992 return std::make_pair(0U, &X86::VK8WMRegClass);
61993 if (VT == MVT::v16i1 || VT == MVT::i16)
61994 return std::make_pair(0U, &X86::VK16WMRegClass);
61995 }
61996 if (Subtarget.hasBWI()) {
61997 if (VT == MVT::v32i1 || VT == MVT::i32)
61998 return std::make_pair(0U, &X86::VK32WMRegClass);
61999 if (VT == MVT::v64i1 || VT == MVT::i64)
62000 return std::make_pair(0U, &X86::VK64WMRegClass);
62001 }
62002 break;
62003 }
62004 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
62005 switch (Constraint[1]) {
62006 default:
62007 break;
62008 case 'r':
62009 if (VT == MVT::i8 || VT == MVT::i1)
62010 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
62011 if (VT == MVT::i16)
62012 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
62013 if (VT == MVT::i32 || VT == MVT::f32)
62014 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
62015 if (VT != MVT::f80 && !VT.isVector())
62016 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
62017 break;
62018 case 'R':
62019 if (VT == MVT::i8 || VT == MVT::i1)
62020 return std::make_pair(0U, &X86::GR8RegClass);
62021 if (VT == MVT::i16)
62022 return std::make_pair(0U, &X86::GR16RegClass);
62023 if (VT == MVT::i32 || VT == MVT::f32)
62024 return std::make_pair(0U, &X86::GR32RegClass);
62025 if (VT != MVT::f80 && !VT.isVector())
62026 return std::make_pair(0U, &X86::GR64RegClass);
62027 break;
62028 }
62029 }
62030
62031 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
62032 return std::make_pair(0U, &X86::GR32RegClass);
62033
62034 // Use the default implementation in TargetLowering to convert the register
62035 // constraint into a member of a register class.
62036 std::pair<Register, const TargetRegisterClass*> Res;
62038
62039 // Not found as a standard register?
62040 if (!Res.second) {
62041 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
62042 // to/from f80.
62043 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
62044 // Map st(0) .. st(7) onto FP0 .. FP7.
62045 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62046 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62047 Constraint[3] == '(' &&
62048 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62049 Constraint[5] == ')' && Constraint[6] == '}') {
62050 // st(7) is not allocatable and thus not a member of RFP80. Return
62051 // singleton class in cases where we have a reference to it.
62052 if (Constraint[4] == '7')
62053 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62054 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62055 &X86::RFP80RegClass);
62056 }
62057
62058 // GCC allows "st(0)" to be called just plain "st".
62059 if (StringRef("{st}").equals_insensitive(Constraint))
62060 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62061 }
62062
62063 // flags -> EFLAGS
62064 if (StringRef("{flags}").equals_insensitive(Constraint))
62065 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62066
62067 // dirflag -> DF
62068 // Only allow for clobber.
62069 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62070 VT == MVT::Other)
62071 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62072
62073 // fpsr -> FPSW
62074 // Only allow for clobber.
62075 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62076 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62077
62078 return Res;
62079 }
62080
62081 // Make sure it isn't a register that requires 64-bit mode.
62082 if (!Subtarget.is64Bit() &&
62083 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62084 TRI->getEncodingValue(Res.first) >= 8) {
62085 // Register requires REX prefix, but we're in 32-bit mode.
62086 return std::make_pair(0, nullptr);
62087 }
62088
62089 // Make sure it isn't a register that requires AVX512.
62090 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62091 TRI->getEncodingValue(Res.first) & 0x10) {
62092 // Register requires EVEX prefix.
62093 return std::make_pair(0, nullptr);
62094 }
62095
62096 // Otherwise, check to see if this is a register class of the wrong value
62097 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62098 // turn into {ax},{dx}.
62099 // MVT::Other is used to specify clobber names.
62100 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62101 return Res; // Correct type already, nothing to do.
62102
62103 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
62104 // return "eax". This should even work for things like getting 64-bit integer
62105 // registers when given an f64 type.
62106 const TargetRegisterClass *Class = Res.second;
62107 // The generic code will match the first register class that contains the
62108 // given register. Thus, based on the ordering of the tablegened file,
62109 // the "plain" GR classes might not come first.
62110 // Therefore, use a helper method.
62111 if (isGRClass(*Class)) {
62112 unsigned Size = VT.getSizeInBits();
62113 if (Size == 1) Size = 8;
62114 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62115 return std::make_pair(0, nullptr);
62116 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62117 if (DestReg.isValid()) {
62118 bool is64Bit = Subtarget.is64Bit();
62119 const TargetRegisterClass *RC =
62120 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62121 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62122 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62123 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62124 if (Size == 64 && !is64Bit) {
62125 // Model GCC's behavior here and select a fixed pair of 32-bit
62126 // registers.
62127 switch (DestReg) {
62128 case X86::RAX:
62129 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62130 case X86::RDX:
62131 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62132 case X86::RCX:
62133 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62134 case X86::RBX:
62135 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62136 case X86::RSI:
62137 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62138 case X86::RDI:
62139 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62140 case X86::RBP:
62141 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62142 default:
62143 return std::make_pair(0, nullptr);
62144 }
62145 }
62146 if (RC && RC->contains(DestReg))
62147 return std::make_pair(DestReg, RC);
62148 return Res;
62149 }
62150 // No register found/type mismatch.
62151 return std::make_pair(0, nullptr);
62152 } else if (isFRClass(*Class)) {
62153 // Handle references to XMM physical registers that got mapped into the
62154 // wrong class. This can happen with constraints like {xmm0} where the
62155 // target independent register mapper will just pick the first match it can
62156 // find, ignoring the required type.
62157
62158 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62159 if (VT == MVT::f16)
62160 Res.second = &X86::FR16XRegClass;
62161 else if (VT == MVT::f32 || VT == MVT::i32)
62162 Res.second = &X86::FR32XRegClass;
62163 else if (VT == MVT::f64 || VT == MVT::i64)
62164 Res.second = &X86::FR64XRegClass;
62165 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62166 Res.second = &X86::VR128XRegClass;
62167 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62168 Res.second = &X86::VR256XRegClass;
62169 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62170 Res.second = &X86::VR512RegClass;
62171 else {
62172 // Type mismatch and not a clobber: Return an error;
62173 Res.first = 0;
62174 Res.second = nullptr;
62175 }
62176 } else if (isVKClass(*Class)) {
62177 if (VT == MVT::v1i1 || VT == MVT::i1)
62178 Res.second = &X86::VK1RegClass;
62179 else if (VT == MVT::v8i1 || VT == MVT::i8)
62180 Res.second = &X86::VK8RegClass;
62181 else if (VT == MVT::v16i1 || VT == MVT::i16)
62182 Res.second = &X86::VK16RegClass;
62183 else if (VT == MVT::v32i1 || VT == MVT::i32)
62184 Res.second = &X86::VK32RegClass;
62185 else if (VT == MVT::v64i1 || VT == MVT::i64)
62186 Res.second = &X86::VK64RegClass;
62187 else {
62188 // Type mismatch and not a clobber: Return an error;
62189 Res.first = 0;
62190 Res.second = nullptr;
62191 }
62192 }
62193
62194 return Res;
62195}
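// Illustrative sketch (hypothetical helper, GNU inline asm assumed): the 'A'
// constraint resolved above means the [ER]DX:[ER]AX pair; on 32-bit targets
// this is the classic way to read the 64-bit RDTSC result in one operand.
static unsigned long long rdtsc64(void) {
  unsigned long long tsc;
  asm volatile("rdtsc" : "=A"(tsc));
  return tsc;
}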
62196
62197bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62198 // Integer division on x86 is expensive. However, when aggressively optimizing
62199 // for code size, we prefer to use a div instruction, as it is usually smaller
62200 // than the alternative sequence.
62201 // The exception to this is vector division. Since x86 doesn't have vector
62202 // integer division, leaving the division as-is is a loss even in terms of
62203 // size, because it will have to be scalarized, while the alternative code
62204 // sequence can be performed in vector form.
62205 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62206 return OptSize && !VT.isVector();
62207}
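// Illustrative sketch (hypothetical helper): with MinSize (e.g. clang's
// __attribute__((minsize)) or building at -Oz), this hook reports scalar
// division as cheap, so the udiv below is expected to remain a DIV
// instruction; without MinSize it is normally expanded into a
// multiply-by-magic-constant plus shifts.
__attribute__((minsize)) static unsigned div10(unsigned x) {
  return x / 10;
}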
62208
62209void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62210 if (!Subtarget.is64Bit())
62211 return;
62212
62213 // Update IsSplitCSR in X86MachineFunctionInfo.
62215 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62216 AFI->setIsSplitCSR(true);
62217}
62218
62219void X86TargetLowering::insertCopiesSplitCSR(
62220 MachineBasicBlock *Entry,
62221 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62222 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62223 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62224 if (!IStart)
62225 return;
62226
62227 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62228 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62229 MachineBasicBlock::iterator MBBI = Entry->begin();
62230 for (const MCPhysReg *I = IStart; *I; ++I) {
62231 const TargetRegisterClass *RC = nullptr;
62232 if (X86::GR64RegClass.contains(*I))
62233 RC = &X86::GR64RegClass;
62234 else
62235 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62236
62237 Register NewVR = MRI->createVirtualRegister(RC);
62238 // Create copy from CSR to a virtual register.
62239 // FIXME: this currently does not emit CFI pseudo-instructions; it works
62240 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62241 // nounwind. If we want to generalize this later, we may need to emit
62242 // CFI pseudo-instructions.
62243 assert(
62244 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62245 "Function should be nounwind in insertCopiesSplitCSR!");
62246 Entry->addLiveIn(*I);
62247 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62248 .addReg(*I);
62249
62250 // Insert the copy-back instructions right before the terminator.
62251 for (auto *Exit : Exits)
62252 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62253 TII->get(TargetOpcode::COPY), *I)
62254 .addReg(NewVR);
62255 }
62256}
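// Illustrative sketch (hypothetical example, behavior assumed for 64-bit
// Darwin): a dynamically initialized thread_local gets a TLS wrapper/init
// helper using the CXX_FAST_TLS calling convention, which is the case the
// split-CSR copies above are inserted for when the target opts in.
static int next_id() { static int n = 0; return ++n; }
thread_local int g_worker_id = next_id();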
62257
62259 return Subtarget.is64Bit();
62260}
62261
62265 const TargetInstrInfo *TII) const {
62266 assert(MBBI->isCall() && MBBI->getCFIType() &&
62267 "Invalid call instruction for a KCFI check");
62268
62269 MachineFunction &MF = *MBB.getParent();
62270 // If the call target is a memory operand, unfold it and use R11 for the
62271 // call, so KCFI_CHECK won't have to recompute the address.
62272 switch (MBBI->getOpcode()) {
62273 case X86::CALL64m:
62274 case X86::CALL64m_NT:
62275 case X86::TAILJMPm64:
62276 case X86::TAILJMPm64_REX: {
62279 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62280 /*UnfoldStore=*/false, NewMIs))
62281 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62282 for (auto *NewMI : NewMIs)
62283 MBBI = MBB.insert(OrigCall, NewMI);
62284 assert(MBBI->isCall() &&
62285 "Unexpected instruction after memory operand unfolding");
62286 if (OrigCall->shouldUpdateAdditionalCallInfo())
62287 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62288 MBBI->setCFIType(MF, OrigCall->getCFIType());
62289 OrigCall->eraseFromParent();
62290 break;
62291 }
62292 default:
62293 break;
62294 }
62295
62296 MachineOperand &Target = MBBI->getOperand(0);
62297 Register TargetReg;
62298 switch (MBBI->getOpcode()) {
62299 case X86::CALL64r:
62300 case X86::CALL64r_ImpCall:
62301 case X86::CALL64r_NT:
62302 case X86::TAILJMPr64:
62303 case X86::TAILJMPr64_REX:
62304 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62305 Target.setIsRenamable(false);
62306 TargetReg = Target.getReg();
62307 break;
62308 case X86::CALL64pcrel32:
62309 case X86::TAILJMPd64:
62310 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62311 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62312 // 64-bit indirect thunk calls.
62313 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62314 "Unexpected register for an indirect thunk call");
62315 TargetReg = X86::R11;
62316 break;
62317 default:
62318 llvm_unreachable("Unexpected CFI call opcode");
62319 break;
62320 }
62321
62322 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62323 .addReg(TargetReg)
62324 .addImm(MBBI->getCFIType())
62325 .getInstr();
62326}
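// Illustrative sketch (hypothetical helper, assuming -fsanitize=kcfi): the
// indirect call below carries a CFI type id, and EmitKCFICheck inserts a
// KCFI_CHECK of the call-target register before it (unfolding a
// memory-operand call into R11 first, as handled above).
using handler_fn = int (*)(int);
static int invoke(handler_fn h, int arg) {
  return h(arg); // indirect call -> preceded by a KCFI type check
}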
62327
62328/// Returns true if stack probing through a function call is requested.
62332
62333/// Returns true if stack probing through inline assembly is requested.
62335
62336 // No inline stack probes for Windows; it has its own mechanism.
62337 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62338 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62339 return false;
62340
62341 // If the function specifically requests inline stack probes, emit them.
62342 if (MF.getFunction().hasFnAttribute("probe-stack"))
62343 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62344 "inline-asm";
62345
62346 return false;
62347}
62348
62349/// Returns the name of the symbol used to emit stack probes or the empty
62350/// string if not applicable.
62353 // Inline stack probes disable the stack probe call.
62354 if (hasInlineStackProbe(MF))
62355 return "";
62356
62357 // If the function specifically requests stack probes, emit them.
62358 if (MF.getFunction().hasFnAttribute("probe-stack"))
62359 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62360
62361 // Generally, if we aren't on Windows, the platform ABI does not include
62362 // support for stack probes, so don't emit them.
62363 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62364 Subtarget.isTargetMachO() ||
62365 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62366 return "";
62367
62368 // We need a stack probe to conform to the Windows ABI. Choose the right
62369 // symbol.
62370 if (Subtarget.is64Bit())
62371 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62372 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62373}
62374
62375unsigned
62377 // The default stack probe size is 4096 if the function has no
62378 // "stack-probe-size" attribute.
62379 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62380 4096);
62381}
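// Illustrative sketch (hypothetical helper): a frame well past one page. On
// Windows-style targets this triggers a call to the symbol chosen by
// getStackProbeSymbolName (__chkstk, ___chkstk_ms, _chkstk or _alloca);
// with the IR attributes "probe-stack"="inline-asm" and, optionally,
// "stack-probe-size"="<bytes>", the probes are instead emitted inline at
// the requested granularity.
static void touch_big_frame() {
  volatile char buf[32768]; // well beyond the 4096-byte default probe size
  buf[0] = 1;
}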
62382
62384 if (ML && ML->isInnermost() &&
62385 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62388}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
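For reference, a minimal standalone sketch (plain C++, not the DAG matcher itself) of the scalarized, associative reduction shape this helper recognizes, using OR as the BINOP; the vector contents and the "any element non-zero" use case are illustrative assumptions.

#include <array>
#include <cstdint>
#include <cstdio>

// Nested BINOP(EXTRACTELT(X,0), BINOP(EXTRACTELT(X,1), ...)) chain, here with
// OR: the shape an "is any element non-zero" check takes after scalarization.
static uint32_t orReduce(const std::array<uint32_t, 4> &X) {
  return X[0] | (X[1] | (X[2] | X[3]));
}

int main() {
  std::printf("any bit set: %s\n", orReduce({0, 0, 8, 0}) ? "yes" : "no");
  return 0;
}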
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.

static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of an ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
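As a hedged illustration of that special case (scalar C++, not the actual lowering): for unsigned values, x <= y exactly when the saturating subtraction x -us y is zero, which is what lets an unsigned compare be expressed with PSUBUS[BW] followed by a compare against zero.

#include <cstdint>
#include <cstdio>

// Unsigned saturating subtract, i.e. one lane of PSUBUSB.
static uint8_t subus(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);
}

int main() {
  for (unsigned X : {3u, 10u, 200u})
    std::printf("x=%u: (x <= 10)=%d  (subus(x,10) == 0)=%d\n", X, X <= 10u,
                subus(uint8_t(X), 10) == 0);
  return 0;
}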
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
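One of the idioms such sign-flip detection revolves around, shown as a standalone C++20 sketch rather than the DAG matcher: negating an IEEE float is an XOR of its bit pattern with the sign-bit mask.

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  float X = 3.5f;
  // fneg as a bit operation: flip only the sign bit.
  uint32_t Bits = std::bit_cast<uint32_t>(X) ^ 0x80000000u;
  std::printf("%f\n", std::bit_cast<float>(Bits)); // prints -3.500000
  return 0;
}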
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
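A standalone check of the bitwise identity behind this fold (plain C++, not the DAG combine; the operand order of the resulting ANDN node may differ in the real code): BLSMSK(X) is X ^ (X - 1) and BMI ANDN(A, B) computes ~A & B, so Y & (X ^ -X) equals ANDN(BLSMSK(X), Y).

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Y = 0x12345678u;
  for (uint32_t X : {0u, 1u, 4u, 0x80000000u, 0xDEADBEEFu}) {
    uint32_t LHS = Y & (X ^ (0u - X));   // AND(Y, XOR(X, NEG(X)))
    uint32_t Blsmsk = X ^ (X - 1u);      // BMI BLSMSK: mask up to lowest set bit
    uint32_t RHS = ~Blsmsk & Y;          // BMI ANDN(BLSMSK(X), Y)
    std::printf("X=%08x: %08x %s %08x\n", X, LHS,
                LHS == RHS ? "==" : "!=", RHS);
  }
  return 0;
}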
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
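A scalar sketch of the horizontal byte sum (not the vector lowering itself, which on x86 typically uses PSADBW against a zero vector for wide elements): the bytes within each element are simply added together, as needed when per-byte popcounts are accumulated into wider-element popcounts.

#include <cstdint>
#include <cstdio>

// Sum the four bytes of a 32-bit element.
static uint32_t horizontalByteSum(uint32_t Elt) {
  uint32_t Sum = 0;
  for (int I = 0; I < 4; ++I)
    Sum += (Elt >> (8 * I)) & 0xFFu;
  return Sum;
}

int main() {
  std::printf("%u\n", horizontalByteSum(0x01020304u)); // 1 + 2 + 3 + 4 = 10
  return 0;
}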
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
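The arithmetic fact this kind of pre-truncation relies on, shown as a standalone C++ check rather than the combine itself (the real code additionally weighs profitability): truncation commutes with wrap-around add/sub/mul, so computing in the narrow type first yields the same low bits.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t A = 0x1234567890ABCDEFull, B = 0x0FEDCBA098765432ull;
  uint32_t NarrowFirst = uint32_t(A) * uint32_t(B); // truncate inputs, then multiply
  uint32_t WideThenTrunc = uint32_t(A * B);         // multiply wide, then truncate
  std::printf("%08x %08x\n", NarrowFirst, WideThenTrunc); // identical
  return 0;
}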
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
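A minimal sketch of the 4-lane immediate encoding this helper produces (the PSHUFD/SHUFPS scheme: destination lane i takes its 2-bit source index from imm8 bits 2*i+1..2*i); the real helper also canonicalizes undef mask entries, which this sketch assumes has already happened.

#include <array>
#include <cassert>
#include <cstdio>

static unsigned getV4ShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int I = 0; I < 4; ++I) {
    assert(Mask[I] >= 0 && Mask[I] < 4 && "expected a canonicalized 2-bit index");
    Imm |= unsigned(Mask[I]) << (2 * I);
  }
  return Imm;
}

int main() {
  // <2,3,0,1> (swap the two 64-bit halves of a v4i32) encodes as 0x4E.
  std::printf("0x%02X\n", getV4ShuffleImm({2, 3, 0, 1}));
  return 0;
}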
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
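A scalar illustration of the signed-saturation pattern described here (plain C++, assuming an i32 -> i16 truncation; the real detection works on DAG nodes and also has a PACKUS variant): clamp to the destination type's signed range, then truncate losslessly.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// truncate(smin(smax(x, INT16_MIN), INT16_MAX)) - one lane of PACKSSDW.
static int16_t truncSSat(int32_t X) {
  int32_t Clamped =
      std::min<int32_t>(std::max<int32_t>(X, INT16_MIN), INT16_MAX);
  return static_cast<int16_t>(Clamped);
}

int main() {
  std::printf("%d %d %d\n", truncSSat(70000), truncSSat(-70000), truncSSat(1234));
  // prints: 32767 -32768 1234
  return 0;
}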
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
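For illustration, a minimal standalone sketch (not LLVM code) of the arithmetic identity this promotion relies on: when the narrow add cannot overflow (the nsw/nuw flags), extending the sum equals adding the extended operands.

#include <cassert>
#include <cstdint>

int main() {
  // Signed case (nsw): 100 + 20 fits in int8_t, so sext commutes with the add.
  int8_t X = 100, C = 20;
  int8_t Narrow = static_cast<int8_t>(X + C);
  assert(static_cast<int32_t>(Narrow) ==
         static_cast<int32_t>(X) + static_cast<int32_t>(C));

  // Unsigned case (nuw): 200 + 20 fits in uint8_t, so zext commutes as well.
  uint8_t UX = 200, UC = 20;
  uint8_t UNarrow = static_cast<uint8_t>(UX + UC);
  assert(static_cast<uint32_t>(UNarrow) ==
         static_cast<uint32_t>(UX) + static_cast<uint32_t>(UC));
  return 0;
}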
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
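For illustration, the rounding behavior described here (round to nearest, ties away from zero) matches C's round()/std::round, as a small standalone check shows:

#include <cassert>
#include <cmath>

int main() {
  assert(std::round(0.5) == 1.0);    // tie rounds away from zero
  assert(std::round(-0.5) == -1.0);  // tie rounds away from zero
  assert(std::round(2.3) == 2.0);    // otherwise round to nearest
  assert(std::round(-2.7) == -3.0);
  return 0;
}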
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
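For illustration, a minimal standalone sketch (not LLVM code) of the underlying bit trick: on IEEE-754 floats, FABS is an AND that clears the sign bit and FNEG is an XOR that flips it, so only the constant mask and the logic opcode differ. The helpers floatToBits/bitsToFloat are hypothetical.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint32_t floatToBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}
static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  const uint32_t SignMask = 0x80000000u;
  for (float X : {-3.5f, -0.0f, 1.25f}) {
    assert(bitsToFloat(floatToBits(X) & ~SignMask) == std::fabs(X)); // FABS: clear sign
    assert(bitsToFloat(floatToBits(X) ^ SignMask) == -X);            // FNEG: flip sign
  }
  return 0;
}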
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
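For illustration, a standalone check (not LLVM code) of the boolean identity behind this fold: XOR-ing a comparison result with 1 is the same as evaluating the inverted condition.

#include <cassert>

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B)
      assert(((A < B) ^ 1) == (A >= B));  // xor(setcc(lt), 1) == setcc(ge)
  return 0;
}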
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
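For illustration, a minimal standalone sketch (not LLVM code, ignoring undef lanes) of the property this predicate checks: lane i must read element i of one of the two inputs, and the chosen input must alternate between even and odd lanes. The helper isAlternatingMask is hypothetical and simplified.

#include <cassert>
#include <vector>

// Indices >= NumElts refer to the second input, as in a two-input shuffle mask.
static bool isAlternatingMask(const std::vector<int> &Mask, bool &Op0Even) {
  int NumElts = static_cast<int>(Mask.size());
  for (int I = 0; I < NumElts; ++I)
    if (Mask[I] != I && Mask[I] != I + NumElts)  // must stay in its own lane
      return false;
  bool Op0OnEven = Mask[0] < NumElts;
  for (int I = 0; I < NumElts; ++I)
    if ((Mask[I] < NumElts) != ((I % 2 == 0) == Op0OnEven))
      return false;
  Op0Even = Op0OnEven;
  return true;
}

int main() {
  bool Op0Even = false;
  assert(isAlternatingMask({0, 5, 2, 7}, Op0Even) && Op0Even);   // V1,V2,V1,V2
  assert(isAlternatingMask({4, 1, 6, 3}, Op0Even) && !Op0Even);  // V2,V1,V2,V1
  assert(!isAlternatingMask({0, 1, 6, 7}, Op0Even));             // not alternating
  return 0;
}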
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
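For illustration, the kind of strength reduction this flag controls: a multiply by a suitable constant can be rebuilt from shifts and adds (the address forms LEA encodes). A small standalone check of a few such rewrites:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 7u, 123456u}) {
    assert(X * 5u == (X << 2) + X);          // lea-style: base + index*4
    assert(X * 9u == (X << 3) + X);          // lea-style: base + index*8
    assert(X * 40u == ((X << 2) + X) << 3);  // lea followed by a shift
  }
  return 0;
}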
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) This undoes the inverse fold performed in InstCom...
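For illustration, a standalone exhaustive check (not LLVM code) of the bitwise identity behind this fold, over every 3-bit combination of the operands:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 8; ++X)
    for (unsigned Y = 0; Y < 8; ++Y)
      for (unsigned Z = 0; Z < 8; ++Z) {
        unsigned NotZ = ~Z & 0x7u;               // ~Z restricted to 3 bits
        unsigned NotY = ~Y & 0x7u;               // ~Y restricted to 3 bits
        unsigned Lhs = X & (Y | NotZ);           // (and X, (or Y, ~Z))
        unsigned Rhs = X & (~(NotY & Z) & 0x7u); // (and X, ~(and ~Y, Z))
        assert(Lhs == Rhs);
      }
  return 0;
}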
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from a mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
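For illustration, a minimal standalone sketch (not LLVM code), assuming the pattern in question is a sign-bit test: shifting the sign bit of a 32-bit value down and XOR-ing it with 1 is equivalent to testing X >= 0, which a plain compare against zero expresses directly.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {-7, -1, 0, 1, 42}) {
    uint32_t SignBit = static_cast<uint32_t>(X) >> 31;  // srl(X, size(X)-1)
    assert((SignBit ^ 1u) == static_cast<uint32_t>(X >= 0));
  }
  return 0;
}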
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit whose position is given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt value is non-negative (>= 0).
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
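For illustration, a minimal standalone sketch exercising a few of the APInt helpers listed above; it assumes an LLVM development tree (compile against the LLVM headers and link LLVMSupport).

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  using llvm::APInt;
  APInt X = APInt::getOneBitSet(/*numBits=*/32, /*BitNo=*/5);  // 1 << 5
  assert(X.isPowerOf2());
  assert(X.countr_zero() == 5);
  assert(X.logBase2() == 5);

  APInt Mask = APInt::getLowBitsSet(32, 8);  // 0x000000FF
  assert(Mask.isMask(8));
  assert(Mask.popcount() == 8);

  APInt Wide = Mask.zext(64);  // zero-extend to 64 bits
  assert(Wide.getBitWidth() == 64 && Wide.getZExtValue() == 0xFF);
  return 0;
}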
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
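For illustration, a minimal standalone sketch of the ArrayRef operations listed above over a shuffle-mask-style array of ints; it assumes an LLVM development tree for the header.

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

int main() {
  int Mask[] = {0, 1, 2, 3, 4, 5, 6, 7};
  llvm::ArrayRef<int> Ref(Mask);
  assert(Ref.size() == 8 && !Ref.empty());
  assert(Ref.slice(2, 3).equals({2, 3, 4}));     // keep 3 elements from index 2
  assert(Ref.drop_back(4).equals({0, 1, 2, 3})); // drop the last 4 elements
  return 0;
}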
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:437
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
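The MachineFrameInfo entries above are how lowering reserves stack memory. A small sketch assuming, purely for illustration, an 8-byte spill slot and an incoming argument fixed at SP+16:
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Reserve a spill slot and describe an incoming stack argument.
static void frameSketch(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int SpillFI = MFI.CreateStackObject(/*Size=*/8, Align(8), /*isSpillSlot=*/true);
  int ArgFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/16, /*IsImmutable=*/true);
  MFI.setFrameAddressIsTaken(true); // e.g. when lowering llvm.frameaddress
  (void)SpillFI; (void)ArgFI;
}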
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
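MachineInstrBuilder values are usually obtained through BuildMI and then chained with the add* methods above. A sketch of that pattern; the opcode and registers are caller-supplied placeholders rather than anything from this file:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
using namespace llvm;

// Emit "<Opcode> Dst, Src, 42" with the chained builder interface.
static void buildSketch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const TargetInstrInfo *TII,
                        unsigned Opcode, Register Dst, Register Src) {
  BuildMI(MBB, I, DL, TII->get(Opcode), Dst)
      .addReg(Src) // source register operand
      .addImm(42); // immediate operand
}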
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
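Memory operands carrying these flags are what later passes use for alias and ordering queries. A sketch of allocating one with the getMachineMemOperand overload listed earlier; FI is assumed to be a valid frame index:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Describe an 8-byte, naturally aligned load from a fixed stack slot.
static MachineMemOperand *memOpSketch(MachineFunction &MF, int FI) {
  return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
                                 MachineMemOperand::MOLoad, LLT::scalar(64),
                                 Align(8));
}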
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
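DAG combines over the masked load/store nodes above usually begin with a guard built from these accessors. A hedged sketch of such a check; the predicate name is illustrative:
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// True for a plain (non-volatile, non-truncating, unindexed) masked store.
static bool isPlainMaskedStore(const MaskedStoreSDNode *MST) {
  return MST->isSimple() && !MST->isTruncatingStore() &&
         MST->getAddressingMode() == ISD::UNINDEXED &&
         MST->getMemoryVT() == MST->getValue().getValueType();
}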
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
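The SDNode/SDValue accessors above are how combines inspect the DAG before rewriting it. A small sketch of a typical guard; the helper name is illustrative:
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match a single-use (add X, C) where C is a constant node.
static bool isSingleUseAddOfConstant(SDValue V) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  return isa<ConstantSDNode>(V.getOperand(1).getNode());
}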
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
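The SelectionDAG factory methods and shuffle helpers above compose as in this sketch, which builds a VECTOR_SHUFFLE and rebitcasts it; the v4i32/v2i64 types and the mask are illustrative, chosen so the bitcast is size-preserving:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Unpack-low style shuffle of two v4i32 inputs, then reinterpret as v2i64.
static SDValue shuffleSketch(SelectionDAG &DAG, const SDLoc &DL,
                             SDValue X, SDValue Y) {
  EVT VT = X.getValueType(); // assumed v4i32
  int Mask[] = {0, 4, 1, 5}; // X[0], Y[0], X[1], Y[1]
  SDValue Shuf = DAG.getVectorShuffle(VT, DL, X, Y, Mask);
  return DAG.getBitcast(MVT::v2i64, Shuf);
}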
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B. This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
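The TargetLoweringBase setters above are called from a target's TargetLowering constructor to describe its legalization rules. A hedged sketch using a hypothetical MyTargetLowering; the specific actions are illustrative, not X86's real configuration:
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

namespace {
// Hypothetical target showing how the protected configuration hooks are used.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    setBooleanContents(ZeroOrOneBooleanContent);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);  // expand to shifts/adds
    setOperationAction(ISD::SELECT, MVT::f64, Custom); // route through LowerOperation
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);   // no truncating i64->i32 store
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Legal);
    setTargetDAGCombine(ISD::AND);                     // request AND combine callbacks
    setSchedulingPreference(Sched::RegPressure);
  }
};
} // namespace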
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
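The demanded-bits entry points above are normally driven from a target combine. A hedged sketch of that flow; the helper name and the 8-bit demand are illustrative assumptions:
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Ask for only the low 8 bits of Op; if the helper finds a simpler value,
// splice it into the DAG in place of Op.
static bool demandLowByte(SDValue Op, TargetLowering::DAGCombinerInfo &DCI,
                          const TargetLowering &TLI) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  APInt DemandedBits = APInt::getLowBitsSet(BitWidth, 8);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO))
    return false;
  DCI.CommitTargetLoweringOpt(TLO); // commit the simplified value
  return true;
}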
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
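Feature predicates like those above gate most lowering decisions in this file. A hedged sketch of that shape; the helper and the width thresholds are illustrative, not X86's actual policy:
#include "X86Subtarget.h"
using namespace llvm;

// Decide which vector register file a hypothetical lowering path would use.
static bool canUseWideVectors(const X86Subtarget &Subtarget, unsigned Bits) {
  if (Bits >= 512)
    return Subtarget.useAVX512Regs(); // 512-bit ops want AVX-512 and prefer-512
  if (Bits >= 256)
    return Subtarget.hasAVX2() || Subtarget.hasAVX();
  return Subtarget.hasSSE2();
}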
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
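As an illustrative sketch (an editor's example, not code from this file), ScaleBitMask widens a per-element mask by splatting each bit; this assumes the helper lives in llvm::APIntOps, as its declaration suggests:
  #include "llvm/ADT/APInt.h"
  #include <cassert>

  void scaleBitMaskDemo() {
    // Widening a 4-bit mask to 8 bits splats each source bit into two
    // neighboring result bits, so 0b1010 becomes 0b11001100.
    llvm::APInt Narrow(4, 0b1010);
    llvm::APInt Wide = llvm::APIntOps::ScaleBitMask(Narrow, /*NewBitWidth=*/8);
    assert(Wide == llvm::APInt(8, 0b11001100));
  }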
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
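A short sketch (editor's example, assuming the two ISD::CondCode helpers above behave as documented) of how they compose:
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/ValueTypes.h"

  // For integer operands: !(X < Y) is (X >= Y), and swapping the operands of
  // that gives (Y <= X), i.e. SETLT -> SETGE -> SETLE.
  llvm::ISD::CondCode invertThenSwap(llvm::ISD::CondCode CC, llvm::EVT OpVT) {
    llvm::ISD::CondCode Inv = llvm::ISD::getSetCCInverse(CC, OpVT);
    return llvm::ISD::getSetCCSwappedOperands(Inv);
  }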
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
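For example (an editor's sketch, not code from this file), the hook can check that every constant lane of a shift amount is in range:
  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // True if Amt is a constant, or a build vector of constants, whose
  // (non-undef) elements are all < 16.
  bool allShiftAmountsBelow16(llvm::SDValue Amt) {
    return llvm::ISD::matchUnaryPredicate(
        Amt,
        [](llvm::ConstantSDNode *C) { return C->getAPIntValue().ult(16); },
        /*AllowUndefs=*/true);
  }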
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
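A brief sketch (editor's example, under the usual llvm::PatternMatch conventions) of how a few of the IR-level matchers above compose; it recognizes the power-of-two test (x & (x - 1)) == 0 in its canonical add-of-minus-one form:
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/PatternMatch.h"

  bool looksLikePowerOfTwoTest(llvm::Value *V) {
    using namespace llvm::PatternMatch;
    llvm::Value *X;
    // icmp eq (and X, (add X, -1)), 0 -- m_Deferred reuses the X captured by
    // the earlier m_Value within the same match() call.
    return match(V, m_SpecificICmp(
                        llvm::ICmpInst::ICMP_EQ,
                        m_c_And(m_Value(X),
                                m_c_Add(m_Deferred(X), m_AllOnes())),
                        m_Zero()));
  }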
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
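The SDPatternMatch helpers above are the SelectionDAG counterpart of the IR matchers; a minimal sketch (editor's example, assuming the usual SDPatternMatch.h header):
  #include "llvm/CodeGen/SDPatternMatch.h"
  #include "llvm/CodeGen/SelectionDAG.h"

  // True if N is setcc(LHS, RHS, seteq); LHS and RHS are captured on success.
  bool isEqualitySetCC(llvm::SDNode *N, const llvm::SelectionDAG &DAG,
                       llvm::SDValue &LHS, llvm::SDValue &RHS) {
    using namespace llvm::SDPatternMatch;
    return sd_match(N, &DAG,
                    m_SetCC(m_Value(LHS), m_Value(RHS),
                            m_SpecificCondCode(llvm::ISD::SETEQ)));
  }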
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
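For instance (editor's sketch; X86::CondCode and this helper are declared in the backend's own headers, so this only builds inside the X86 target):
  #include <cassert>
  // Requires the backend-internal X86InstrInfo.h / X86BaseInfo.h headers.

  void oppositeCondDemo() {
    using namespace llvm;
    // COND_E <-> COND_NE; applying the helper twice is the identity.
    assert(X86::GetOppositeBranchCondition(X86::COND_E) == X86::COND_NE);
    assert(X86::GetOppositeBranchCondition(X86::COND_NE) == X86::COND_E);
  }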
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
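A tiny usage sketch (editor's example) of the range helper above, pairing shuffle-mask entries with their indices:
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"

  // True if Mask is an identity shuffle, ignoring undef (negative) entries.
  bool isIdentityMask(llvm::ArrayRef<int> Mask) {
    for (auto [Idx, M] : llvm::enumerate(Mask))
      if (M >= 0 && static_cast<size_t>(M) != Idx)
        return false;
    return true;
  }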
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or an FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
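As an illustrative sketch (editor's example; the decoder is declared in the backend's X86ShuffleDecode.h), each set immediate bit selects the corresponding lane from the second source:
  #include "llvm/ADT/SmallVector.h"

  void decodeBlendDemo() {
    llvm::SmallVector<int, 4> Mask;
    // Bit i of Imm set means lane i comes from operand 1 (index i + NumElts),
    // so Imm = 0b0101 should decode to {4, 1, 6, 3}.
    llvm::DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  }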
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2078
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:331
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1779
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
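A few concrete values for the bit-math helpers listed nearby (editor's sanity-check sketch):
  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  void bitMathDemo() {
    assert(llvm::isPowerOf2_64(64) && llvm::Log2_64(64) == 6);
    assert(llvm::Log2_64_Ceil(65) == 7);  // ceil(log2(65))
    assert(llvm::bit_width(255u) == 8);   // bits needed to represent 255
    assert(llvm::countr_zero(40u) == 3);  // 40 == 0b101000
  }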
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
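A sketch (editor's example; the helper is declared in the X86 backend's X86ISelLowering.h) of the mask produced for a binary unpack-low of v4i32:
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/ValueTypes.h"

  void unpackMaskDemo() {
    llvm::SmallVector<int, 8> Mask;
    // Interleaves the low halves of both sources, which should give the
    // unpcklps-style mask {0, 4, 1, 5}.
    llvm::createUnpackShuffleMask(llvm::MVT::v4i32, Mask,
                                  /*Lo=*/true, /*Unary=*/false);
  }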
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1996
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1862
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1956
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
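A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the EH-personality queries referenced above, assuming F is a Function supplied by the surrounding lowering code:
#include "llvm/IR/EHPersonalities.h"
#include "llvm/IR/Function.h"

static bool usesFuncletEHSketch(const llvm::Function &F) {
  // Classify the personality routine attached to F, if any.
  llvm::EHPersonality Pers = llvm::classifyEHPersonality(
      F.hasPersonalityFn() ? F.getPersonalityFn() : nullptr);
  // Funclet-based personalities (e.g. MSVC C++/SEH) require funclet lowering.
  return llvm::isFuncletEHPersonality(Pers);
}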
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1837
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
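A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the SDValue constant predicates and peekThroughOneUseBitcasts listed above; the pattern being matched here is an arbitrary example:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isSelectOfOneAndZeroSketch(llvm::SDValue Sel) {
  // Look through a single-use bitcast before matching the select.
  llvm::SDValue V = llvm::peekThroughOneUseBitcasts(Sel);
  if (V.getOpcode() != llvm::ISD::SELECT)
    return false;
  // select(cond, 1, 0): true operand is constant one, false operand zero/undef.
  return llvm::isOneConstant(V.getOperand(1)) &&
         llvm::isNullConstantOrUndef(V.getOperand(2));
}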
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty (0 or 1 values).
Definition STLExtras.h:2110
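A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the range-based STLExtras helpers referenced above; the mask contents are arbitrary example values:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

static void rangeHelpersSketch() {
  llvm::SmallVector<int, 8> Mask = {0, 1, 1, 3};
  bool HasOne = llvm::is_contained(Mask, 1);                   // true
  auto Ones = llvm::count(Mask, 1);                            // 2
  auto It = llvm::find_if(Mask, [](int M) { return M >= 2; }); // points at 3
  bool Same = llvm::all_equal({Mask[1], Mask[2]});             // true
  (void)HasOne; (void)Ones; (void)It; (void)Same;
}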
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:316
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
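A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the integer helpers referenced above (isUInt, Lo_32, popcount, bit_floor); the constant is an arbitrary example value:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void bitHelpersSketch() {
  uint64_t V = 0x1000000F0ULL;           // a 33-bit value
  bool Fits = llvm::isUInt<32>(V);       // false: does not fit in 32 bits
  uint32_t Lo = llvm::Lo_32(V);          // 0x000000F0
  int Ones = llvm::popcount(Lo);         // 4 set bits
  uint32_t Floor = llvm::bit_floor(Lo);  // 0x80: largest power of two <= 0xF0
  (void)Fits; (void)Lo; (void)Ones; (void)Floor;
}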
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
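A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of driving one of the X86 shuffle-mask decoders listed above; the include path follows the target-local convention and the immediate 0x1B is an arbitrary example:
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "llvm/ADT/SmallVector.h"

static void decodePshufdSketch() {
  llvm::SmallVector<int, 8> Mask;
  // pshufd $0x1B on a v4i32 value reverses the four 32-bit elements.
  llvm::DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, /*Imm=*/0x1B, Mask);
  // Mask is now {3, 2, 1, 0}; decoders that can produce undefined or zeroed
  // lanes report them with SM_SentinelUndef / SM_SentinelZero.
}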
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
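A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of converting between the float semantics listed above with an explicit rounding mode; the value 1.5 is an arbitrary example that happens to be exactly representable in half precision:
#include "llvm/ADT/APFloat.h"

static void apFloatSketch() {
  llvm::APFloat F(llvm::APFloat::IEEEdouble(), "1.5");
  bool LosesInfo = false;
  llvm::APFloat::opStatus S =
      F.convert(llvm::APFloat::IEEEhalf(), llvm::APFloat::rmNearestTiesToEven,
                &LosesInfo);
  (void)S; // opOK here, and LosesInfo stays false.
}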
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
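A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the alignment helpers above; the base alignment and offset are arbitrary example values:
#include "llvm/Support/Alignment.h"

static void alignmentSketch() {
  llvm::Align Base(16);
  // Alignment still guaranteed when addressing 8 bytes past a 16-byte base.
  llvm::Align AtOffset = llvm::commonAlignment(Base, /*Offset=*/8);
  (void)AtOffset.value(); // 8
}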
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
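A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the EVT queries listed above, using v4i32 as an assumed example type:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

static void evtSketch(llvm::LLVMContext &Ctx) {
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4); // v4i32
  bool Is128 = VT.isVector() && VT.is128BitVector();             // true
  llvm::EVT EltVT = VT.getVectorElementType();                   // i32
  llvm::EVT Half = VT.getHalfNumVectorElementsVT(Ctx);           // v2i32
  (void)Is128; (void)EltVT; (void)Half;
}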
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
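A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the KnownBits API listed above; the 8-bit constant 0x0C is an arbitrary example value:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

static void knownBitsSketch() {
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 0x0C));
  bool Const = K.isConstant();                 // true: every bit is known
  unsigned TZ = K.countMinTrailingZeros();     // 2
  llvm::KnownBits Wide = K.zext(32);           // zero extension keeps known bits
  llvm::KnownBits Sum = llvm::KnownBits::add(Wide, Wide); // known bits of 0x18
  llvm::APInt Max = Sum.getMaxValue();         // 0x18
  (void)Const; (void)TZ; (void)Max;
}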
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
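A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of building the MachinePointerInfo records described above for a fixed stack slot; MF and FrameIdx are assumed to come from the surrounding lowering code:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachinePointerInfo
stackSlotInfoSketch(llvm::MachineFunction &MF, int FrameIdx) {
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FrameIdx);
  return PtrInfo.getWithOffset(8); // second 8-byte half of the slot
}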
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
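A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the chained CallLoweringInfo setters listed above, as used when emitting a library call; every parameter here is assumed to be supplied by the surrounding lowering code:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include <utility>

static llvm::SDValue
emitLibCallSketch(const llvm::TargetLowering &TLI, llvm::SelectionDAG &DAG,
                  const llvm::SDLoc &dl, llvm::SDValue Chain,
                  llvm::SDValue Callee, llvm::Type *RetTy,
                  llvm::TargetLowering::ArgListTy &&Args) {
  llvm::TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(llvm::CallingConv::C, RetTy,
                                                   Callee, std::move(Args));
  return TLI.LowerCallTo(CLI).first; // {result value, output chain}
}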
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
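A minimal illustrative sketch (not taken from X86ISelLowering.cpp): a target combine typically reports a fold through DAGCombinerInfo::CombineTo, listed above, so that users of N are rewired and the worklist is updated; N and Folded are assumed inputs:
#include "llvm/CodeGen/TargetLowering.h"

static llvm::SDValue
reportFoldSketch(llvm::SDNode *N, llvm::SDValue Folded,
                 llvm::TargetLowering::DAGCombinerInfo &DCI) {
  // Replace all results of N with Folded and queue the new value for revisiting.
  return DCI.CombineTo(N, Folded, /*AddTo=*/true);
}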
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
X86AddressMode - This struct holds a generalized full x86 address mode.
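A minimal illustrative sketch (not taken from X86ISelLowering.cpp) of the X86InstrBuilder helpers listed above: emit a MOV32rm load of the form [Base + 16]; the opcode, registers, offset, and target-local include paths are arbitrary or assumed:
#include "X86InstrBuilder.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

static void loadRegOffsetSketch(llvm::MachineBasicBlock &MBB,
                                llvm::MachineBasicBlock::iterator I,
                                const llvm::DebugLoc &DL,
                                const llvm::TargetInstrInfo &TII,
                                llvm::Register Dst, llvm::Register Base) {
  // Build "Dst = mov32rm [Base + 16]" via the addressing-mode helper.
  llvm::addRegOffset(
      llvm::BuildMI(MBB, I, DL, TII.get(llvm::X86::MOV32rm), Dst),
      Base, /*isKill=*/false, /*Offset=*/16);
}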